simple_doc_parser.py

import json
import os
import re
import time
from collections import Counter
from typing import Dict, List, Optional, Union

import json5

from qwen_agent.log import logger
from qwen_agent.settings import DEFAULT_WORKSPACE
from qwen_agent.tools.base import BaseTool, register_tool
from qwen_agent.tools.storage import KeyNotExistsError, Storage
from qwen_agent.utils.str_processing import rm_cid, rm_continuous_placeholders, rm_hexadecimal
from qwen_agent.utils.tokenization_qwen import count_tokens
from qwen_agent.utils.utils import (get_file_type, hash_sha256, is_http_url, read_text_from_file,
                                    sanitize_chrome_file_path, save_url_to_local_work_dir)


def clean_paragraph(text):
    text = rm_cid(text)
    text = rm_hexadecimal(text)
    text = rm_continuous_placeholders(text)
    return text


PARAGRAPH_SPLIT_SYMBOL = '\n'


def parse_word(docx_path: str, extract_image: bool = False):
    if extract_image:
        raise ValueError('Currently, extracting images is not supported!')

    from docx import Document
    doc = Document(docx_path)
    content = []
    for para in doc.paragraphs:
        content.append({'text': para.text})
    for table in doc.tables:
        tbl = []
        for row in table.rows:
            tbl.append('|' + '|'.join([cell.text for cell in row.cells]) + '|')
        tbl = '\n'.join(tbl)
        content.append({'table': tbl})

    # Since Word documents do not have fixed pages, the entire document is returned as one page
    return [{'page_num': 1, 'content': content}]


def parse_ppt(path: str, extract_image: bool = False):
    if extract_image:
        raise ValueError('Currently, extracting images is not supported!')

    from pptx import Presentation
    ppt = Presentation(path)
    doc = []
    for slide_number, slide in enumerate(ppt.slides):
        page = {'page_num': slide_number + 1, 'content': []}

        for shape in slide.shapes:
            if not shape.has_text_frame and not shape.has_table:
                # Shapes without text or tables (e.g. pictures) are skipped
                continue

            if shape.has_text_frame:
                for paragraph in shape.text_frame.paragraphs:
                    paragraph_text = ''.join(run.text for run in paragraph.runs)
                    paragraph_text = clean_paragraph(paragraph_text)
                    if paragraph_text.strip():
                        page['content'].append({'text': paragraph_text})

            if shape.has_table:
                tbl = []
                for row in shape.table.rows:
                    tbl.append('|' + '|'.join([cell.text for cell in row.cells]) + '|')
                tbl = '\n'.join(tbl)
                page['content'].append({'table': tbl})

        doc.append(page)
    return doc


def parse_txt(path: str):
    text = read_text_from_file(path)
    paras = text.split(PARAGRAPH_SPLIT_SYMBOL)
    content = []
    for p in paras:
        content.append({'text': p})

    # Since txt files do not have fixed pages, the entire document is returned as one page
    return [{'page_num': 1, 'content': content}]


def df_to_md(df) -> str:

    def replace_long_dashes(text):
        if text.replace('-', '').replace(':', '').strip():
            return text
        pattern = r'-{6,}'
        replaced_text = re.sub(pattern, '-----', text)
        return replaced_text

    from tabulate import tabulate
    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
    md_table = '\n'.join([
        '|'.join(
            replace_long_dashes(' ' + cell.strip() + ' ' if cell else '') for cell in row.split('|'))
        for row in md_table.split('\n')
    ])
    return md_table
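

# Illustrative usage sketch (not part of the original module): df_to_md renders a
# DataFrame as a pipe-style Markdown table via `tabulate`, then shortens overly long
# dash runs in the separator row. The helper below is hypothetical and only shows
# roughly what the output looks like.
def _demo_df_to_md():
    import pandas as pd
    df = pd.DataFrame({'name': ['a', 'b'], 'score': [1, 2]})
    # Expected to resemble:
    # | name | score |
    # | :--- | ----: |
    # | a    | 1     |
    # | b    | 2     |
    return df_to_md(df)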


def parse_excel(file_path: str, extract_image: bool = False) -> List[dict]:
    if extract_image:
        raise ValueError('Currently, extracting images is not supported!')

    import pandas as pd
    excel_file = pd.ExcelFile(file_path)
    md_tables = []
    for sheet_name in excel_file.sheet_names:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        md_table = df_to_md(df)
        md_tables.append(f'### Sheet: {sheet_name}\n{md_table}')

    return [{'page_num': i + 1, 'content': [{'table': md_tables[i]}]} for i in range(len(md_tables))]


def parse_csv(file_path: str, extract_image: bool = False) -> List[dict]:
    if extract_image:
        raise ValueError('Currently, extracting images is not supported!')

    import pandas as pd
    md_tables = []
    df = pd.read_csv(file_path, encoding_errors='replace', on_bad_lines='skip')
    md_table = df_to_md(df)
    md_tables.append(md_table)  # There is only one table available

    return [{'page_num': i + 1, 'content': [{'table': md_tables[i]}]} for i in range(len(md_tables))]


def parse_tsv(file_path: str, extract_image: bool = False) -> List[dict]:
    if extract_image:
        raise ValueError('Currently, extracting images is not supported!')

    import pandas as pd
    md_tables = []
    df = pd.read_csv(file_path, sep='\t', encoding_errors='replace', on_bad_lines='skip')
    md_table = df_to_md(df)
    md_tables.append(md_table)  # There is only one table available

    return [{'page_num': i + 1, 'content': [{'table': md_tables[i]}]} for i in range(len(md_tables))]


def parse_html_bs(path: str, extract_image: bool = False):
    if extract_image:
        raise ValueError('Currently, extracting images is not supported!')

    def pre_process_html(s):
        # replace multiple newlines
        s = re.sub('\n+', '\n', s)
        # replace special string
        s = s.replace("Add to Qwen's Reading List", '')
        return s

    try:
        from bs4 import BeautifulSoup
    except Exception:
        raise ValueError('Please install bs4 by `pip install beautifulsoup4`')

    bs_kwargs = {'features': 'lxml'}
    with open(path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, **bs_kwargs)

    text = soup.get_text()
    if soup.title:
        title = str(soup.title.string)
    else:
        title = ''

    text = pre_process_html(text)
    paras = text.split(PARAGRAPH_SPLIT_SYMBOL)
    content = []
    for p in paras:
        p = clean_paragraph(p)
        if p.strip():
            content.append({'text': p})

    # The entire document is returned as one page
    return [{'page_num': 1, 'content': content, 'title': title}]


def parse_pdf(pdf_path: str, extract_image: bool = False) -> List[dict]:
    # Todo: header and footer
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTImage, LTRect, LTTextContainer

    doc = []
    for i, page_layout in enumerate(extract_pages(pdf_path)):
        page = {'page_num': page_layout.pageid, 'content': []}

        elements = []
        for element in page_layout:
            elements.append(element)

        # Init params for table
        table_num = 0
        tables = []

        for element in elements:
            if isinstance(element, LTRect):
                if not tables:
                    tables = extract_tables(pdf_path, i)
                if table_num < len(tables):
                    table_string = table_converter(tables[table_num])
                    table_num += 1
                    if table_string:
                        page['content'].append({'table': table_string, 'obj': element})
            elif isinstance(element, LTTextContainer):
                # Delete line breaks in the same paragraph
                text = element.get_text()
                # Todo: Further analysis using font
                font = get_font(element)
                if text.strip():
                    new_content_item = {'text': text, 'obj': element}
                    if font:
                        new_content_item['font-size'] = round(font[1])
                        # new_content_item['font-name'] = font[0]
                    page['content'].append(new_content_item)
            elif extract_image and isinstance(element, LTImage):
                # Todo: ocr
                raise ValueError('Currently, extracting images is not supported!')
            else:
                pass

        # merge elements
        page['content'] = postprocess_page_content(page['content'])
        doc.append(page)

    return doc


def postprocess_page_content(page_content: list) -> list:
    # rm repetitive identification for table and text
    # Some documents may repeatedly recognize LTRect and LTTextContainer
    table_obj = [p['obj'] for p in page_content if 'table' in p]
    tmp = []
    for p in page_content:
        repetitive = False
        if 'text' in p:
            for t in table_obj:
                if t.bbox[0] <= p['obj'].bbox[0] and p['obj'].bbox[1] <= t.bbox[1] and t.bbox[2] <= p['obj'].bbox[
                        2] and p['obj'].bbox[3] <= t.bbox[3]:
                    repetitive = True
                    break
        if not repetitive:
            tmp.append(p)
    page_content = tmp

    # merge paragraphs that have been separated by mistake
    new_page_content = []
    for p in page_content:
        if new_page_content and 'text' in new_page_content[-1] and 'text' in p and abs(
                p.get('font-size', 12) -
                new_page_content[-1].get('font-size', 12)) < 2 and p['obj'].height < p.get('font-size', 12) + 1:
            # Merge those lines belonging to a paragraph
            new_page_content[-1]['text'] += f' {p["text"]}'
            # new_page_content[-1]['font-name'] = p.get('font-name', '')
            new_page_content[-1]['font-size'] = p.get('font-size', 12)
        else:
            p.pop('obj')
            new_page_content.append(p)

    for i in range(len(new_page_content)):
        if 'text' in new_page_content[i]:
            new_page_content[i]['text'] = clean_paragraph(new_page_content[i]['text'])
    return new_page_content


def get_font(element):
    from pdfminer.layout import LTChar, LTTextContainer

    fonts_list = []
    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            for character in text_line:
                if isinstance(character, LTChar):
                    fonts_list.append((character.fontname, character.size))

    fonts_list = list(set(fonts_list))
    if fonts_list:
        counter = Counter(fonts_list)
        most_common_fonts = counter.most_common(1)[0][0]
        return most_common_fonts
    else:
        return []


def extract_tables(pdf_path, page_num):
    import pdfplumber

    # Use a context manager so the PDF file handle is closed after extraction
    with pdfplumber.open(pdf_path) as pdf:
        table_page = pdf.pages[page_num]
        tables = table_page.extract_tables()
    return tables


def table_converter(table):
    table_string = ''
    for row_num in range(len(table)):
        row = table[row_num]
        cleaned_row = [
            item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item
            for item in row
        ]
        table_string += ('|' + '|'.join(cleaned_row) + '|' + '\n')
    table_string = table_string[:-1]
    return table_string
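

# Illustrative usage sketch (not part of the original module): table_converter takes a
# table as returned by pdfplumber's extract_tables (a list of rows, each a list of cell
# strings or None) and flattens it into the pipe-delimited string stored under the
# 'table' key. The helper below is hypothetical, for illustration only.
def _demo_table_converter():
    table = [['Name', 'Score'], ['Alice', '9.5'], [None, 'line1\nline2']]
    # Expected result:
    # |Name|Score|
    # |Alice|9.5|
    # |None|line1 line2|
    return table_converter(table)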


PARSER_SUPPORTED_FILE_TYPES = ['pdf', 'docx', 'pptx', 'txt', 'html', 'csv', 'tsv', 'xlsx', 'xls']


def get_plain_doc(doc: list):
    paras = []
    for page in doc:
        for para in page['content']:
            for k, v in para.items():
                if k in ['text', 'table', 'image']:
                    paras.append(v)
    return PARAGRAPH_SPLIT_SYMBOL.join(paras)


@register_tool('simple_doc_parser')
class SimpleDocParser(BaseTool):
    description = f'Extract the content of a document. Supported file types: {"/".join(PARSER_SUPPORTED_FILE_TYPES)}'
    parameters = [{
        'name': 'url',
        'type': 'string',
        'description': 'The path of the file to extract, either a local path or a downloadable http(s) URL.',
        'required': True
    }]

    def __init__(self, cfg: Optional[Dict] = None):
        super().__init__(cfg)
        self.data_root = self.cfg.get('path', os.path.join(DEFAULT_WORKSPACE, 'tools', self.name))
        self.extract_image = self.cfg.get('extract_image', False)
        self.structured_doc = self.cfg.get('structured_doc', False)

        self.db = Storage({'storage_root_path': self.data_root})

    def call(self, params: Union[str, dict], **kwargs) -> Union[str, list]:
        """Parse a document given by local path or URL, and return the formatted content.

        Returns:
            Extracted doc as plain text, or in the following list format:
              [
                {'page_num': 1,
                 'content': [
                              {'text': 'This is one paragraph'},
                              {'table': 'This is one table'}
                            ],
                 'title': 'If extracted, this is the title of the doc.'},
                {'page_num': 2,
                 'content': [
                              {'text': 'This is one paragraph'},
                              {'table': 'This is one table'}
                            ]}
              ]
        """
        params = self._verify_json_format_args(params)
        path = params['url']
        cached_name_ori = f'{hash_sha256(path)}_ori'
        try:
            # Directly load the parsed doc
            parsed_file = self.db.get(cached_name_ori)
            try:
                parsed_file = json5.loads(parsed_file)
            except ValueError:
                logger.warning(f'Encountered ValueError raised by json5. Fall back to json. File: {cached_name_ori}')
                parsed_file = json.loads(parsed_file)
            logger.info(f'Read parsed {path} from cache.')
        except KeyNotExistsError:
            logger.info(f'Start parsing {path}...')
            time1 = time.time()

            f_type = get_file_type(path)
            if f_type in PARSER_SUPPORTED_FILE_TYPES:
                if path.startswith('https://') or path.startswith('http://') or re.match(
                        r'^[A-Za-z]:\\', path) or re.match(r'^[A-Za-z]:/', path):
                    path = path
                else:
                    path = sanitize_chrome_file_path(path)

            os.makedirs(self.data_root, exist_ok=True)
            if is_http_url(path):
                # download online url
                tmp_file_root = os.path.join(self.data_root, hash_sha256(path))
                os.makedirs(tmp_file_root, exist_ok=True)
                path = save_url_to_local_work_dir(path, tmp_file_root)

            if f_type == 'pdf':
                parsed_file = parse_pdf(path, self.extract_image)
            elif f_type == 'docx':
                parsed_file = parse_word(path, self.extract_image)
            elif f_type == 'pptx':
                parsed_file = parse_ppt(path, self.extract_image)
            elif f_type == 'txt':
                parsed_file = parse_txt(path)
            elif f_type == 'html':
                parsed_file = parse_html_bs(path, self.extract_image)
            elif f_type == 'csv':
                parsed_file = parse_csv(path, self.extract_image)
            elif f_type == 'tsv':
                parsed_file = parse_tsv(path, self.extract_image)
            elif f_type in ['xlsx', 'xls']:
                parsed_file = parse_excel(path, self.extract_image)
            else:
                raise ValueError(
                    f'Failed: The current parser does not support this file type! Supported types: {"/".join(PARSER_SUPPORTED_FILE_TYPES)}'
                )

            for page in parsed_file:
                for para in page['content']:
                    # Todo: More attribute types
                    para['token'] = count_tokens(para.get('text', para.get('table')))
            time2 = time.time()
            logger.info(f'Finished parsing {path}. Time spent: {time2 - time1} seconds.')
            # Cache the parsing doc
            self.db.put(cached_name_ori, json.dumps(parsed_file, ensure_ascii=False, indent=2))

        if not self.structured_doc:
            return get_plain_doc(parsed_file)
        else:
            return parsed_file
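

# Minimal usage sketch (not part of the original module). Assumes the qwen_agent
# package is installed and that 'example.pdf' is an existing local file; the file
# name is purely illustrative.
if __name__ == '__main__':
    parser = SimpleDocParser({'structured_doc': True})
    pages = parser.call({'url': 'example.pdf'})
    for page in pages:
        print(page['page_num'], len(page['content']))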