parse_doc.py 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. import sys
  2. import html2text
  3. import os
  4. sys.path.append('/home/ys/Qwen-Agent')
  5. print(os.path.dirname(__file__))
  6. from qwen_agent.actions import Simple
  7. import re
  8. def gen_q(text):
  9. agent = Simple(stream=False)
  10. query = '根据参考资料提几个最合适的问题,问题答案可以在参考资料中找出'
  11. res = agent.run(text, query)
  12. return res
  13. def parse_pdf_pypdf(path, pre_gen_question=False):
  14. from langchain.document_loaders import PyPDFLoader
  15. loader = PyPDFLoader(path)
  16. pages = loader.load_and_split()
  17. # print(pages)
  18. if pre_gen_question:
  19. res = []
  20. for page in pages:
  21. print(len(page.page_content.split(' ')))
  22. res.append({'page_content': page.page_content, 'metadata': page.metadata, 'related_questions': gen_q(page.page_content)})
  23. else:
  24. res = [{'page_content': page.page_content, 'metadata': page.metadata} for page in pages]
  25. return res
  26. def parse_html(htmltext):
  27. return html2text.html2text(htmltext)
  28. def pre_process_html(s):
  29. # replace multiple newlines
  30. s = re.sub('\n+', '\n', s)
  31. # replace special string
  32. s = s.replace('Add to Qwen\'s Reading List', '')
  33. return s
  34. def parse_html_bs(path, pre_gen_question=False):
  35. from langchain.document_loaders import BSHTMLLoader
  36. loader = BSHTMLLoader(path, open_encoding='utf-8')
  37. pages = loader.load_and_split()
  38. if pre_gen_question:
  39. res = []
  40. for page in pages:
  41. print(len(page.page_content.split(' ')))
  42. res.append({'page_content': pre_process_html(page.page_content), 'metadata': page.metadata, 'related_questions': gen_q(page.page_content)})
  43. else:
  44. res = [{'page_content': pre_process_html(page.page_content), 'metadata': page.metadata} for page in pages]
  45. return res
  46. if __name__ == '__main__':
  47. res = parse_html_bs('/home/ys/Qwen-Agent/data/test_reponse_body.html')
  48. print(res)