12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364 |
- import sys
- import html2text
- import os
- sys.path.append('/home/ys/Qwen-Agent')
- print(os.path.dirname(__file__))
- from qwen_agent.actions import Simple
- import re
def gen_q(text):
    """Generate candidate questions for a piece of reference text.

    A non-streaming ``Simple`` agent is created and run with ``text`` as the
    reference material and a fixed Chinese instruction as the query.

    Args:
        text: Reference material passed as the first argument to the agent.

    Returns:
        Whatever ``Simple.run`` returns for this text/query pair.
    """
    # Instruction (Chinese): "Based on the reference material, propose a few
    # of the most suitable questions whose answers can be found in it."
    prompt = '根据参考资料提几个最合适的问题,问题答案可以在参考资料中找出'
    return Simple(stream=False).run(text, prompt)
def parse_pdf_pypdf(path, pre_gen_question=False):
    """Load a PDF with langchain's ``PyPDFLoader`` and return it as dicts.

    Args:
        path: Location of the PDF, handed to ``PyPDFLoader``.
        pre_gen_question: When true, each record additionally carries a
            ``'related_questions'`` entry produced by ``gen_q`` from the
            chunk's text.

    Returns:
        A list with one dict per chunk returned by ``load_and_split()``.
        Each dict has ``'page_content'`` and ``'metadata'``; with
        ``pre_gen_question`` it also has ``'related_questions'``.
    """
    # Imported lazily so langchain is only required when this parser is used.
    from langchain.document_loaders import PyPDFLoader

    documents = PyPDFLoader(path).load_and_split()

    if not pre_gen_question:
        return [
            {'page_content': doc.page_content, 'metadata': doc.metadata}
            for doc in documents
        ]

    records = []
    for doc in documents:
        # Report the chunk size (space-separated token count) before the
        # question-generation call for this chunk.
        print(len(doc.page_content.split(' ')))
        records.append({
            'page_content': doc.page_content,
            'metadata': doc.metadata,
            'related_questions': gen_q(doc.page_content),
        })
    return records
def parse_html(htmltext):
    """Convert an HTML string with ``html2text.html2text`` and return the result.

    Args:
        htmltext: HTML markup to convert.

    Returns:
        The value returned by ``html2text.html2text(htmltext)``.
    """
    converted = html2text.html2text(htmltext)
    return converted
def pre_process_html(s):
    """Clean up text extracted from an HTML page.

    Two steps are applied in order:
      1. every run of consecutive newlines is collapsed to a single newline;
      2. every occurrence of the "Add to Qwen's Reading List" marker string
         is removed.

    Args:
        s: Text to clean.

    Returns:
        The cleaned text.
    """
    collapsed = re.sub('\n+', '\n', s)
    return collapsed.replace('Add to Qwen\'s Reading List', '')
def parse_html_bs(path, pre_gen_question=False):
    """Load an HTML file with langchain's ``BSHTMLLoader`` and return dicts.

    Args:
        path: Location of the HTML file; it is opened as UTF-8.
        pre_gen_question: When true, each record additionally carries a
            ``'related_questions'`` entry produced by ``gen_q``.

    Returns:
        A list with one dict per chunk returned by ``load_and_split()``.
        ``'page_content'`` holds the chunk text after ``pre_process_html``;
        ``'metadata'`` is the chunk's metadata. With ``pre_gen_question`` a
        ``'related_questions'`` entry is included as well.
    """
    # Imported lazily so langchain is only required when this parser is used.
    from langchain.document_loaders import BSHTMLLoader

    documents = BSHTMLLoader(path, open_encoding='utf-8').load_and_split()

    if not pre_gen_question:
        return [
            {
                'page_content': pre_process_html(doc.page_content),
                'metadata': doc.metadata,
            }
            for doc in documents
        ]

    records = []
    for doc in documents:
        # Report the chunk size (space-separated token count) before the
        # question-generation call for this chunk.
        print(len(doc.page_content.split(' ')))
        records.append({
            'page_content': pre_process_html(doc.page_content),
            'metadata': doc.metadata,
            # Questions are generated from the raw chunk text, not from the
            # pre-processed version stored under 'page_content'.
            'related_questions': gen_q(doc.page_content),
        })
    return records
if __name__ == '__main__':
    # Manual smoke run: parse a sample HTML file from a developer-local path
    # (without question generation) and dump the resulting records.
    res = parse_html_bs(
        path='/home/ys/Qwen-Agent/data/test_reponse_body.html',
        pre_gen_question=False,
    )
    print(res)
|