"""Helpers that turn PDF / HTML files into lists of page records.

Each record is a dict with ``page_content`` and ``metadata`` keys and, when
question pre-generation is requested, a ``related_questions`` key holding the
agent's output for that page.
"""
import os
import re
import sys

import html2text

# The Qwen-Agent checkout is added to the import path by hand; this must run
# before the ``qwen_agent`` import below.
sys.path.append('/home/ys/Qwen-Agent')
print(os.path.dirname(__file__))

from qwen_agent.actions import Simple  # noqa: E402  (needs the path tweak above)


def gen_q(text):
    """Ask the ``Simple`` agent for questions answerable from *text*.

    The prompt (in Chinese) asks for a few suitable questions whose answers
    can be found in the reference material.

    Args:
        text: Reference material passed to the agent.

    Returns:
        Whatever ``Simple.run`` returns in non-streaming mode
        (presumably the generated text -- confirm against qwen_agent).
    """
    agent = Simple(stream=False)
    query = '根据参考资料提几个最合适的问题,问题答案可以在参考资料中找出'
    return agent.run(text, query)


def _build_records(pages, pre_gen_question, clean=None):
    """Convert loaded document pages into plain dict records.

    Args:
        pages: Iterable of objects exposing ``page_content`` and ``metadata``.
        pre_gen_question: When true, also call ``gen_q`` for every page and
            store the result under ``related_questions``.
        clean: Optional callable applied to each page's text before it is
            stored (and before questions are generated from it).

    Returns:
        A list with one dict per page.
    """
    records = []
    for page in pages:
        raw = page.page_content
        content = clean(raw) if clean is not None else raw
        record = {'page_content': content, 'metadata': page.metadata}
        if pre_gen_question:
            # Progress/debug output: space-separated token count of the page.
            print(len(raw.split(' ')))
            # Questions are generated from the same text that is stored, so
            # stripped boilerplate never reaches the agent.
            record['related_questions'] = gen_q(content)
        records.append(record)
    return records


def parse_pdf_pypdf(path, pre_gen_question=False):
    """Load a PDF with LangChain's ``PyPDFLoader`` and return page records.

    Args:
        path: Location of the PDF, handed to ``PyPDFLoader``.
        pre_gen_question: When true, add ``related_questions`` to each record.

    Returns:
        A list of dicts, one per chunk returned by ``load_and_split``.
    """
    # Imported lazily so the module can be used without langchain installed
    # as long as this function is not called.
    from langchain.document_loaders import PyPDFLoader

    pages = PyPDFLoader(path).load_and_split()
    return _build_records(pages, pre_gen_question)


def parse_html(htmltext):
    """Convert an HTML string to text using ``html2text``."""
    return html2text.html2text(htmltext)


def pre_process_html(s):
    """Clean text extracted from a saved HTML page.

    Collapses every run of newlines into a single newline and removes the
    literal string "Add to Qwen's Reading List".
    """
    # replace multiple newlines
    s = re.sub('\n+', '\n', s)
    # replace special string
    s = s.replace('Add to Qwen\'s Reading List', '')
    return s


def parse_html_bs(path, pre_gen_question=False):
    """Load an HTML file with LangChain's ``BSHTMLLoader`` and return records.

    The text of every page is passed through ``pre_process_html`` before it
    is stored.

    Args:
        path: Location of the HTML file; it is opened as UTF-8.
        pre_gen_question: When true, add ``related_questions`` to each record.

    Returns:
        A list of dicts, one per chunk returned by ``load_and_split``.
    """
    # Imported lazily so the module can be used without langchain installed
    # as long as this function is not called.
    from langchain.document_loaders import BSHTMLLoader

    pages = BSHTMLLoader(path, open_encoding='utf-8').load_and_split()
    return _build_records(pages, pre_gen_question, clean=pre_process_html)


if __name__ == '__main__':
    res = parse_html_bs('/home/ys/Qwen-Agent/data/test_reponse_body.html')
    print(res)