# main.py
  1. import datetime
  2. import importlib
  3. import multiprocessing
  4. import os
  5. import re
  6. import sys
  7. from pathlib import Path
  8. from urllib.parse import unquote, urlparse
  9. import jsonlines
  10. import requests
  11. import uvicorn
  12. from fastapi import FastAPI, Request
  13. from fastapi.middleware.cors import CORSMiddleware
  14. from fastapi.responses import JSONResponse
  15. from fastapi.staticfiles import StaticFiles
  16. import config_browserqwen
  17. sys.path.insert(
  18. 0,
  19. str(Path(__file__).absolute().parent.parent)) # NOQA
  20. from qwen_agent.actions import Simple # NOQA
  21. from qwen_agent.tools.parse_doc import parse_pdf_pypdf, parse_html_bs # NOQA
  22. from qwen_agent.utils.util import print_traceback, save_text_to_file # NOQA
  23. from qwen_agent.schema import Record # NOQA
  24. prompt_lan = sys.argv[1]
  25. llm_name = sys.argv[2]
  26. max_ref_token = int(sys.argv[3])
  27. workstation_port = int(sys.argv[4])
  28. model_server = sys.argv[5]
  29. api_key = sys.argv[6]
  30. server_host = sys.argv[7]
  31. app = FastAPI()
  32. origins = [
  33. 'http://127.0.0.1:'+str(workstation_port),
  34. 'http://localhost:'+str(workstation_port),
  35. 'http://0.0.0.0:'+str(workstation_port),
  36. ]
  37. app.add_middleware(
  38. CORSMiddleware,
  39. allow_origins=origins,
  40. allow_credentials=True,
  41. allow_methods=['*'],
  42. allow_headers=['*'],
  43. )
  44. app.mount('/static', StaticFiles(directory=config_browserqwen.code_interpreter_ws), name='static')
  45. if llm_name.startswith('gpt'):
  46. module = 'qwen_agent.llm.gpt'
  47. llm = importlib.import_module(module).GPT(llm_name)
  48. elif llm_name.startswith('Qwen') or llm_name.startswith('qwen'):
  49. module = 'qwen_agent.llm.qwen'
  50. llm = importlib.import_module(module).Qwen(llm_name, model_server=model_server, api_key=api_key)
  51. else:
  52. raise NotImplementedError
  53. def update_pop_url(data, cache_file_popup_url):
  54. new_line = {'url': data['url']}
  55. with jsonlines.open(cache_file_popup_url, mode='w') as writer:
  56. writer.write(new_line)
  57. response = 'Update URL'
  58. return response
  59. def is_local_path(path):
  60. if path.startswith('file://'):
  61. return True
  62. else:
  63. return False
  64. def get_title(text, cacheprompt=''):
  65. agent = Simple(llm=llm, stream=False)
  66. extract = agent.run(text, cacheprompt)
  67. return extract
  68. def download_pdf(url, save_path):
  69. response = requests.get(url)
  70. with open(save_path, 'wb') as file:
  71. file.write(response.content)
  72. def sanitize_chrome_file_path(file_path: str) -> str:
  73. # For Linux and macOS.
  74. if os.path.exists(file_path):
  75. return file_path
  76. # For native Windows, drop the leading '/' in '/C:/'
  77. win_path = file_path
  78. if win_path.startswith('/'):
  79. win_path = win_path[1:]
  80. if os.path.exists(win_path):
  81. return win_path
  82. # For Windows + WSL.
  83. if re.match(r'^[A-Za-z]:/', win_path):
  84. wsl_path = f'/mnt/{win_path[0].lower()}/{win_path[3:]}'
  85. if os.path.exists(wsl_path):
  86. return wsl_path
  87. # For native Windows, replace / with \.
  88. win_path = win_path.replace('/', '\\')
  89. if os.path.exists(win_path):
  90. return win_path
  91. return file_path
  92. def cache_data(data, cache_file):
  93. extract = '' # extract a title for display
  94. print('Begin cache...')
  95. if data['url'][-4:] in ['.pdf', '.PDF']:
  96. date1 = datetime.datetime.now()
  97. # generate one processing record
  98. new_record = Record(url=data['url'], time='', type=data['type'], raw=[],
  99. extract='', topic='', checked=False, session=[]).to_dict()
  100. with jsonlines.open(cache_file, mode='a') as writer:
  101. writer.write(new_record)
  102. # deal pdf path
  103. if is_local_path(data['url']):
  104. parsed_url = urlparse(data['url'])
  105. print('parsed_url: ', parsed_url)
  106. pdf_path = unquote(parsed_url.path)
  107. pdf_path = sanitize_chrome_file_path(pdf_path)
  108. else:
  109. pdf_path = data['url']
  110. try:
  111. pdf_content = parse_pdf_pypdf(pdf_path, pre_gen_question=config_browserqwen.pre_gen_question)
  112. except Exception:
  113. print_traceback()
  114. # del the processing record
  115. lines = []
  116. if os.path.exists(cache_file):
  117. for line in jsonlines.open(cache_file):
  118. if line['url'] != data['url']:
  119. lines.append(line)
  120. with jsonlines.open(cache_file, mode='w') as writer:
  121. for new_line in lines:
  122. writer.write(new_line)
  123. return 'failed'
  124. date2 = datetime.datetime.now()
  125. print('parse pdf time: ', date2 - date1)
  126. data['content'] = pdf_content
  127. data['type'] = 'pdf'
  128. if prompt_lan == 'CN':
  129. cacheprompt = '参考资料是一篇论文的首页,请提取出一句话作为标题。'
  130. elif prompt_lan == 'EN':
  131. cacheprompt = 'The reference material is the first page of a paper. Please extract one sentence as the title'
  132. extract = get_title(pdf_content[0]['page_content'], cacheprompt=cacheprompt)
  133. else:
  134. if data['content'] and data['type'] == 'html':
  135. new_record = Record(url=data['url'], time='', type=data['type'], raw=[], extract='', topic='', checked=False, session=[]).to_dict()
  136. with jsonlines.open(cache_file, mode='a') as writer:
  137. writer.write(new_record)
  138. try:
  139. tmp_html_file = os.path.join(config_browserqwen.cache_root, 'tmp.html')
  140. save_text_to_file(tmp_html_file, data['content'])
  141. data['content'] = parse_html_bs(tmp_html_file, pre_gen_question=config_browserqwen.pre_gen_question)
  142. except Exception:
  143. print_traceback()
  144. extract = data['content'][0]['metadata']['title']
  145. today = datetime.date.today()
  146. new_record = Record(url=data['url'], time=str(today), type=data['type'], raw=data['content'],
  147. extract=extract, topic='', checked=True, session=[])
  148. lines = []
  149. if os.path.exists(cache_file):
  150. for line in jsonlines.open(cache_file):
  151. if line['url'] != data['url']:
  152. lines.append(line)
  153. lines.append(new_record.to_dict()) # cache
  154. with jsonlines.open(cache_file, mode='w') as writer:
  155. for new_line in lines:
  156. writer.write(new_line)
  157. response = 'Cached'
  158. return response
  159. def change_checkbox_state(text, cache_file):
  160. if not os.path.exists(cache_file):
  161. return {'result': 'no file'}
  162. lines = []
  163. for line in jsonlines.open(cache_file):
  164. if line['url'] == text[3:]:
  165. if line['checked']:
  166. line['checked'] = False
  167. else:
  168. line['checked'] = True
  169. lines.append(line)
  170. with jsonlines.open(cache_file, mode='w') as writer:
  171. for new_line in lines:
  172. writer.write(new_line)
  173. return {'result': 'changed'}
  174. def update_addr_for_figure(address):
  175. new_line = {'address': address}
  176. with jsonlines.open(config_browserqwen.address_file, mode='w') as writer:
  177. writer.write(new_line)
  178. response = 'Update Address'
  179. print('Update Address')
  180. return response
  181. @app.post('/endpoint')
  182. async def web_listening(request: Request):
  183. data = await request.json()
  184. print(data)
  185. msg_type = data['task']
  186. cache_file_popup_url = os.path.join(config_browserqwen.cache_root, config_browserqwen.url_file)
  187. cache_file = os.path.join(config_browserqwen.cache_root, config_browserqwen.browser_cache_file)
  188. if msg_type == 'change_checkbox':
  189. rsp = change_checkbox_state(data['ckid'], cache_file)
  190. elif msg_type == 'cache':
  191. cache_obj = multiprocessing.Process(target=cache_data, args=(data, cache_file))
  192. cache_obj.start()
  193. # rsp = cache_data(data, cache_file)
  194. rsp = 'caching'
  195. elif msg_type == 'pop_url':
  196. # What a misleading name! pop_url actually means add_url. pop is referring to the pop_up ui.
  197. rsp = update_pop_url(data, cache_file_popup_url)
  198. elif msg_type == 'set_addr':
  199. rsp = update_addr_for_figure(data['addr'])
  200. return JSONResponse(content=rsp)
  201. if __name__ == '__main__':
  202. uvicorn.run(app='main:app', host=server_host, port=config_browserqwen.fast_api_port, reload=True)