# Natural_p1 / CSPON-TSC
import datetime
import importlib
import multiprocessing
import os
import re
import sys
from pathlib import Path
from urllib.parse import unquote, urlparse

import jsonlines
import requests
import uvicorn
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles

import config_browserqwen

# Prepend the parent of this file's directory to the module search path.
# NOTE(review): presumably this is what lets the `qwen_agent` imports below
# resolve when the file is run as a script — confirm against the repo layout.
sys.path.insert(
    0,
    str(Path(__file__).absolute().parent.parent))  # NOQA

from qwen_agent.actions import Simple  # NOQA
from qwen_agent.tools.parse_doc import parse_pdf_pypdf, parse_html_bs  # NOQA
from qwen_agent.utils.util import print_traceback, save_text_to_file  # NOQA
from qwen_agent.schema import Record  # NOQA

# Positional command-line configuration, read at import time. A missing
# argument raises IndexError; a non-numeric value for either int() raises
# ValueError.
prompt_lan = sys.argv[1]  # compared against 'CN' / 'EN' when picking the title prompt
llm_name = sys.argv[2]  # model name; its prefix selects the LLM backend below
max_ref_token = int(sys.argv[3])  # not used by the code in this part of the file
workstation_port = int(sys.argv[4])  # port used to build the allowed CORS origins
model_server = sys.argv[5]  # passed to the Qwen backend constructor
api_key = sys.argv[6]  # passed to the Qwen backend constructor
server_host = sys.argv[7]  # host handed to uvicorn.run

# FastAPI application object; the POST route is registered on it further down.
app = FastAPI()

# Origins allowed by the CORS middleware: the workstation port on the three
# usual spellings of the local machine.
origins = [
    f'http://127.0.0.1:{workstation_port}',
    f'http://localhost:{workstation_port}',
    f'http://0.0.0.0:{workstation_port}',
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=['*'],
    allow_headers=['*'],
)

# Serve the configured code-interpreter workspace directory under /static.
app.mount(
    '/static',
    StaticFiles(directory=config_browserqwen.code_interpreter_ws),
    name='static',
)

# Build the module-level `llm` object from the model name given on the command
# line. The backend module is imported lazily so only the selected one loads.
if llm_name.startswith('gpt'):
    module = 'qwen_agent.llm.gpt'
    llm = importlib.import_module(module).GPT(llm_name)
elif llm_name.startswith(('Qwen', 'qwen')):
    module = 'qwen_agent.llm.qwen'
    llm = importlib.import_module(module).Qwen(llm_name, model_server=model_server, api_key=api_key)
else:
    # Same exception type as before, but now it says what went wrong.
    raise NotImplementedError(
        f'Unsupported llm_name {llm_name!r}: expected a name starting with '
        "'gpt', 'Qwen' or 'qwen'.")


def update_pop_url(data, cache_file_popup_url):
    """Replace the contents of the popup-URL file with the URL in *data*.

    Args:
        data: Mapping that must contain a 'url' key.
        cache_file_popup_url: Path of the JSON-lines file to overwrite.

    Returns:
        The fixed status string 'Update URL'.
    """
    # Build the entry first so a missing 'url' key fails before the file is
    # opened (and truncated) for writing.
    entry = dict(url=data['url'])
    with jsonlines.open(cache_file_popup_url, mode='w') as sink:
        sink.write(entry)
    return 'Update URL'


def is_local_path(path: str) -> bool:
    """Return True when *path* starts with the ``file://`` scheme prefix."""
    return path.startswith('file://')


def get_title(text, cacheprompt=''):
    """Run a non-streaming ``Simple`` agent on *text* and return its output.

    Args:
        text: Reference text handed to the agent.
        cacheprompt: Instruction passed as the agent's second argument.

    Returns:
        Whatever ``Simple.run`` returns for these inputs.
    """
    title_agent = Simple(llm=llm, stream=False)
    return title_agent.run(text, cacheprompt)


def download_pdf(url, save_path, timeout=60):
    """Download *url* and write the response body to *save_path*.

    Args:
        url: Address to fetch with an HTTP GET.
        save_path: Destination file path; opened in binary write mode.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving an HTML error page under a .pdf name.
    response.raise_for_status()
    with open(save_path, 'wb') as file:
        file.write(response.content)


def sanitize_chrome_file_path(file_path: str) -> str:
    """Map a path taken from a Chrome ``file://`` URL to one that exists locally.

    Candidates are tried in order and the first existing one is returned:
    the path as given, the path without its leading '/', the ``/mnt/<drive>/``
    form of a ``X:/`` path, and finally the backslash-separated form. When
    none exists the input is returned unchanged.
    """
    path_exists = os.path.exists

    # Linux and macOS: the path is already usable.
    if path_exists(file_path):
        return file_path

    # Native Windows: Chrome reports '/C:/...', so drop the leading '/'.
    stripped = file_path[1:] if file_path.startswith('/') else file_path
    if path_exists(stripped):
        return stripped

    # Windows + WSL: 'C:/rest' lives at '/mnt/c/rest'.
    if re.match(r'^[A-Za-z]:/', stripped):
        drive_letter = stripped[0].lower()
        remainder = stripped[3:]
        mounted = f'/mnt/{drive_letter}/{remainder}'
        if path_exists(mounted):
            return mounted

    # Native Windows again, this time with backslash separators.
    backslashed = stripped.replace('/', '\\')
    return backslashed if path_exists(backslashed) else file_path


def _read_other_records(cache_file, url):
    """Return the records in *cache_file* whose 'url' differs from *url* ([] if the file is missing)."""
    if not os.path.exists(cache_file):
        return []
    # Use the reader as a context manager so the file handle is closed.
    with jsonlines.open(cache_file) as reader:
        return [line for line in reader if line['url'] != url]


def _write_records(cache_file, records):
    """Overwrite *cache_file* with *records*, one JSON object per line."""
    with jsonlines.open(cache_file, mode='w') as writer:
        for record in records:
            writer.write(record)


def _append_processing_record(data, cache_file):
    """Append an unchecked placeholder record for *data* while parsing is in progress."""
    placeholder = Record(url=data['url'], time='', type=data['type'], raw=[],
                         extract='', topic='', checked=False, session=[]).to_dict()
    with jsonlines.open(cache_file, mode='a') as writer:
        writer.write(placeholder)


def _discard_processing_record(data, cache_file):
    """Remove every record for *data*'s URL from *cache_file* after a failed parse."""
    _write_records(cache_file, _read_other_records(cache_file, data['url']))


def cache_data(data, cache_file):
    """Parse a browsed page or PDF and store it as a record in *cache_file*.

    A placeholder record (``checked=False``) is appended first. Once parsing
    succeeds, all records for the URL are replaced by one final record
    (``checked=True``). If parsing fails, the placeholder is removed.

    Args:
        data: Mapping with 'url', 'type' and 'content' keys. 'content' and
            'type' are overwritten in place with the parsed result.
        cache_file: Path of the JSON-lines cache file.

    Returns:
        'Cached' on success, 'failed' when the PDF or HTML could not be parsed.
    """
    extract = ''  # extract a title for display
    print('Begin cache...')
    url = data['url']

    # Case-insensitive so that suffixes such as '.Pdf' are handled too.
    if url.lower().endswith('.pdf'):
        date1 = datetime.datetime.now()

        # generate one processing record
        _append_processing_record(data, cache_file)

        # deal pdf path
        if is_local_path(url):
            parsed_url = urlparse(url)
            print('parsed_url: ', parsed_url)
            pdf_path = sanitize_chrome_file_path(unquote(parsed_url.path))
        else:
            pdf_path = url

        try:
            pdf_content = parse_pdf_pypdf(pdf_path, pre_gen_question=config_browserqwen.pre_gen_question)
        except Exception:
            print_traceback()
            # del the processing record
            _discard_processing_record(data, cache_file)
            return 'failed'

        date2 = datetime.datetime.now()
        print('parse pdf time: ', date2 - date1)
        data['content'] = pdf_content
        data['type'] = 'pdf'
        if prompt_lan == 'CN':
            cacheprompt = '参考资料是一篇论文的首页，请提取出一句话作为标题。'
        else:
            # Fall back to the English prompt for any other language code, so
            # `cacheprompt` is always bound.
            cacheprompt = 'The reference material is the first page of a paper. Please extract one sentence as the title'
        # A PDF with no parsed pages keeps the empty title instead of raising IndexError.
        if pdf_content:
            extract = get_title(pdf_content[0]['page_content'], cacheprompt=cacheprompt)
    elif data['content'] and data['type'] == 'html':
        _append_processing_record(data, cache_file)

        try:
            tmp_html_file = os.path.join(config_browserqwen.cache_root, 'tmp.html')
            save_text_to_file(tmp_html_file, data['content'])
            data['content'] = parse_html_bs(tmp_html_file, pre_gen_question=config_browserqwen.pre_gen_question)
            extract = data['content'][0]['metadata']['title']
        except Exception:
            # Mirror the PDF branch: report the failure and drop the
            # placeholder rather than indexing into unparsed content.
            print_traceback()
            _discard_processing_record(data, cache_file)
            return 'failed'

    today = datetime.date.today()
    new_record = Record(url=url, time=str(today), type=data['type'], raw=data['content'],
                        extract=extract, topic='', checked=True, session=[])
    # Replace any earlier record for this URL (including the placeholder).
    records = _read_other_records(cache_file, url)
    records.append(new_record.to_dict())  # cache
    _write_records(cache_file, records)

    return 'Cached'


def change_checkbox_state(text, cache_file):
    """Flip the 'checked' flag of the cached record(s) matching a checkbox id.

    Args:
        text: Checkbox id; everything after its first three characters is
            compared with each record's 'url'.
            NOTE(review): presumably a 3-character prefix set by the UI —
            confirm against the caller.
        cache_file: Path of the JSON-lines cache file.

    Returns:
        ``{'result': 'no file'}`` when *cache_file* does not exist, otherwise
        ``{'result': 'changed'}`` (also when no record matched).
    """
    if not os.path.exists(cache_file):
        return {'result': 'no file'}

    target_url = text[3:]
    # Read everything up front with a context manager so the reader is closed
    # before the same file is reopened for writing.
    with jsonlines.open(cache_file) as reader:
        lines = list(reader)

    for line in lines:
        if line['url'] == target_url:
            line['checked'] = not line['checked']

    with jsonlines.open(cache_file, mode='w') as writer:
        for new_line in lines:
            writer.write(new_line)

    return {'result': 'changed'}


def update_addr_for_figure(address):
    """Overwrite the configured address file with a single ``{'address': ...}`` entry.

    Args:
        address: Value stored under the 'address' key.

    Returns:
        The fixed status string 'Update Address' (also printed).
    """
    payload = dict(address=address)
    with jsonlines.open(config_browserqwen.address_file, mode='w') as sink:
        sink.write(payload)

    print('Update Address')
    return 'Update Address'


@app.post('/endpoint')
async def web_listening(request: Request):
    """Dispatch a JSON message from the browser extension by its 'task' field.

    Supported tasks:
        'change_checkbox': toggle the record named by ``data['ckid']``.
        'cache': parse and cache the page in a separate process; replies
            'caching' immediately without waiting for the result.
        'pop_url': store ``data['url']`` in the popup-URL file.
        'set_addr': store ``data['addr']`` in the address file.

    Returns:
        A JSONResponse carrying the handler's result, or a 400 response when
        the task is missing or not one of the above.
    """
    data = await request.json()
    print(data)
    msg_type = data.get('task') if isinstance(data, dict) else None

    cache_file_popup_url = os.path.join(config_browserqwen.cache_root, config_browserqwen.url_file)
    cache_file = os.path.join(config_browserqwen.cache_root, config_browserqwen.browser_cache_file)

    if msg_type == 'change_checkbox':
        rsp = change_checkbox_state(data['ckid'], cache_file)
    elif msg_type == 'cache':
        # Parsing can be slow, so it runs in its own process and the request
        # is answered right away.
        cache_obj = multiprocessing.Process(target=cache_data, args=(data, cache_file))
        cache_obj.start()
        rsp = 'caching'
    elif msg_type == 'pop_url':
        # Misleading name: 'pop_url' records the URL shown in the pop-up UI;
        # nothing is popped.
        rsp = update_pop_url(data, cache_file_popup_url)
    elif msg_type == 'set_addr':
        rsp = update_addr_for_figure(data['addr'])
    else:
        # An unknown task used to leave `rsp` unbound and surface as a 500.
        return JSONResponse(content=f'Unknown task: {msg_type}', status_code=400)

    return JSONResponse(content=rsp)


# Start the server only when this file is executed as a script.
if __name__ == '__main__':
    # The app is given as the import string 'main:app' with auto-reload on.
    # NOTE(review): this assumes the file is importable as module `main` from
    # the working directory — confirm against how the script is launched.
    uvicorn.run(app='main:app', host=server_host, port=config_browserqwen.fast_api_port, reload=True)