# Natural_p1 / CSPON-TSC

							# from llama_cpp import Llama

# # llm = Llama(model_path="/mnt/nas/model/nlp/DeepSeek_GGUF/deepseek-coder-33b-instruct.Q5_K_M.gguf")
# # input="""You are an AI programming assistant, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.
# # ### Instruction:
# # 写一个求编辑距离的python函数
# # ### Response:
# # """
# # import time
# # a = time.time()
# # output = llm(input, max_tokens=512, echo=True,top_k=1)
# # print(output,'\ntime:',time.time()-a)
from transformers.generation import LogitsProcessor
from typing import Tuple, List, Union, Iterable
import numpy as np
from transformers.generation.logits_process import LogitsProcessorList
import torch
class StopWordsLogitsProcessor(LogitsProcessor):
    """
    :class:`transformers.LogitsProcessor` that pushes generation towards the
    end-of-sequence token as soon as a sample ends with one of the configured
    stop sequences.

    Args:
        stop_words_ids (:obj:`List[List[int]]`):
            Non-empty list of token-id sequences. A sample is considered
            stopped when its ``input_ids`` end with any of these sequences.
            A sequence equal to ``[eos_token_id]`` is dropped from the list.
        eos_token_id (:obj:`int`):
            The id of the `end-of-sequence` token whose score is raised for
            stopped samples.

    Raises:
        ValueError: if ``stop_words_ids`` is not a non-empty list of lists of
            non-negative integers.
        AssertionError: if one of the retained sequences is empty.
    """

    def __init__(self, stop_words_ids: Iterable[Iterable[int]], eos_token_id: int):
        # The outer container must be a list with at least one entry.
        if not (isinstance(stop_words_ids, List) and len(stop_words_ids) > 0):
            raise ValueError(
                f"`stop_words_ids` has to be a non-emtpy list, but is {stop_words_ids}."
            )

        # Every entry must itself be a list.
        if not all(isinstance(seq, list) for seq in stop_words_ids):
            raise ValueError(
                f"`stop_words_ids` has to be a list of lists, but is {stop_words_ids}."
            )

        # Every token id must be a non-negative (NumPy or built-in) integer.
        for seq in stop_words_ids:
            for token_id in seq:
                if not isinstance(token_id, (int, np.integer)) or token_id < 0:
                    raise ValueError(
                        f"Each list in `stop_words_ids` has to be a list of positive integers, but is {stop_words_ids}."
                    )

        # A sequence consisting only of the EOS id is dropped.
        self.stop_words_ids = [seq for seq in stop_words_ids if seq != [eos_token_id]]
        self.eos_token_id = eos_token_id

        for seq in self.stop_words_ids:
            assert len(seq) > 0, (
                "Stop words token sequences {} cannot have an empty list".format(
                    stop_words_ids
                )
            )

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
    ) -> torch.FloatTensor:
        """
        Raise the EOS score of every row whose ``input_ids`` end with a stop
        sequence. ``scores`` is modified in place and returned.
        """
        rows_to_stop = [
            row
            for row, hit in enumerate(self._calc_stopped_samples(input_ids))
            if hit
        ]
        for row in rows_to_stop:
            # Overwrite the EOS score with the fixed value 2**15.
            scores[row, self.eos_token_id] = float(2**15)
        return scores

    def _tokens_match(self, prev_tokens: torch.LongTensor, tokens: List[int]) -> bool:
        """
        Return whether ``prev_tokens`` ends with ``tokens``. An empty ``tokens``
        list matches unconditionally.
        """
        width = len(tokens)
        if width == 0:
            # An empty stop sequence is treated as always matching.
            return True
        if width > len(prev_tokens):
            # The stop sequence is longer than the history, so it cannot be a suffix.
            return False
        # Compare the last `width` ids of the history with the stop sequence.
        return prev_tokens[-width:].tolist() == tokens

    def _calc_stopped_samples(self, prev_input_ids: Iterable[int]) -> Iterable[int]:
        """
        Return one boolean per entry of ``prev_input_ids``: ``True`` when that
        entry ends with at least one of the stored stop sequences.
        """
        return [
            # any() short-circuits on the first matching stop sequence.
            any(self._tokens_match(sample_ids, seq) for seq in self.stop_words_ids)
            for sample_ids in prev_input_ids
        ]


import torch
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM,TextIteratorStreamer,TextStreamer
# Tokenizer and fp16 model are loaded from the same local checkpoint directory;
# the model is placed on cuda:2 and switched to eval mode.
tokenizer = AutoTokenizer.from_pretrained("/mnt/nas/model/nlp/Deepseek", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "/mnt/nas/model/nlp/Deepseek",
    device_map="cuda:2",
    trust_remote_code=True,
    torch_dtype=torch.float16,
).eval()
# TextIteratorStreamer collects decode options through **decode_kwargs, so they
# must be passed as individual keywords. Passing a dict under the name
# `decode_kwargs` forwards a single unknown keyword to tokenizer.decode and
# `skip_special_tokens` is silently not applied.
streamer = TextIteratorStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
    errors='ignore',
)

# def stream_generator(input_ids):
#     outputs = []
#     for token in NewGenerationMixin.generate(
#             input_ids,
#             return_dict_in_generate=False,
#             generation_config=stream_config,
#             seed=-1):
#         outputs.append(token.item())
#         yield tokenizer.decode(outputs, skip_special_tokens=True, errors='ignore')


# Single-turn chat request: a list holding one user message whose content is a
# Chinese-language prompt. The prompt asks for a MySQL query against the
# `agent_bidding_history_detail` table, lists example SQL statements, describes
# the TenderResultSqlAgent API, and prescribes a
# Question / Thought / Action / Action Input / Observation answer format.
# NOTE(review): the first line of the prompt ends with a stray `'` character --
# it is part of the runtime string and is left untouched here; confirm whether
# it is intentional.
messages=[
    { 'role': 'user', 'content': """你是一个MySQL专家，当前需要根据用户问题和上下文，生成语法正确的MySQL查询语句。'
#数据库表的表名和表结构如下：
`agent_bidding_history_detail`(
        `标题`,
        `行业`,
        `发布年份`,
        `发布月份`,
        `发布日`,
        `发布日期`,
        `省`,
        `市`,
        `区`,
        `招标单位`,
        `中标单位`,
        `代理单位`,
        `中标实际金额`,
        `招标预算金额`,
        `招标产品`,
        `招标类型`,# 有两种：[中标结果]和[招标公告]。
)
有几个注意事项：
当涉及到时间时，尽量用CURDATE(),YEAR(),MONTH()等函数
请仔细区分"去年","今年","N年前"等时间关键词。
当涉及到地理位置时，请注意省市区的区分。

以下是可供参考的SQL写法（仅供参考，也可自由发挥）:
```
0       请分析下这两年product的中标金额情况:SELECT 发布年份,SUM(中标实际金额) as 中标金额 FROM agent_bidding_history_detail WHERE 招标类型 = '中标结果' and (招标产品 like '%product%' or 标题 like '%product%') and 发布年份 BETWEEN YEAR(CURDATE())-2 AND YEAR(CURDATE()) GROUP BY 发布年份
1       company_A和company_B的一些合作记录发给我:SELECT 标题,发布日期,中标实际金额 FROM agent_bidding_history_detail WHERE 招标类型 = '中标结果' and ((招标单位='company_A' and 中标单位='company_B') or (招标单位='company_B' and 中标单位='company_A')) LIMIT 20
2       companyname【过去一年】的招标中，中标单位分布情况如何？:SELECT 中标单位,COUNT(1) as 中标个数, SUM(中标实际金额) as 中标金额 FROM agent_bidding_history_detail WHERE 招标类型 = '中标结果' and 招标单位 LIKE '%companyname%' and 发布年份 BETWEEN YEAR(CURDATE())-1 AND YEAR(CURDATE()) GROUP BY 中标单位
3       organization今年的招标情况如何？:SELECT COUNT(1) as 招标次数, SUM(招标预算金额) as 招标预算,GROUP_CONCAT( `招标产品`,',') as 招标产品 FROM agent_bidding_history_detail WHERE 招标单位 LIKE '%organization%' AND 招标类型 = '招标公告' AND 发布年份 = YEAR(CURDATE())
4       对比下organization去年和今年每个月的招标数量:SELECT 发布年份,发布月份,COUNT(1) as 招标次数 FROM agent_bidding_history_detail WHERE 招标单位 LIKE '%organization_name%' AND 招标类型 = '招标公告' GROUP BY 发布年份,发布月份 ORDER BY 发布年份,发布月份
```


下面是API列表，可以选择有助于完成用户需求的一个或多个API：
#API列表
TenderResultSqlAgent: Call this tool to interact with the 查询招投标数据库 API. What is the 查询招投标数据库 API useful for? 
                当需要连接MySQL数据库并执行一段sql时，请使用此功能。
             Format the arguments as a JSON object. Parameters: [{"name": "sql_code", "type": "string", "description": "合法的MySQL查询语言。不接受【select *】,必须使用【select xxx,yyy】"}]

请依据以上可选择的API，制定计划完成用户需求，按照如下格式返回：
Question: 用户需求。
Thought: 生成计划的原因。
Action: 当前需要使用的API,必须包含在[TenderResultSqlAgent] 中。注意这里只需要放API的名字（name_for_model），不需要额外的信息
Action Input: 当前API的输入参数。注意这里只需要放JSON格式的API的参数（parameters），不需要额外的信息
Observation: API的输出。
... (以上 /Thought/Action/Action Input/Observation的过程可以重复多次，直到产生预期的效果)。

Begin!

Question: 用户的原始Question为：最近有什么轮胎的招标需求
对Question进行分析，我认为需要用以下执行计划完成用户的查询需求：[{"action_name": "TenderResultSqlAgent", "instruction": "查询最近的轮胎招标信息"}, {"action_name": "summary", "instruction": "对查询结果进行总结，回答用户的问题"}]
已经执行结束的Action如下：

需要执行的Action如下：
Instruction: 查询最近的轮胎招标信息"""}
]

# Text markers that should end generation. The prompt asks the model to emit
# "Observation:" where the API output belongs, so generation should stop there.
stop_words = ["\nObservation:"]
# Encode without special tokens: with the default add_special_tokens=True the
# tokenizer may prepend special ids (e.g. a BOS id) that never occur at the end
# of generated text, in which case the stop sequence could never match.
stop_words_ids = (
    [tokenizer.encode(s, add_special_tokens=False) for s in stop_words]
    if stop_words
    else None
)
print('stop_words_ids:',stop_words_ids)
# Bind logits_processor on every path so later code can reference it even when
# there are no stop words.
logits_processor = LogitsProcessorList()
if stop_words_ids is not None:
    stop_words_logits_processor = StopWordsLogitsProcessor(
        # Give the processor its own copies of the id lists so that later
        # in-place edits to stop_words_ids cannot change what it matches.
        stop_words_ids=[list(ids) for ids in stop_words_ids],
        # 32021 is the id of the <|EOT|> token.
        eos_token_id=32021,
    )
    logits_processor = LogitsProcessorList([stop_words_logits_processor])

# Render the chat messages with the tokenizer's chat template and move the
# resulting tensor to the model's device.
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)

# Greedy decoding of at most 512 new tokens, streamed through `streamer`.
generation_kwargs = {
    'inputs': inputs,
    'max_new_tokens': 512,
    'do_sample': False,
    'num_return_sequences': 1,
    'streamer': streamer,
}
# generation_kwargs['logits_processor']=logits_processor
# 32021 is the id of <|EOT|> token
# Build the combined id list without mutating stop_words_ids[0]: that inner list
# was handed to StopWordsLogitsProcessor, so appending to it in place would also
# change the sequence the processor looks for. This also avoids a TypeError
# when stop_words_ids is None.
eos_token_ids = [*stop_words_ids[0], 32021] if stop_words_ids else [32021]
# generation_kwargs['eos_token_id']=eos_token_ids

# model.generate blocks until generation finishes, so it runs in a worker thread
# while this thread consumes the streamed text.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
generated_text = ''
stop_reached = False
for new_text in streamer:
    if len(new_text)==0:
        continue
    # Once the stop marker has been seen, nothing more is accumulated; otherwise
    # chunks arriving after the truncation would be appended again and text
    # following "Observation:" would leak into the result.
    if not stop_reached and new_text != '<|EOT|>':
        generated_text += new_text
        if 'Observation:' in generated_text:
            generated_text = generated_text.split('Observation:')[0]
            stop_reached = True
    # Echo the raw stream as it arrives (including anything after the marker).
    print(new_text,end='',flush=True)
# The streamer is exhausted only when generation ends; join so the worker thread
# is fully finished before the final output is printed.
thread.join()

print('-----------')
print(generated_text)

# response = """
# Action: TenderResultSqlAgent
# Action Input: {"sql_code": "SELECT 标题, 发布日期, 中标实际金额 FROM agent_bidding_history_detail WHERE 招标类型 = '招标公告' AND 招标产品 LIKE '%轮胎%' ORDER BY 发布日期 DESC LIMIT 10"}
# Observation: 
# ```
# [
#     {"标题": "2022年轮胎招标", "发布日期": "2022-01-01", "中标实际金额": 1000000},
#     {"标题": "2021年轮胎招标", "发布日期": "2021-12-31", "中标实际金额": 900000},
#     {"标题": "2021年轮胎招标", "发布日期": "2021-12-30", "中标实际金额": 800000},
#     ...
# ]
# ```

# Instruction: 对查询结果进行总结，回答用户的问题
# Observation: 最近的轮胎招标信息包括：2022年1月1日的2022年轮胎招标，中标金额为1000000元；2021年12月31日的2021年轮胎招标，中标金额为900000元；2021年12月30日的2021年轮胎招标，中标金额为800000元。

# 以上就是我根据用户需求生成的执行计划。
# """
# def parse_response_func(response):
#     func_name, func_args = "", ""
#     i = response.find("Action:")
#     j = response.find("\nAction Input:")
#     k = response.find("\nObservation:")
#     print(i,j,k)
#     if 0 <= i < j:  # If the text has `Action` and `Action input`,
#         func_name = response[i + len("Action:") : j].strip()
#         func_args = response[j + len("\nAction Input:") : k].strip()
#     if func_name:
#         choice_data = {'role':"assistant","content":response[:i],
#                         "function_call":{"name": func_name, "arguments": func_args}
#                         }
#         return choice_data
# print(parse_response_func(response=response))