"""Few-shot SQL example retriever.

Indexes (natural-language query, SQL code) example pairs in a FAISS vector
store and retrieves the examples most relevant to an incoming query, for use
as few-shot prompts in text-to-SQL generation.
"""
# NOTE(review): a commented-out DashScopeEmbeddings snippet containing a
# hardcoded API key was removed from this file — never commit credentials,
# even in comments.
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.schema import Document
import sentence_transformers
import jsonlines
import json
import os

from qwen_agent.utils.util import get_data_from_jsons

os.environ["TOKENIZERS_PARALLELISM"] = "false"

embedding_model_dict = {
    # "text2vec": "/data/m3e-base",
    # BUGFIX: raw string — the previous "E:\AI_temp\..." relied on "\A" not
    # being an escape sequence, which is deprecated and will become a syntax
    # error; r"" keeps the exact same path value unambiguously.
    "text2vec": r"E:\AI_temp\m3e-base",
}

EMBEDDING_MODEL = "text2vec"  # key into embedding_model_dict
DEVICE = "cpu"

embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL])
# Re-create the client explicitly so the device is honored.
embeddings.client = sentence_transformers.SentenceTransformer(
    embeddings.model_name, device=DEVICE)


def longest_common_substring(str1, str2):
    """Return the length of the longest common substring of str1 and str2.

    Classic O(len(str1) * len(str2)) dynamic programming: dp[i][j] holds the
    length of the common suffix of str1[:i] and str2[:j]; the answer is the
    maximum cell value. Returns 0 when either string is empty.
    """
    m, n = len(str1), len(str2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    max_length = 0
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if str1[i - 1] == str2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
                max_length = max(max_length, dp[i][j])
    return max_length


def find_most_similar_queries(data, text, top_n=3):
    """Return the top_n items of `data` whose 'query' field shares the
    longest common substring with `text`.

    Each item of `data` is expected to be a dict with a 'query' key.
    Ties are broken by original order (stable sort).
    """
    similarity_scores = [
        (item, longest_common_substring(item['query'], text)) for item in data
    ]
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
    return [item for item, _score in similarity_scores[:top_n]]


class SqlRetriever():
    """Retrieves few-shot (query, sql_code) examples for one query type."""

    def __init__(self, query_type='bidding') -> None:
        """Load SQL examples matching `query_type` from the bundled data
        directory and index them in a FAISS vector store."""
        self.data = get_data_from_jsons(
            os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data'),
            'sql_examples')
        few_shot_docs = [
            Document(page_content=line['query'],
                     metadata={'sql_code': line['sql_code']})
            for line in self.data if line['query_type'] == query_type
        ]
        self.vector_db = FAISS.from_documents(few_shot_docs, embeddings)

    def longest_common_substring(self, str1, str2):
        """Length of the longest common substring; delegates to the
        module-level helper (kept as a method for backward compatibility)."""
        return longest_common_substring(str1, str2)

    def find_most_similar_queries(self, data, text, top_n=3):
        """Top-n items of `data` most similar to `text`; delegates to the
        module-level helper (kept as a method for backward compatibility)."""
        return find_most_similar_queries(data, text, top_n)

    def get_relevant_documents(self, query, top_k=4):
        """Return up to top_k (example_query, sql_code) tuples semantically
        closest to `query` according to the FAISS vector store."""
        return [(r.page_content, r.metadata['sql_code'])
                for r in self.vector_db.similarity_search(query, k=top_k)]


if __name__ == "__main__":
    # Smoke test: rank the bundled examples against a sample question using
    # the longest-common-substring heuristic (no embedding model needed).
    path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                        'data/sql_examples.jsonl')
    # BUGFIX: the previous json.load(open(path, 'r')) leaked the file handle
    # and would fail outright on true JSON-Lines content. Try a plain JSON
    # array first (preserving the old behavior if the file is one), then
    # fall back to parsing one JSON object per line.
    with open(path, 'r', encoding='utf-8') as f:
        try:
            records = json.load(f)
        except json.JSONDecodeError:
            f.seek(0)
            records = [json.loads(line) for line in f if line.strip()]
    results = [(item['query'], item['sql_code'])
               for item in find_most_similar_queries(records, '浙江万维今年中了几个标?')]
    print(results)