@@ -1,9 +1,7 @@
-# from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.embeddings.huggingface import HuggingFaceEmbeddings
 from langchain.vectorstores.faiss import FAISS
 from langchain.schema import Document
 import sentence_transformers
-import jsonlines
 import json
 import os
@@ -13,14 +11,16 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"

 embedding_model_dict = {
     # "text2vec": "/data/m3e-base",
-    "text2vec": "E:\AI_temp\m3e-base",
-
+    "text2vec": r"E:\项目临时\AI大模型\m3e-base",
+    # "text2vec": r"E:\项目临时\AI大模型\bge_large_zh_v1.5",  # the bge-large-zh-v1.5 model also works for similarity search
 }
 EMBEDDING_MODEL = "text2vec"  # embedding model; the key to look up in embedding_model_dict
 DEVICE = "cpu"

-embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL],)
-embeddings.client = sentence_transformers.SentenceTransformer(embeddings.model_name,device=DEVICE)
+embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL])
+embeddings.client = sentence_transformers.SentenceTransformer(embeddings.model_name, device=DEVICE)
+
+
 # embeddings = DashScopeEmbeddings(model="text-embedding-v1",
 #                                  dashscope_api_key="sk-cb5c097eb78f4dae8daa6a833590d757")
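
# Illustrative sketch, outside the patch: replacing embeddings.client with a
# SentenceTransformer pins the encoder to DEVICE. Assuming the m3e-base weights load
# from the configured path, the embeddings object can then be exercised directly:
#
#     vec = embeddings.embed_query("萧山区工业用地")     # single query -> one vector
#     print(len(vec))                                    # 768 dimensions for m3e-base
#     doc_vecs = embeddings.embed_documents(["示例查询一", "示例查询二"])  # batch encode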
@@ -29,13 +29,17 @@ class SqlRetriever():

     def __init__(self, query_type='bidding') -> None:

         few_shot_docs = []
-        self.data = get_data_from_jsons(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data'), 'sql_examples')
+        self.data = get_data_from_jsons(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data'),
+                                        'sql_examples')
         for line in self.data:
             if line['query_type'] == query_type:
                 few_shot_docs.append(Document(page_content=line['query'], metadata={'sql_code': line['sql_code']}))
-
+        # Example document: page_content='帮我在萧山区推荐几块50亩左右的工业用地,数据表是控制性详细规划' metadata={'sql_code': "select id from sde.kzxxxgh where xzqmc = '萧山区' and ydxz like '%工业%' and abs(ydmj - 50*0.0667) <= 1 and shape is not null order by ydmj nulls last limit 5"}
+        # page_content holds the natural-language query; metadata carries the corresponding SQL
         self.vector_db = FAISS.from_documents(few_shot_docs, embeddings)

+    # Similarity search was previously implemented without a vector store: find_most_similar_queries matched queries by longest common substring.
+    # Both methods below are now deprecated; get_relevant_documents replaces them.
     def longest_common_substring(self, str1, str2):
         m, n = len(str1), len(str2)
         dp = [[0] * (n + 1) for _ in range(m + 1)]
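
# Illustrative sketch, outside the patch: get_relevant_documents is referenced in the
# deprecation note above and exercised in __main__ below, but its body is not part of
# this diff. Inferring from the demo (a top_k parameter, results printed as
# (query, sql_code) tuples), a minimal implementation wrapping FAISS similarity
# search might look like:
#
#     def get_relevant_documents(self, query, top_k=3):
#         docs = self.vector_db.similarity_search(query, k=top_k)
#         return [(d.page_content, d.metadata['sql_code']) for d in docs]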
@@ -61,34 +65,41 @@

 if __name__ == "__main__":

-    def longest_common_substring(str1, str2):
-        m, n = len(str1), len(str2)
-        dp = [[0] * (n + 1) for _ in range(m + 1)]
-        max_length = 0
-        for i in range(1, m + 1):
-            for j in range(1, n + 1):
-                if str1[i - 1] == str2[j - 1]:
-                    dp[i][j] = dp[i - 1][j - 1] + 1
-                    max_length = max(max_length, dp[i][j])
-        return max_length
-
-
-    def find_most_similar_queries(data, text, top_n=3):
-        similarity_scores = [(item, longest_common_substring(item['query'], text)) for item in data]
-        similarity_scores.sort(key=lambda x: x[1], reverse=True)
-        return [item[0] for item in similarity_scores[:top_n]]
-
-
-    # data = [{"query":"example1", "sql_code": "sql1"},{"query":"example2", "sql_code": "sql2"}]
-    # text = "Some input text"
-    # print(find_most_similar_queries(data, text))
-
-    records = []
-    data = json.load(open(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data/sql_examples.jsonl'), 'r'))
-    for line in data:
-        records.append(line)
-    results = []
-    for item in find_most_similar_queries(records, '浙江万维今年中了几个标?'):
-        results.append((item['query'], item['sql_code']))
-    print(results)
+    # def longest_common_substring(str1, str2):
+    #     m, n = len(str1), len(str2)
+    #     dp = [[0] * (n + 1) for _ in range(m + 1)]
+    #     max_length = 0
+    #     for i in range(1, m + 1):
+    #         for j in range(1, n + 1):
+    #             if str1[i - 1] == str2[j - 1]:
+    #                 dp[i][j] = dp[i - 1][j - 1] + 1
+    #                 max_length = max(max_length, dp[i][j])
+    #     return max_length
+    #
+    #
+    # def find_most_similar_queries(data, text, top_n=3):
+    #     similarity_scores = [(item, longest_common_substring(item['query'], text)) for item in data]
+    #     similarity_scores.sort(key=lambda x: x[1], reverse=True)
+    #     return [item[0] for item in similarity_scores[:top_n]]
+    #
+    #
+    # # data = [{"query":"example1", "sql_code": "sql1"},{"query":"example2", "sql_code": "sql2"}]
+    # # text = "Some input text"
+    # # print(find_most_similar_queries(data, text))
+    #
+    # records = []
+    # data = json.load(open(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data/sql_examples.jsonl'), 'r'))
+    # for line in data:
+    #     records.append(line)
+    # results = []
+    # for item in find_most_similar_queries(records, '浙江万维今年中了几个标?'):
+    #     results.append((item['query'], item['sql_code']))
+    # print(results)
     # print(find_most_similar_queries(records, '浙江万维今年中了几个标?'))
+
+    sql_retrieval = SqlRetriever("land_site_selection")
+    results = sql_retrieval.get_relevant_documents("萧山区推荐几块工业用地", top_k=2)
+    for r in results:
+        print(r)
+    # ('帮我在萧山区推荐几块50亩左右的工业用地,数据表是公告地块', "select id from sde.ecgap_klyzy where xzqmc = '萧山区' and tdyt like '%工业%' and abs(dkmj-5) <= 1 and shape is not null and sfsj=1 order by dkmj nulls last limit 5")
+    # ('帮我在萧山区推荐几块50亩左右的工业用地,数据表是控制性详细规划', "select id from sde.kzxxxgh where xzqmc = '萧山区' and ydxz like '%工业%' and abs(ydmj - 50*0.0667) <= 1 and shape is not null order by ydmj nulls last limit 5")
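
# Worked example of the deprecated longest-common-substring scoring, with a
# hypothetical stored query invented for illustration: comparing the demo text
# '浙江万维今年中了几个标?' against a record like '浙江万维去年中标几个项目' yields the
# shared substring '浙江万维', so longest_common_substring returns 4 and
# find_most_similar_queries would rank that record ahead of lower-scoring ones.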