# SqlMemory.py
  1. from langchain.embeddings.huggingface import HuggingFaceEmbeddings
  2. from langchain.vectorstores.faiss import FAISS
  3. from langchain.schema import Document
  4. import sentence_transformers
  5. import json
  6. import os
  7. from qwen_agent.utils.util import get_data_from_jsons
  8. os.environ["TOKENIZERS_PARALLELISM"] = "false"
  9. embedding_model_dict = {
  10. # "text2vec": "/data/m3e-base",
  11. "text2vec": "E:\项目临时\AI大模型\m3e-base",
  12. # "text2vec": r"E:\项目临时\AI大模型\bge_large_zh_v1.5",#使用bge-large-zh-v3模型也可以进行相似度搜索
  13. }
  14. EMBEDDING_MODEL = "text2vec" # embedding 模型,对应 embedding_model_dict
  15. DEVICE = "cpu"
  16. embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL])
  17. embeddings.client = sentence_transformers.SentenceTransformer(embeddings.model_name, device=DEVICE)
  18. # embeddings = DashScopeEmbeddings(model="text-embedding-v1",
  19. # dashscope_api_key="sk-cb5c097eb78f4dae8daa6a833590d757")
  20. class SqlRetriever():
  21. def __init__(self, query_type='bidding') -> None:
  22. few_shot_docs = []
  23. self.data = get_data_from_jsons(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data'),
  24. 'sql_examples')
  25. for line in self.data:
  26. if line['query_type'] == query_type:
  27. few_shot_docs.append(Document(page_content=line['query'], metadata={'sql_code': line['sql_code']}))
  28. # page_content='帮我在萧山区推荐几块50亩左右的工业用地,数据表是控制性详细规划' metadata={'sql_code': "select id from sde.kzxxxgh where xzqmc = '萧山区' and ydxz like '%工业%' and abs(ydmj - 50*0.0667) <= 1 and shape is not null order by ydmj nulls last limit 5"}
  29. # page_content是query,metadata是sql
  30. self.vector_db = FAISS.from_documents(few_shot_docs, embeddings)
  31. # 以前没有用向量数据库进行相似度搜索,用的是find_most_similar_queries进行字符串匹配实现的这些功能
  32. # 现在这2个方法已经被废弃调了,使用get_relevant_documents方法进行替代
  33. def longest_common_substring(self, str1, str2):
  34. m, n = len(str1), len(str2)
  35. dp = [[0] * (n + 1) for _ in range(m + 1)]
  36. max_length = 0
  37. for i in range(1, m + 1):
  38. for j in range(1, n + 1):
  39. if str1[i - 1] == str2[j - 1]:
  40. dp[i][j] = dp[i - 1][j - 1] + 1
  41. max_length = max(max_length, dp[i][j])
  42. return max_length
  43. def find_most_similar_queries(self, data, text, top_n=3):
  44. similarity_scores = [(item, self.longest_common_substring(item['query'], text)) for item in data]
  45. similarity_scores.sort(key=lambda x: x[1], reverse=True)
  46. return [item[0] for item in similarity_scores[:top_n]]
  47. def get_relevant_documents(self, query, top_k=4):
  48. results = []
  49. for r in self.vector_db.similarity_search(query, k=top_k):
  50. results.append((r.page_content, r.metadata['sql_code']))
  51. return results
  52. if __name__ == "__main__":
  53. # def longest_common_substring(str1, str2):
  54. # m, n = len(str1), len(str2)
  55. # dp = [[0] * (n + 1) for _ in range(m + 1)]
  56. # max_length = 0
  57. # for i in range(1, m + 1):
  58. # for j in range(1, n + 1):
  59. # if str1[i - 1] == str2[j - 1]:
  60. # dp[i][j] = dp[i - 1][j - 1] + 1
  61. # max_length = max(max_length, dp[i][j])
  62. # return max_length
  63. #
  64. #
  65. # def find_most_similar_queries(data, text, top_n=3):
  66. # similarity_scores = [(item, longest_common_substring(item['query'], text)) for item in data]
  67. # similarity_scores.sort(key=lambda x: x[1], reverse=True)
  68. # return [item[0] for item in similarity_scores[:top_n]]
  69. #
  70. #
  71. # # data = [{"query":"example1", "sql_code": "sql1"},{"query":"example2", "sql_code": "sql2"}]
  72. # # text = "Some input text"
  73. # # print(find_most_similar_queries(data, text))
  74. #
  75. # records = []
  76. # data = json.load(open(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data/sql_examples.jsonl'), 'r'))
  77. # for line in data:
  78. # records.append(line)
  79. # results = []
  80. # for item in find_most_similar_queries(records, '浙江万维今年中了几个标?'):
  81. # results.append((item['query'], item['sql_code']))
  82. # print(results)
  83. # print(find_most_similar_queries(records, '浙江万维今年中了几个标?'))
  84. sql_retrieval = SqlRetriever("land_site_selection")
  85. results = sql_retrieval.get_relevant_documents("萧山区推荐几块工业用地", top_k=2)
  86. for r in results:
  87. print(r)
  88. # ('帮我在萧山区推荐几块50亩左右的工业用地,数据表是公告地块', "select id from sde.ecgap_klyzy where xzqmc = '萧山区' and tdyt like '%工业%' and abs(dkmj-5) <= 1 and shape is not null and sfsj=1 order by dkmj nulls last limit 5")
  89. # ('帮我在萧山区推荐几块50亩左右的工业用地,数据表是控制性详细规划', "select id from sde.kzxxxgh where xzqmc = '萧山区' and ydxz like '%工业%' and abs(ydmj - 50*0.0667) <= 1 and shape is not null order by ydmj nulls last limit 5")