# SqlMemory.py
  1. # from langchain.embeddings.openai import OpenAIEmbeddings
  2. from langchain.embeddings.huggingface import HuggingFaceEmbeddings
  3. from langchain.vectorstores.faiss import FAISS
  4. from langchain.schema import Document
  5. import sentence_transformers
  6. import jsonlines
  7. import json
  8. import os
  9. from qwen_agent.utils.util import get_data_from_jsons
  10. os.environ["TOKENIZERS_PARALLELISM"] = "false"
  11. embedding_model_dict = {
  12. "text2vec": "/data/m3e-base",
  13. # "text2vec": "D:\m3e-base",
  14. }
  15. EMBEDDING_MODEL = "text2vec" # embedding 模型,对应 embedding_model_dict
  16. DEVICE = "cpu"
  17. embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[EMBEDDING_MODEL],)
  18. embeddings.client = sentence_transformers.SentenceTransformer(embeddings.model_name,device=DEVICE)
  19. # embeddings = DashScopeEmbeddings(model="text-embedding-v1",
  20. # dashscope_api_key="sk-cb5c097eb78f4dae8daa6a833590d757")
  21. class SqlRetriever():
  22. def __init__(self, query_type='bidding') -> None:
  23. few_shot_docs = []
  24. self.data = get_data_from_jsons(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data'), 'sql_examples')
  25. for line in self.data:
  26. if line['query_type'] == query_type:
  27. few_shot_docs.append(Document(page_content=line['query'], metadata={'sql_code': line['sql_code']}))
  28. self.vector_db = FAISS.from_documents(few_shot_docs, embeddings)
  29. def longest_common_substring(self, str1, str2):
  30. m, n = len(str1), len(str2)
  31. dp = [[0] * (n + 1) for _ in range(m + 1)]
  32. max_length = 0
  33. for i in range(1, m + 1):
  34. for j in range(1, n + 1):
  35. if str1[i - 1] == str2[j - 1]:
  36. dp[i][j] = dp[i - 1][j - 1] + 1
  37. max_length = max(max_length, dp[i][j])
  38. return max_length
  39. def find_most_similar_queries(self, data, text, top_n=3):
  40. similarity_scores = [(item, self.longest_common_substring(item['query'], text)) for item in data]
  41. similarity_scores.sort(key=lambda x: x[1], reverse=True)
  42. return [item[0] for item in similarity_scores[:top_n]]
  43. def get_relevant_documents(self, query, top_k=4):
  44. results = []
  45. for r in self.vector_db.similarity_search(query, k=top_k):
  46. results.append((r.page_content, r.metadata['sql_code']))
  47. return results
  48. if __name__ == "__main__":
  49. def longest_common_substring(str1, str2):
  50. m, n = len(str1), len(str2)
  51. dp = [[0] * (n + 1) for _ in range(m + 1)]
  52. max_length = 0
  53. for i in range(1, m + 1):
  54. for j in range(1, n + 1):
  55. if str1[i - 1] == str2[j - 1]:
  56. dp[i][j] = dp[i - 1][j - 1] + 1
  57. max_length = max(max_length, dp[i][j])
  58. return max_length
  59. def find_most_similar_queries(data, text, top_n=3):
  60. similarity_scores = [(item, longest_common_substring(item['query'], text)) for item in data]
  61. similarity_scores.sort(key=lambda x: x[1], reverse=True)
  62. return [item[0] for item in similarity_scores[:top_n]]
  63. # data = [{"query":"example1", "sql_code": "sql1"},{"query":"example2", "sql_code": "sql2"}]
  64. # text = "Some input text"
  65. # print(find_most_similar_queries(data, text))
  66. records = []
  67. data = json.load(open(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data/sql_examples.jsonl'), 'r'))
  68. for line in data:
  69. records.append(line)
  70. results = []
  71. for item in find_most_similar_queries(records, '浙江万维今年中了几个标?'):
  72. results.append((item['query'], item['sql_code']))
  73. print(results)
  74. # print(find_most_similar_queries(records, '浙江万维今年中了几个标?'))