query_understanding.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. from LAC import LAC
  2. import hanlp
  3. import addressparser
  4. import jionlp as jio
  5. import time
  6. import re
  7. import copy
  8. lac = LAC(mode='rank')
  9. # .append(hanlp.load('COARSE_ELECTRA_SMALL_ZH'), output_key='tok') \
  10. HanLP = hanlp.pipeline() \
  11. .append(hanlp.load('FINE_ELECTRA_SMALL_ZH'), output_key='tok') \
  12. .append(hanlp.load('PKU_POS_ELECTRA_SMALL'), output_key='pos') \
  13. .append(hanlp.load('MSRA_NER_ELECTRA_SMALL_ZH'), output_key='ner', input_key='tok')
  14. # jio.recognize_location('TEST')
  15. def gen_product_search_dsl(text):
  16. loc=set()
  17. org=[]
  18. products=[]
  19. dsl = {}
  20. lac_result = lac.run(text)
  21. han_result = HanLP(text)
  22. # print(f'lac_result:{lac_result}')
  23. # print(f'han_result:{han_result}')
  24. for word,pos,level in zip(lac_result[0],lac_result[1],lac_result[2]):
  25. if pos == 'LOC':
  26. loc.add(word)
  27. else:
  28. if int(level)>2:
  29. products.append(word)
  30. for word,pos in zip(han_result['tok'],han_result['pos']):
  31. if pos == 'ns':
  32. loc.add(word)
  33. # elif pos == 'nt':
  34. # if word not in companyname:
  35. # companyname.append(word)
  36. # elif pos == 'nr':
  37. # person.add(word)
  38. dsl["query"]={"bool":{"should":[]}}
  39. # if len(text.replace(' ',''))<=6:
  40. # dsl['query']['bool']['should'].append({"match_phrase": {"ori_product_tag_list":{"query":text.replace(' ',''),"boost":3.0}}})
  41. dsl['query']['bool']['should'].append({"match": {"ori_product_tag_list":{"query": " ".join(list(products)),"boost": 2.0}}})
  42. dsl['query']['bool']['should'].append({"match_phrase": {"ori_product_tag_list":{"query": "".join(list(products)),"boost": 5.0}}})
  43. dsl['query']['bool']['should'].append({"match": {"companyname":{"query": " ".join(list(products)),"boost":1.5}}})
  44. dsl['query']['bool']['should'].append({"match_phrase": {"scope":{"query": "".join(list(products)),"boost":1.8}}})
  45. # dsl['query']['bool']['should'].append({"match": {"product":{"query": " ".join(list(products)),"boost":1.2}}})
  46. # dsl['query']['bool']['should'].append({"match": {"productchain":{"query": " ".join(list(products)),"boost":1.2}}})
  47. # dsl['query']['bool']['should'].append({"match": {"industry":{"query": " ".join(list(products)),"boost":1.2}}})
  48. if len(loc)>0:
  49. dsl["query"]['bool']['filter'] = []
  50. pros,citys = [],[]
  51. for item in addressparser.transform(list(loc)).itertuples():
  52. if getattr(item, '省') != '':
  53. pros.append(getattr(item, '省'))
  54. if getattr(item, '市') != '':
  55. citys.append(getattr(item, '市'))
  56. if len(pros)>0:
  57. dsl["query"]['bool']['filter'].append({'terms':{'province':pros}})
  58. if len(citys)>0:
  59. dsl["query"]['bool']['filter'].append({'terms':{'city':citys}})
  60. backup_dsl = {}
  61. search_words = []
  62. for word,pos,level in zip(lac_result[0],lac_result[1],lac_result[2]):
  63. if int(level)>=2:
  64. search_words.append(word)
  65. backup_dsl["query"]={"bool":{"should":[]}}
  66. backup_dsl['query']['bool']['should'].append({"match": {"ori_product_tag_list":{"query":"".join(search_words) ,"boost": 2.0}}})
  67. # backup_dsl['query']['bool']['should'].append({"match": {"seller_bidding_products_list":{"query": text,"boost": 1.8}}})
  68. backup_dsl['query']['bool']['should'].append({"match": {"companyname":{"query": "".join(search_words),"boost":1.0}}})
  69. # backup_dsl['query']['bool']['should'].append({"match": {"gs_info_product":{"query":text}}})
  70. backup_dsl['query']['bool']['should'].append({"match": {"scope":{"query": "".join(search_words),"boost":0.8}}})
  71. backup_dsl["highlight"]=dsl["highlight"]
  72. dsl['min_score']=0.1
  73. return dsl,backup_dsl
  74. def gen_company_search_dsl(text):
  75. loc = set()
  76. products = []
  77. person = set()
  78. companyname = []
  79. dsl = {}
  80. lac_result = lac.run(text)
  81. han_result = HanLP(text)
  82. # print(f'lac_result:{lac_result}')
  83. # print(f'han_result:{han_result}')
  84. for word,pos,level in zip(lac_result[0],lac_result[1],lac_result[2]):
  85. if pos == 'LOC':
  86. loc.add(word)
  87. elif pos == 'ORG':
  88. companyname.append(word)
  89. elif pos == 'PER':
  90. person.add(word)
  91. else:
  92. if int(level)>=2 and pos in ['n','nz','nw','vn'] and word not in ['老板','法人']:
  93. products.append(word)
  94. for word,pos,_,_ in han_result['ner']:
  95. if pos == 'LOCATION':
  96. loc.add(word)
  97. elif pos == 'ORGANIZATION':
  98. if word not in companyname:
  99. companyname.append(word)
  100. elif pos == 'PERSON':
  101. person.add(word)
  102. for word,pos in zip(han_result['tok'],han_result['pos']):
  103. if pos == 'ns':
  104. loc.add(word)
  105. elif pos == 'nt':
  106. if word not in companyname:
  107. companyname.append(word)
  108. elif pos == 'nr':
  109. person.add(word)
  110. dsl["query"]={"bool":{"should":[],"must":[]}}
  111. # print(f'companyname:{companyname},person:{person},loc:{loc}')
  112. if len(text)<=6:
  113. dsl['query']['bool']['should'].append({"match_phrase_prefix": {"companyname":{"query":text,"boost":2.0}}})
  114. if len(companyname)>0:
  115. dsl['query']['bool']['should'].append({"match": {"companyname":{"query": "".join(list(companyname))}}})
  116. dsl['query']['bool']['should'].append({"match_phrase_prefix": {"companyname":{"query": "".join(list(companyname)),"boost":2.0}}})
  117. if len(person)>0:
  118. dsl['query']['bool']['should'].append({"match_phrase_prefix": {"legalpersonname":{"query": "".join(list(person))}}})
  119. # if len(loc)>0:
  120. # dsl['query']['bool']['should'].append({"match": {"companyname":{"query": " ".join(list(loc))}}})
  121. if len(products)>0:
  122. dsl['query']['bool']['should'].append({"match": {"companyname":{"query": "".join(list(products))}}})
  123. dsl['query']['bool']['should'].append({"match_phrase_prefix": {"companyname":{"query": "".join(list(products)),"boost":3}}})
  124. dsl['query']['bool']['should'].append({"match": {"scope":{"query": " ".join(list(products))}}})
  125. if len(loc)>0:
  126. dsl["query"]['bool']['filter']=[]
  127. pros,citys,areas = [],[],[]
  128. for item in addressparser.transform(loc).itertuples():
  129. if getattr(item, '省') != '':
  130. pros.append(getattr(item, '省'))
  131. if getattr(item, '市') != '':
  132. citys.append(getattr(item, '市'))
  133. if getattr(item, '区') != '':
  134. areas.append(getattr(item, '区'))
  135. if len(pros)>0:
  136. dsl["query"]['bool']['filter'].append({'terms':{'province':pros}})
  137. if len(citys)>0:
  138. dsl["query"]['bool']['filter'].append({'terms':{'city':citys}})
  139. # dsl["query"]['bool']['should'].extend([{'match':{'city':' '.join(citys)}},{'match':{'province':' '.join(pros)}}])
  140. # dsl["query"]['bool']['filter']=[{'terms':{'province.keyword':pros}}]
  141. dsl["query"]['bool']['must_not']=[]
  142. dsl["query"]['bool']['must_not'].append({'terms':{'companyorgtype':['个体工商户','个体工商户3','个体工商户g','个体工商户(香港)','个体户','个体工商户(香港)','个体工户','非法人商事主体【台湾居民个体户】','个体商户','非普通个体户','个体工商','个体(台、港、澳)','个体工商户o','个体工商户(台湾)','澳门居民个体户','非法人商事主体【香港居民个体户】','个体(内地)','个体工商户(澳门)','台湾居民个体户','个体工','个体工商户数','个体工商户(台湾)','个体','非法人商事主体【普通个体户】','个体(台、港、澳) ','非香港居民个体户','个体工商户(临时)','个体(个人经营)','香港居民个体户','临时个体工商户','个体工商户2','个体户工商','个体(内地) ']}})
  143. backup_dsl = {}
  144. backup_dsl["query"]={"bool":{"should":[]}}
  145. search_words=[]
  146. for word,pos,level in zip(lac_result[0],lac_result[1],lac_result[2]):
  147. if int(level)>=2:
  148. search_words.append(word)
  149. backup_dsl['query']['bool']['should'].append({"match": {"companyname":{"query": "".join(list(search_words)),"boost":2.0}}})
  150. backup_dsl['query']['bool']['should'].append({"match": {"scope":{"query": "".join(list(search_words))}}})
  151. dsl['min_score']=0.1
  152. # dsl['query']['bool']['should']=list(set(dsl['query']['bool']['should']))
  153. # dsl['query']['bool']['must']=list(set(dsl['query']['bool']['must']))
  154. return dsl,backup_dsl
  155. def biddingquery_type(query):
  156. """
  157. 判断query是查询“中标结果”还是“招标公告“。
  158. Args:
  159. query: 自然语言的query。
  160. Returns:
  161. "中标结果" 或 "招标公告"。
  162. """
  163. # 查询中标结果的正则表达式
  164. result_pattern = re.compile(r"(中标|中了|中标情况|中了哪些标)")
  165. if result_pattern.search(query):
  166. return "中标结果"
  167. else:
  168. return None
  169. def gen_bidding_search_dsl(text):
  170. loc = set()
  171. products = []
  172. person = set()
  173. companyname = []
  174. dsl = {}
  175. lac_result = lac.run(text)
  176. han_result = HanLP(text)
  177. jio_money_result = jio.ner.extract_money(text, with_parsing=False)
  178. jio_time_result = jio.ner.extract_time(text, time_base=time.time(),with_parsing=False) #jio.parse_time(text, time_base=time.time())
  179. # print('lac_result:',lac_result)
  180. # print('han_result:',han_result)
  181. # print('jio_money_result:',jio_money_result)
  182. # print('jio_time_result:',jio_time_result)
  183. def parse_money(han_result,jio_result):
  184. money_text=set()
  185. money_num = []
  186. for word,ner,_,_ in han_result['ner']:
  187. if ner=='MONEY':
  188. if '大于' in ''.join(han_result['tok']) or '以上' in ''.join(han_result['tok']):
  189. money_text.add(word+'以上')
  190. if '小于' in ''.join(han_result['tok']) or '以下' in ''.join(han_result['tok']):
  191. money_text.add(word+'以下')
  192. for i in jio_result:
  193. if i['type']=='money':
  194. money_text.add(i['text'])
  195. for m in money_text:
  196. try:
  197. m = jio.parse_money(m)
  198. if m['definition'] in ['accurate','blur']:
  199. money_num.append(float(m['num']))
  200. elif m['definition']=='blur+':
  201. money_num.append(float(m['num']))
  202. money_num.append(10e8)
  203. elif m['definition']=='blur-':
  204. money_num.append(float(m['num']))
  205. money_num.append(0.0)
  206. except:
  207. pass
  208. return money_num
  209. def parse_date(jio_result):
  210. for date in jio_result:
  211. date_string = jio.parse_time(date['text'], time_base=time.time())
  212. if date_string['type']=='time_span':
  213. time_start,time_end = date_string['time'][0],date_string['time'][1]
  214. if time_start=='-inf':
  215. time_start=jio.parse_time('今天', time_base=time.time())['time'][0]
  216. if date_string['type']=='time_point':
  217. time_start = jio.parse_time('今天', time_base=time.time())['time'][0]
  218. time_end = date_string['time'][1]
  219. return time_start,time_end
  220. return None,None
  221. def parse_location(locs):
  222. pros,citys,areas = [],[],[]
  223. df = addressparser.transform(locs)
  224. # print(df)
  225. for item in df.itertuples():
  226. province,city,area = getattr(item, '省'),getattr(item, '市'),getattr(item, '区')
  227. if province in ['上海市','北京市','天津市','重庆市']:
  228. city = area
  229. area = ''
  230. if province != '':
  231. pros.append(province)
  232. if city != '':
  233. if city in ['浦东区','浦东新区']:
  234. citys.extend(['浦东区','浦东新区'])
  235. citys.append(city)
  236. if area != '':
  237. areas.append(area)
  238. return pros,citys,areas
  239. money_num = parse_money(han_result,jio_money_result)
  240. time_start,time_end = parse_date(jio_time_result)
  241. for word,pos,level in zip(lac_result[0],lac_result[1],lac_result[2]):
  242. if pos == 'LOC':
  243. loc.add(word)
  244. elif pos == 'ORG':
  245. companyname.append(word)
  246. else:
  247. if int(level)>=2 and pos in ['n','nz','nw'] and word not in ['月份','标','项目']:
  248. products.append(word)
  249. for word,pos,_,_ in han_result['ner']:
  250. if pos == 'LOCATION':
  251. loc.add(word)
  252. elif pos == 'ORGANIZATION':
  253. companyname.append(word)
  254. for word,pos in zip(han_result['tok'],han_result['pos']):
  255. if pos == 'ns':
  256. loc.add(word)
  257. # try:
  258. # jio_rec = jio.recognize_location(text).get('domestic',None)
  259. # if jio_rec is not None:
  260. # for item in jio_rec:
  261. # # if item[0]['province'],item[0]['city'],item[0]['county']:
  262. # loc.add(''.join(list(filter(lambda x:x!=None,[item[0]['province'],item[0]['city'],item[0]['county']]))))
  263. # except:
  264. # pass
  265. # print(f'products:{products},companyname:{companyname},loc:{loc},time_start:{time_start},time_end:{time_end}')
  266. dsl["query"]={"bool":{"should":[],"must":[],'filter':[]}}
  267. dsl['query']['bool']['should'].append({"match": {"title":{"query":text,"boost":1.2}}})
  268. dsl['query']['bool']['should'].append({"match_phrase_prefix": {"title":{"query":text,"boost":1.5}}})
  269. # dsl['query']['bool']['should'].append({"match": {"bid_unit":{"query":text}}})
  270. # dsl['query']['bool']['should'].append({"match": {"win_bid_unit":{"query":text}}})
  271. # dsl['query']['bool']['should'].append({"match": {"agent_unit":{"query":text}}})
  272. # dsl['query']['bool']['should'].append({"match": {"main_body":{"query":text}}})
  273. if biddingquery_type(text)=='中标结果':
  274. if len(companyname)>0:
  275. dsl['query']['bool']['must'].append({"match": {"win_bid_unit":{"query": " ".join(list(set(companyname)))}}})
  276. dsl['query']['bool']['should'].append({"match_phrase_prefix": {"win_bid_unit":{"query": " ".join(list(set(companyname))),'boost':3.0}}})
  277. if len(money_num)>1:
  278. min_money,max_money = min(money_num), max(money_num)
  279. if min_money < max_money:
  280. dsl["query"]['bool']['filter'].append({"range": {"win_bid_amount_real": {"gte": min_money,"lte": max_money}}})
  281. if time_start is not None:
  282. dsl["query"]['bool']['filter'].append({"range": {"publish_date": {"gte": time_start,"lte": time_end}}})
  283. dsl["query"]['bool']['filter'].append({"term": {"bid_type":'中标结果' }})
  284. else:
  285. # dsl["query"]['bool']['filter'].append({"terms": {"bid_type":['招标公告','拟建项目'] }})
  286. for i,_ in enumerate(companyname):
  287. for l in loc:
  288. companyname[i] = companyname[i].replace(l,'')
  289. companyname =list(set(companyname))
  290. if len(companyname)>0:
  291. dsl['query']['bool']['must'].append({'bool':{"should":[]}})
  292. dsl['query']['bool']['must'][0]['bool']['should'].append({"match": {"agent_unit":{"query": " ".join(list(companyname))}}})
  293. dsl['query']['bool']['must'][0]['bool']['should'].append({"match": {"bid_unit":{"query": " ".join(list(companyname))}}})
  294. dsl['query']['bool']['must'][0]['bool']['should'].append({"match_phrase": {"agent_unit":{"query": "".join(list(companyname)),'boost':3}}})
  295. dsl['query']['bool']['must'][0]['bool']['should'].append({"match_phrase": {"bid_unit":{"query": "".join(list(companyname)),'boost':3}}})
  296. dsl['query']['bool']['must'][0]['bool']['should'].append({"match": {"title":{"query": "".join(list(companyname))}}})
  297. # dsl['query']['bool']['should'].append({"match": {"agent_unit":{"query": " ".join(list(companyname))}}})
  298. # dsl['query']['bool']['should'].append({"match": {"bid_unit":{"query": " ".join(list(companyname))}}})
  299. # dsl["query"]['bool']['filter']=[]
  300. if len(money_num)>1:
  301. min_money,max_money = min(money_num), max(money_num)
  302. if min_money < max_money:
  303. dsl["query"]['bool']['filter'].append({"range": {"budget_amount_real": {"gte": min_money,"lte": max_money}}})
  304. if time_start is not None:
  305. dsl["query"]['bool']['filter'].append({"range": {"bid_end_date": {"gte": time_start,"lte": time_end}}})
  306. if len(loc)>0:
  307. pros,citys,areas = parse_location(list(loc))
  308. if len(pros)>0:
  309. dsl["query"]['bool']['filter'].append({'terms':{'province':pros}})
  310. if len(citys)>0:
  311. dsl["query"]['bool']['filter'].append({'terms':{'city':citys}})
  312. if len(areas)>0:
  313. dsl["query"]['bool']['filter'].append({'terms':{'area':areas}})
  314. if len(products)>0:
  315. dsl['query']['bool']['should'].append({"match": {"tender_products":{"query": " ".join(list(products)),"boost":1.2}}})
  316. dsl['query']['bool']['should'].append({"match": {"title":{"query": " ".join(list(products)),"boost":1.1}}})
  317. dsl['query']['bool']['should'].append({"match": {"main_body":{"query": " ".join(list(products)),"boost":1.0}}})
  318. backup_dsl={}
  319. backup_dsl["query"]={"bool":{"should":[],'filter':[]}}
  320. search_words=[]
  321. for word,pos,level in zip(lac_result[0],lac_result[1],lac_result[2]):
  322. if int(level)>=2 and pos in ['LOC','n','ORG','nw','nz']:
  323. search_words.append(word)
  324. if biddingquery_type(text)=='中标结果':
  325. backup_dsl['query']['bool']['should'].append({"match":{"win_bid_unit":{"query":"".join(search_words),"boost":1.5}}})
  326. backup_dsl['query']['bool']['should'].append({"match":{"agent_unit":{"query":"".join(search_words),"boost":1.5}}})
  327. backup_dsl["query"]['bool']['filter'].append({"term": {"bid_type":'中标结果' }})
  328. else:
  329. # backup_dsl['query']['bool']['should'].append({"match":{"title":{"query":"".join(search_words),"boost":2.0}}})
  330. # backup_dsl['query']['bool']['should'].append({"match":{"tender_products":{"query":"".join(search_words),"boost":1.8}}})
  331. # backup_dsl['query']['bool']['should'].append({"match":{"bid_unit":{"query":"".join(search_words),"boost":1.5}}})
  332. # backup_dsl['query']['bool']['should'].append({"match":{"agent_unit":{"query":"".join(search_words),"boost":1.5}}})
  333. # backup_dsl['query']['bool']['should'].append({"match":{"main_body":{"query":"".join(search_words),"boost":1.5}}})
  334. # backup_dsl["query"]['bool']['filter'].append({"terms": {"bid_type":['招标公告','拟建项目'] }})
  335. backup_dsl = copy.deepcopy(dsl)
  336. backup_dsl['query']['bool']['filter']=[]
  337. # backup_dsl["query"]['bool']['filter'].append({"terms": {"bid_type":['招标公告','拟建项目'] }})
  338. if len(loc)>0:
  339. pros,citys,areas = parse_location(list(loc))
  340. if len(areas)>0:
  341. backup_dsl["query"]['bool']['filter'].append({'terms':{'city':citys}})
  342. backup_dsl["query"]['bool']['filter'].append({'terms':{'province':pros}})
  343. elif len(citys)>0:
  344. backup_dsl["query"]['bool']['filter'].append({'terms':{'province':pros}})
  345. backup_dsl['query']['bool']['filter'] = list(filter(lambda x:x is not None,backup_dsl['query']['bool']['filter']))
  346. dsl['min_score']=0.1
  347. backup_dsl["highlight"]=dsl["highlight"]
  348. return dsl,backup_dsl