123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406 |
- from LAC import LAC
- import hanlp
- import addressparser
- import jionlp as jio
- import time
- import re
- import copy
# Baidu LAC tagger in 'rank' mode. The functions below read its run() output
# as three parallel sequences: words, tags and per-word ranks.
lac = LAC(mode='rank')

# HanLP pipeline producing three keys per input text:
#   'tok' - fine-grained tokens,
#   'pos' - PKU part-of-speech tags,
#   'ner' - MSRA named entities computed from 'tok'.
# A coarse tokenizer ('COARSE_ELECTRA_SMALL_ZH') was tried as an alternative
# for the 'tok' stage and is currently not used.
HanLP = (
    hanlp.pipeline()
    .append(hanlp.load('FINE_ELECTRA_SMALL_ZH'), output_key='tok')
    .append(hanlp.load('PKU_POS_ELECTRA_SMALL'), output_key='pos')
    .append(hanlp.load('MSRA_NER_ELECTRA_SMALL_ZH'), output_key='ner', input_key='tok')
)
# A jio.recognize_location('TEST') call used to sit here and is disabled.
def gen_product_search_dsl(text):
    """Build the primary and fallback Elasticsearch DSLs for a product search.

    The query is analysed with the module-level ``lac`` and ``HanLP`` models.
    LAC words tagged ``LOC`` and HanLP tokens tagged ``ns`` are collected as
    locations; every other LAC word whose rank is greater than 2 becomes a
    product keyword.

    Args:
        text: Natural-language query string.

    Returns:
        A ``(dsl, backup_dsl)`` tuple of dicts.

        * ``dsl`` scores the product keywords against ``ori_product_tag_list``,
          ``companyname`` and ``scope``, adds ``province`` / ``city`` ``terms``
          filters when locations resolve through ``addressparser``, and sets
          ``min_score`` to 0.1.
        * ``backup_dsl`` is a looser query that matches all LAC words with
          rank >= 2 (concatenated) against the same three fields, without
          location filters or ``min_score``.
    """
    lac_result = lac.run(text)
    han_result = HanLP(text)
    lac_rows = list(zip(lac_result[0], lac_result[1], lac_result[2]))

    loc = set()
    products = []
    for word, pos, level in lac_rows:
        if pos == 'LOC':
            loc.add(word)
        elif int(level) > 2:
            products.append(word)
    for word, pos in zip(han_result['tok'], han_result['pos']):
        if pos == 'ns':
            loc.add(word)

    # "match" clauses get space-separated keywords, "match_phrase" clauses the
    # keywords concatenated back into a single phrase.
    spaced_products = " ".join(products)
    joined_products = "".join(products)
    dsl = {"query": {"bool": {"should": [
        {"match": {"ori_product_tag_list": {"query": spaced_products, "boost": 2.0}}},
        {"match_phrase": {"ori_product_tag_list": {"query": joined_products, "boost": 5.0}}},
        {"match": {"companyname": {"query": spaced_products, "boost": 1.5}}},
        {"match_phrase": {"scope": {"query": joined_products, "boost": 1.8}}},
    ]}}}

    if loc:
        pros, citys = [], []
        for item in addressparser.transform(list(loc)).itertuples():
            province, city = getattr(item, '省'), getattr(item, '市')
            if province != '':
                pros.append(province)
            if city != '':
                citys.append(city)
        filters = []
        if pros:
            filters.append({'terms': {'province': pros}})
        if citys:
            filters.append({'terms': {'city': citys}})
        dsl["query"]['bool']['filter'] = filters

    search_text = "".join(word for word, _, level in lac_rows if int(level) >= 2)
    backup_dsl = {"query": {"bool": {"should": [
        {"match": {"ori_product_tag_list": {"query": search_text, "boost": 2.0}}},
        {"match": {"companyname": {"query": search_text, "boost": 1.0}}},
        {"match": {"scope": {"query": search_text, "boost": 0.8}}},
    ]}}}

    # The primary DSL built above has no "highlight" section, so reading
    # dsl["highlight"] unconditionally raised KeyError on every call. Mirror
    # the section only if one is ever added to the primary DSL.
    if "highlight" in dsl:
        backup_dsl["highlight"] = dsl["highlight"]

    dsl['min_score'] = 0.1
    return dsl, backup_dsl
def gen_company_search_dsl(text):
    """Build the primary and fallback Elasticsearch DSLs for a company search.

    Entities are gathered from three sources, in this order: LAC tags
    (``LOC`` / ``ORG`` / ``PER``), HanLP named entities (``LOCATION`` /
    ``ORGANIZATION`` / ``PERSON``) and HanLP part-of-speech tags
    (``ns`` / ``nt`` / ``nr``). Remaining LAC words with rank >= 2 and a tag
    in ``n``, ``nz``, ``nw``, ``vn`` are used as product keywords, except
    '老板' and '法人'.

    Args:
        text: Natural-language query string.

    Returns:
        A ``(dsl, backup_dsl)`` tuple of dicts. ``dsl`` combines ``should``
        clauses on ``companyname``, ``legalpersonname`` and ``scope``,
        optional ``province`` / ``city`` filters, a ``must_not`` clause on
        ``companyorgtype`` and ``min_score`` 0.1. ``backup_dsl`` matches the
        concatenated LAC words with rank >= 2 against ``companyname`` and
        ``scope``.
    """
    lac_result = lac.run(text)
    han_result = HanLP(text)

    places, people = set(), set()
    org_names, product_words = [], []

    def remember_org(name):
        # HanLP organisations are only added when not already collected.
        if name not in org_names:
            org_names.append(name)

    for token, tag, rank in zip(lac_result[0], lac_result[1], lac_result[2]):
        if tag == 'LOC':
            places.add(token)
        elif tag == 'ORG':
            # LAC organisations are appended as-is, without a duplicate check.
            org_names.append(token)
        elif tag == 'PER':
            people.add(token)
        elif int(rank) >= 2 and tag in ['n', 'nz', 'nw', 'vn'] and token not in ['老板', '法人']:
            product_words.append(token)

    for token, label, _, _ in han_result['ner']:
        if label == 'LOCATION':
            places.add(token)
        elif label == 'ORGANIZATION':
            remember_org(token)
        elif label == 'PERSON':
            people.add(token)

    for token, tag in zip(han_result['tok'], han_result['pos']):
        if tag == 'ns':
            places.add(token)
        elif tag == 'nt':
            remember_org(token)
        elif tag == 'nr':
            people.add(token)

    should_clauses = []
    if len(text) <= 6:
        # Short queries are additionally tried as a company-name prefix.
        should_clauses.append({"match_phrase_prefix": {"companyname": {"query": text, "boost": 2.0}}})
    if org_names:
        org_text = "".join(org_names)
        should_clauses.append({"match": {"companyname": {"query": org_text}}})
        should_clauses.append({"match_phrase_prefix": {"companyname": {"query": org_text, "boost": 2.0}}})
    if people:
        should_clauses.append({"match_phrase_prefix": {"legalpersonname": {"query": "".join(people)}}})
    if product_words:
        product_text = "".join(product_words)
        should_clauses.append({"match": {"companyname": {"query": product_text}}})
        should_clauses.append({"match_phrase_prefix": {"companyname": {"query": product_text, "boost": 3}}})
        should_clauses.append({"match": {"scope": {"query": " ".join(product_words)}}})

    bool_query = {"should": should_clauses, "must": []}

    if places:
        provinces, cities = [], []
        for row in addressparser.transform(places).itertuples():
            province, city = getattr(row, '省'), getattr(row, '市')
            if province != '':
                provinces.append(province)
            if city != '':
                cities.append(city)
        region_filters = []
        if provinces:
            region_filters.append({'terms': {'province': provinces}})
        if cities:
            region_filters.append({'terms': {'city': cities}})
        bool_query['filter'] = region_filters

    # Organisation-type spellings excluded from company results.
    # NOTE(review): applied to every query here; the flattened source does not
    # show whether this was meant to depend on a location being present.
    excluded_org_types = [
        '个体工商户', '个体工商户3', '个体工商户g', '个体工商户(香港)', '个体户',
        '个体工商户(香港)', '个体工户', '非法人商事主体【台湾居民个体户】', '个体商户',
        '非普通个体户', '个体工商', '个体(台、港、澳)', '个体工商户o', '个体工商户(台湾)',
        '澳门居民个体户', '非法人商事主体【香港居民个体户】', '个体(内地)', '个体工商户(澳门)',
        '台湾居民个体户', '个体工', '个体工商户数', '个体工商户(台湾)', '个体',
        '非法人商事主体【普通个体户】', '个体(台、港、澳) ', '非香港居民个体户',
        '个体工商户(临时)', '个体(个人经营)', '香港居民个体户', '临时个体工商户',
        '个体工商户2', '个体户工商', '个体(内地) ',
    ]
    bool_query['must_not'] = [{'terms': {'companyorgtype': excluded_org_types}}]

    dsl = {"query": {"bool": bool_query}}

    ranked_text = "".join(
        token
        for token, _, rank in zip(lac_result[0], lac_result[1], lac_result[2])
        if int(rank) >= 2
    )
    backup_dsl = {"query": {"bool": {"should": [
        {"match": {"companyname": {"query": ranked_text, "boost": 2.0}}},
        {"match": {"scope": {"query": ranked_text}}},
    ]}}}

    dsl['min_score'] = 0.1
    return dsl, backup_dsl
def biddingquery_type(query):
    """Tell whether a query asks about bid-award results.

    Args:
        query: Natural-language query string.

    Returns:
        The string "中标结果" when the query contains one of the award
        phrases matched below, otherwise ``None``.
    """
    # Phrases that indicate the user is asking about awarded bids.
    award_match = re.search(r"(中标|中了|中标情况|中了哪些标)", query)
    return "中标结果" if award_match else None
def gen_bidding_search_dsl(text):
    """Build the primary and fallback Elasticsearch DSLs for a bidding search.

    The query is analysed with the module-level ``lac`` and ``HanLP`` models
    plus JioNLP money/time extraction. ``biddingquery_type`` decides between
    two shapes:

    * award-result queries ("中标结果"): company names must match
      ``win_bid_unit``; amount and date ranges filter on
      ``win_bid_amount_real`` and ``publish_date``; ``bid_type`` is pinned to
      '中标结果';
    * all other queries: company names (with location words stripped) must
      match one of ``agent_unit`` / ``bid_unit`` / ``title``; amount and date
      ranges filter on ``budget_amount_real`` and ``bid_end_date``.

    Both shapes score the raw text against ``title``, add product keywords
    against ``tender_products`` / ``title`` / ``main_body`` and add
    ``province`` / ``city`` / ``area`` filters for resolved locations.

    Args:
        text: Natural-language query string.

    Returns:
        A ``(dsl, backup_dsl)`` tuple of dicts. ``backup_dsl`` is a relaxed
        variant: for award-result queries a keyword match on ``win_bid_unit``
        / ``agent_unit``; otherwise a copy of ``dsl`` with its filters
        dropped. In both cases the location filter is widened by one level
        (area -> city + province, city -> province).
    """
    lac_result = lac.run(text)
    han_result = HanLP(text)
    jio_money_result = jio.ner.extract_money(text, with_parsing=False)
    jio_time_result = jio.ner.extract_time(text, time_base=time.time(), with_parsing=False)
    lac_rows = list(zip(lac_result[0], lac_result[1], lac_result[2]))

    def today_start():
        """Return the start timestamp JioNLP resolves for '今天'."""
        return jio.parse_time('今天', time_base=time.time())['time'][0]

    def parse_money(han_result, jio_result):
        """Return the candidate amounts found in the query as floats."""
        full_text = ''.join(han_result['tok'])
        has_lower_bound = '大于' in full_text or '以上' in full_text
        has_upper_bound = '小于' in full_text or '以下' in full_text

        money_text = set()
        for word, ner, _, _ in han_result['ner']:
            if ner != 'MONEY':
                continue
            # Re-attach the comparison word so the money parser sees an
            # open-ended amount.
            if has_lower_bound:
                money_text.add(word + '以上')
            if has_upper_bound:
                money_text.add(word + '以下')
        for item in jio_result:
            if item['type'] == 'money':
                money_text.add(item['text'])

        money_num = []
        for raw in money_text:
            try:
                parsed = jio.parse_money(raw)
                definition = parsed['definition']
                num = parsed['num']
                # NOTE(review): 'num' is presumably a list for range amounts;
                # float(list) used to raise and the range was silently lost.
                raw_values = num if isinstance(num, (list, tuple)) else [num]
                values = [float(value) for value in raw_values]
            except Exception:
                # Best effort: an amount that cannot be parsed is skipped.
                continue
            if definition in ('accurate', 'blur'):
                money_num.extend(values)
            elif definition == 'blur+':
                # "at least X": pair X with a very large upper bound.
                money_num.extend(values)
                money_num.append(10e8)
            elif definition == 'blur-':
                # "at most X": pair X with a zero lower bound.
                money_num.extend(values)
                money_num.append(0.0)
        return money_num

    def parse_date(jio_result):
        """Return ``(start, end)`` for the first usable time expression."""
        for date in jio_result:
            parsed = jio.parse_time(date['text'], time_base=time.time())
            if parsed['type'] == 'time_span':
                start, end = parsed['time'][0], parsed['time'][1]
                if start == '-inf':
                    start = today_start()
                return start, end
            if parsed['type'] == 'time_point':
                return today_start(), parsed['time'][1]
            # Any other type has no usable bounds. The original fell through
            # to the return with unbound variables; try the next one instead.
        return None, None

    def parse_location(locs):
        """Resolve location words to province / city / area name lists."""
        pudong_names = ['浦东区', '浦东新区']
        pros, citys, areas = [], [], []
        for item in addressparser.transform(locs).itertuples():
            province, city, area = getattr(item, '省'), getattr(item, '市'), getattr(item, '区')
            if province in ['上海市', '北京市', '天津市', '重庆市']:
                # For these four provinces the resolved district is used as
                # the city-level value.
                city = area
                area = ''
            if province != '':
                pros.append(province)
            if city != '':
                if city in pudong_names:
                    # Either Pudong spelling matches both spellings.
                    citys.extend(pudong_names)
                else:
                    citys.append(city)
            if area != '':
                areas.append(area)
        return pros, citys, areas

    def date_range(field):
        """Build a range filter on *field* from the parsed date bounds."""
        bounds = {"gte": time_start}
        # NOTE(review): JioNLP appears to use 'inf' for an open-ended span;
        # omit the upper bound then instead of sending 'inf' to a date field.
        if time_end != 'inf':
            bounds["lte"] = time_end
        return {"range": {field: bounds}}

    def money_range(field):
        """Build a range filter on *field*, or None without a real range."""
        if len(money_num) > 1:
            min_money, max_money = min(money_num), max(money_num)
            if min_money < max_money:
                return {"range": {field: {"gte": min_money, "lte": max_money}}}
        return None

    money_num = parse_money(han_result, jio_money_result)
    time_start, time_end = parse_date(jio_time_result)

    loc = set()
    products = []
    companyname = []
    for word, pos, level in lac_rows:
        if pos == 'LOC':
            loc.add(word)
        elif pos == 'ORG':
            companyname.append(word)
        elif int(level) >= 2 and pos in ['n', 'nz', 'nw'] and word not in ['月份', '标', '项目']:
            products.append(word)
    for word, pos, _, _ in han_result['ner']:
        if pos == 'LOCATION':
            loc.add(word)
        elif pos == 'ORGANIZATION':
            companyname.append(word)
    for word, pos in zip(han_result['tok'], han_result['pos']):
        if pos == 'ns':
            loc.add(word)

    is_result_query = biddingquery_type(text) == '中标结果'
    pros, citys, areas = parse_location(list(loc)) if loc else ([], [], [])

    dsl = {"query": {"bool": {"should": [], "must": [], 'filter': []}}}
    bool_query = dsl['query']['bool']
    bool_query['should'].append({"match": {"title": {"query": text, "boost": 1.2}}})
    bool_query['should'].append({"match_phrase_prefix": {"title": {"query": text, "boost": 1.5}}})

    if is_result_query:
        # Order-preserving dedupe keeps the generated query deterministic.
        names = list(dict.fromkeys(companyname))
        if names:
            spaced_names = " ".join(names)
            bool_query['must'].append({"match": {"win_bid_unit": {"query": spaced_names}}})
            bool_query['should'].append(
                {"match_phrase_prefix": {"win_bid_unit": {"query": spaced_names, 'boost': 3.0}}})
        amount_filter = money_range("win_bid_amount_real")
        if amount_filter is not None:
            bool_query['filter'].append(amount_filter)
        if time_start is not None:
            bool_query['filter'].append(date_range("publish_date"))
        bool_query['filter'].append({"term": {"bid_type": '中标结果'}})
    else:
        # Location words are handled by the region filters, so strip them from
        # the unit names. Names that become empty are dropped: an empty query
        # inside "must" would match nothing.
        stripped = []
        for name in companyname:
            for place in loc:
                name = name.replace(place, '')
            stripped.append(name)
        names = [name for name in dict.fromkeys(stripped) if name]
        if names:
            spaced_names = " ".join(names)
            joined_names = "".join(names)
            bool_query['must'].append({'bool': {"should": [
                {"match": {"agent_unit": {"query": spaced_names}}},
                {"match": {"bid_unit": {"query": spaced_names}}},
                {"match_phrase": {"agent_unit": {"query": joined_names, 'boost': 3}}},
                {"match_phrase": {"bid_unit": {"query": joined_names, 'boost': 3}}},
                {"match": {"title": {"query": joined_names}}},
            ]}})
        amount_filter = money_range("budget_amount_real")
        if amount_filter is not None:
            bool_query['filter'].append(amount_filter)
        if time_start is not None:
            bool_query['filter'].append(date_range("bid_end_date"))

    if pros:
        bool_query['filter'].append({'terms': {'province': pros}})
    if citys:
        bool_query['filter'].append({'terms': {'city': citys}})
    if areas:
        bool_query['filter'].append({'terms': {'area': areas}})

    if products:
        spaced_products = " ".join(products)
        bool_query['should'].append({"match": {"tender_products": {"query": spaced_products, "boost": 1.2}}})
        bool_query['should'].append({"match": {"title": {"query": spaced_products, "boost": 1.1}}})
        bool_query['should'].append({"match": {"main_body": {"query": spaced_products, "boost": 1.0}}})

    search_text = "".join(
        word for word, pos, level in lac_rows
        if int(level) >= 2 and pos in ['LOC', 'n', 'ORG', 'nw', 'nz']
    )
    if is_result_query:
        backup_dsl = {"query": {"bool": {
            "should": [
                {"match": {"win_bid_unit": {"query": search_text, "boost": 1.5}}},
                {"match": {"agent_unit": {"query": search_text, "boost": 1.5}}},
            ],
            'filter': [{"term": {"bid_type": '中标结果'}}],
        }}}
    else:
        # Same scoring as the primary query, with every filter dropped.
        backup_dsl = copy.deepcopy(dsl)
        backup_dsl['query']['bool']['filter'] = []

    # Widen the location filter by one level. Empty lists are skipped because
    # a "terms" filter with no values matches nothing.
    backup_filters = backup_dsl['query']['bool']['filter']
    if areas:
        if citys:
            backup_filters.append({'terms': {'city': citys}})
        if pros:
            backup_filters.append({'terms': {'province': pros}})
    elif citys and pros:
        backup_filters.append({'terms': {'province': pros}})

    dsl['min_score'] = 0.1
    # The primary DSL built above has no "highlight" section, so reading
    # dsl["highlight"] unconditionally raised KeyError on every call. Mirror
    # the section only if one is ever added to the primary DSL.
    if "highlight" in dsl:
        backup_dsl["highlight"] = dsl["highlight"]
    return dsl, backup_dsl
|