from elasticsearch7 import Elasticsearch from collections import Counter from general_functions import save_error, normalize_content from funcs import read_from_json import datetime import os # ################################## # برای محتوای مواد و احکام قانون که از معاونت قوانین مجلس در ایندکس الاستیک ذخیره شده است # qanon_section-v02 # تحلیل روی بعضی فیلدها می کند و تاریخ های آن را استخراج و تبدیل به فرمت خاص تایم استمپ می کند # و در فیدل مناسب در همان ایندکس الاستیک ذخیره میکند # توجه : دسترسی به الاستیک باید باشد # ################################## def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="25m", **kwargs): global counter global total is_first = True while True: # Scroll next if is_first: # Initialize scroll # result = es.search(index=index, scroll="2m", **kwargs, body={ # "size": pagesize # }) result = es.search( index=index, scroll="2m", **kwargs, size=pagesize, body={ "query": { "bool": { "must_not": [ {"exists": {"field": "nlp_parser.type"}}, {"match": {"content_len": 0}}, {"match": {"parse_state": 1}}, {"match": {"parse_state": 2}} ] } } } ) total = result["hits"]["total"]["value"] print("total = %d" % total) is_first = False else: result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout) scroll_id = result["_scroll_id"] hits = result["hits"]["hits"] counter += len(hits) print("progress -> %.2f %%" % ((counter / total) * 100)) # Stop after no more docs if not hits: break # Yield each entry yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits) def es_iterate_some_documents(es, index, records, pagesize=250, scroll_timeout="25m", **kwargs): global counter global total is_first = True query = { "query": { "terms": { "_id": records } } } while True: # Scroll next if is_first: # Initialize scroll # result = es.search(index=index, scroll="2m", **kwargs, body={ # "size": pagesize # }) result = es.search( index=index, scroll="2m", **kwargs, size=pagesize, body= query ) total = result["hits"]["total"]["value"] print("total = %d" % total) is_first = False else: result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout) scroll_id = result["_scroll_id"] hits = result["hits"]["hits"] counter += len(hits) print("progress -> %.2f %%" % ((counter / total) * 100)) # Stop after no more docs if not hits: break # Yield each entry yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits) def prepare_data(ner_obj_list): ner_data_list = [] for ner_obj in ner_obj_list: ner_data = { "key" :ner_obj['ner_key'], "value" :ner_obj['ner_value'], "begin" :ner_obj['ner_start_token'], "end" :ner_obj['ner_end_token'], "score" :ner_obj['ner_score'] } ner_data_list.append(ner_data) return ner_data_list def relation_finder(all_orgs): new_orgs = [] for index, section in enumerate(all_orgs): if index % 1000 == 0: print(f"relation finder progress: {(index/len(all_orgs)) * 100:.2f} %") # if index > 1000: # break related_sections = [] orgs = section['ai_key'] for org in orgs: for compare_item in all_orgs: compare_item_orgs = compare_item['ai_key'] if section['section_id'] == compare_item['section_id']:# جلوگیری از ارتباط یک مقرره با خودش continue if org in compare_item_orgs: # related_sections.append(compare_item['id']) related_sections.append({ 'section_id': compare_item['section_id'], 'qanon_id': compare_item['qanon_id'], 'ai_key': org, 'type': 'ORG', 'weight': 1, }) new_orgs.append({ 'id': section['section_id'], 'qanon_id': section['qanon_id'], 'ai_key': section['orgs'], # 'orgs_text': section['orgs_text'], 'relations': related_sections, }) return new_orgs def extract_weight(relation_list): relation_list_temp = [] for rel_item in relation_list: weight = 0 rel_labels = [] current_section_id = rel_item['section_id'] for item in relation_list: if item['section_id'] == current_section_id: weight += 1 rel_labels.append(item['ai_key']) for rel_item2 in relation_list_temp: if current_section_id == rel_item2['section_id']: break else: relation_list_temp.append({ "section_id": current_section_id, "qanon_id": rel_item['qanon_id'], "ai_key": rel_labels, "type": rel_item['type'], "weight": weight }) break if len(relation_list_temp) == 0: relation_list_temp.append({ "section_id": current_section_id, "qanon_id": rel_item['qanon_id'], "ai_key": rel_labels, "type": rel_item['type'], "weight": weight }) return relation_list_temp print(datetime.datetime.now()) index_name_i = "ai_mj_qa_section-v07" # الاستیک موجود روی جی پی یو # index_name_o = 'mj_qa_test-v01' # is_update_state = False index_name_o = "ai_mj_qa_section-v03" is_update_state = False mapping_o = "" es = Elasticsearch( "http://127.0.0.1:6900", basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm") ) try: if not es.indices.exists(index=index_name_o): response = es.indices.create(index=index_name_o, body=mapping_o) # print out the response: print("create index response:", response) except: print("elastic error") counter = 0 total = 0 id = "" novalid = -15000000000 all_orgs = [] all_orgs_text = '' orgs_list = [] main_9k_sections = read_from_json("/data/selected_sections_9K.json") main_9k_sections_list = [item00["id"] for item00 in main_9k_sections] section_list1 = es_iterate_all_documents(es, index_name_i) count = 0 for mentry1 in section_list1: count += 1 id1 = mentry1["id"] # اگر این سکشن جزء سکشن های اصلی موجود در داده های فایل 9 هزارتایی نبود، آن را بررسی نکن if not id1 in main_9k_sections_list: continue # if count > 10000: # break # print('relation task for section: ' + str(count)) if count % 100 == 0: print(f"relation finder progress: {(count / 273442) * 100:.2f} %") entry1 = mentry1["source"] content1 = entry1.get("content", "") content_ai1 = entry1["content_ai"] content_len1 = entry1.get("content_len", "") qanon_id1 = entry1.get("qanon_id", "") ners1 = entry1['ners_v1'] relations = entry1['relations'] temp = relations before = len(relations) relations = extract_weight(relations) after = len(relations) if before != after: print() print("id: "+ id1 +" merged find! diffrence = "+str(before-after)) print() data1 = { "qanon_id" : qanon_id1, "content_ai": content_ai1, "ners_v1": ners1, "relations": relations } eid = id1 try: if is_update_state: resp = es.update(index=index_name_o, id=eid, doc=data1) else: #write_to_json(data, './data/regulations_ner.json') resp = es.index(index=index_name_o, id=eid, document=data1) #pass except Exception as e: save_error(id, e) print(datetime.datetime.now()) print(" # # # relation update finished! # # # ")