from elasticsearch7 import Elasticsearch
from collections import Counter
from general_functions import save_error, normalize_content
from funcs import save_to_file_by_address, read_file_by_address, write_to_json
import datetime
import os

# ##################################
# For the law articles/sections from the Parliament's Legislation Deputy that are
# stored in Elasticsearch (input index: ai_mj_qa_section-v05):
# reads the ORG entities from the ners_v1 field of each section, finds the other
# sections that mention the same organizations, weights each relation by the
# number of shared organizations, and writes the result into the "relations"
# field of the output index (ai_mj_qa_section-v07).
# Note: access to Elasticsearch is required.
# ##################################


def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="25m", **kwargs):
    """
    Helper to iterate over ALL documents of a single index.
    Yields the documents one by one.
    """
    global counter
    global total
    is_first = True
    while True:
        # Scroll next
        if is_first:
            # Initialize scroll
            # result = es.search(index=index, scroll="2m", **kwargs, body={
            #     "size": pagesize
            # })
            result = es.search(
                index=index,
                scroll="2m",
                **kwargs,
                size=pagesize,
                body={
                    "query": {
                        "bool": {
                            "must_not": [
                                {"exists": {"field": "nlp_parser.type"}},
                                {"match": {"content_len": 0}},
                                {"match": {"parse_state": 1}},
                                {"match": {"parse_state": 2}}
                            ]
                        }
                    }
                }
            )
            total = result["hits"]["total"]["value"]
            print("total = %d" % total)
            is_first = False
        else:
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        print("progress -> %.2f %%" % ((counter / total) * 100))
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)


def es_iterate_some_documents(es, index, records, pagesize=250, scroll_timeout="25m", **kwargs):
    """
    Helper to iterate only over the documents whose _id is listed in `records`.
    Yields the documents one by one.
    """
    global counter
    global total
    is_first = True
    query = {
        "query": {
            "terms": {
                "_id": records
            }
        }
    }
    while True:
        # Scroll next
        if is_first:
            # Initialize scroll
            # result = es.search(index=index, scroll="2m", **kwargs, body={
            #     "size": pagesize
            # })
            result = es.search(
                index=index,
                scroll="2m",
                **kwargs,
                size=pagesize,
                body=query
            )
            total = result["hits"]["total"]["value"]
            print("total = %d" % total)
            is_first = False
        else:
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        print("progress -> %.2f %%" % ((counter / total) * 100))
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)


def prepare_data(ner_obj_list):
    # Convert raw NER objects into the flat structure stored in the index.
    ner_data_list = []
    for ner_obj in ner_obj_list:
        ner_data = {
            "key": ner_obj['ner_key'],
            "value": ner_obj['ner_value'],
            "begin": ner_obj['ner_start_token'],
            "end": ner_obj['ner_end_token'],
            "score": ner_obj['ner_score']
        }
        ner_data_list.append(ner_data)
    return ner_data_list


def remove_stop_signs(ner_value):
    # Strip punctuation and filler characters from both ends of an entity value.
    # Note: lstrip/rstrip treat each stop sign as a set of characters.
    ner_temp = ner_value
    for sign in stop_signs:
        ner_temp = ner_temp.lstrip(sign)
        ner_temp = ner_temp.rstrip(sign)
        ner_temp = ner_temp.strip()
    return ner_temp


def remove_stop_orgs(ner_value):
    # Return False for values that are known non-organizations.
    if ner_value in stop_orgs:
        return False
    return True

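# --- Usage sketch (illustrative only, not executed) ----------------------------
# The scroll helpers above are generators and update the module-level `counter`
# and `total` globals. A minimal way to consume es_iterate_all_documents,
# assuming a reachable cluster and an existing index, would be:
#
#   es = Elasticsearch("http://127.0.0.1:6900", basic_auth=("elastic", "..."))
#   counter = 0
#   for doc in es_iterate_all_documents(es, "ai_mj_qa_section-v05", pagesize=100):
#       print(doc["id"], doc["source"].get("qanon_id", ""))
#
# The main script below does exactly this against index_name_i.
# --------------------------------------------------------------------------------
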
def relation_finder(all_orgs):
    # For every section, collect the other sections that mention at least one of
    # the same organizations. One relation entry is appended per shared ORG value.
    new_orgs = []
    for index, section in enumerate(all_orgs):
        if index % 1000 == 0:
            print(f"relation finder progress: {(index / len(all_orgs)) * 100:.2f} %")
        # if index > 1000:
        #     break
        related_sections = []
        orgs = section['orgs']
        for org in orgs:
            for compare_item in all_orgs:
                compare_item_orgs = compare_item['orgs']
                if section['id'] == compare_item['id']:  # avoid relating a section to itself
                    continue
                if org in compare_item_orgs:
                    # related_sections.append(compare_item['id'])
                    related_sections.append({
                        'section_id': compare_item['id'],
                        'qanon_id': compare_item['qanon_id'],
                        'ai_key': org,
                        'type': 'ORG',
                        'weight': 1,
                    })
        # rel_unique_list_sorted_by_repitation = []
        # determine the relation strength by counting how often each relation repeats
        # rel_counter = Counter(related_sections)
        # rel_unique_list = list(rel_counter.items())
        # rel_unique_list_sorted_by_repitation = sorted(rel_unique_list, key=lambda x: x[1], reverse=True)
        # related_sections = rel_unique_list_sorted_by_repitation
        # related_sections_temp = []
        # for relation in related_sections:
        #     related_sections_temp.append({
        #         'related_section_id': relation[0],
        #         'weight': relation[1]
        #     })
        new_orgs.append({
            'id': section['id'],
            'qanon_id': section['qanon_id'],
            'orgs': section['orgs'],
            # 'orgs_text': section['orgs_text'],
            'related_sections': related_sections,
        })
    return new_orgs


def extract_weight(relation_list):
    # Collapse duplicate relations to the same section into a single entry whose
    # weight is the number of shared organizations; the ORG values are kept in ai_key.
    relation_list_temp = []
    for rel_item in relation_list:
        weight = 0
        rel_labels = []
        current_section_id = rel_item['section_id']
        for item in relation_list:
            if item['section_id'] == current_section_id:
                weight += 1
                rel_labels.append(item['ai_key'])
        for rel_item2 in relation_list_temp:
            if current_section_id == rel_item2['section_id']:
                break
        else:
            relation_list_temp.append({
                "section_id": current_section_id,
                "qanon_id": rel_item['qanon_id'],
                "ai_key": rel_labels,
                "type": rel_item['type'],
                "weight": weight
            })
    return relation_list_temp

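# --- Worked example for relation_finder / extract_weight (not executed) --------
# relation_finder emits one relation entry per shared organization, so the same
# section_id can appear several times; extract_weight collapses those duplicates
# into one entry whose weight is the repetition count. With hypothetical ids:
#
#   rels = [
#       {'section_id': 'S1', 'qanon_id': 'Q1', 'ai_key': 'org_a', 'type': 'ORG', 'weight': 1},
#       {'section_id': 'S1', 'qanon_id': 'Q1', 'ai_key': 'org_b', 'type': 'ORG', 'weight': 1},
#       {'section_id': 'S2', 'qanon_id': 'Q2', 'ai_key': 'org_a', 'type': 'ORG', 'weight': 1},
#   ]
#   extract_weight(rels)
#   # -> [{'section_id': 'S1', 'qanon_id': 'Q1', 'ai_key': ['org_a', 'org_b'], 'type': 'ORG', 'weight': 2},
#   #     {'section_id': 'S2', 'qanon_id': 'Q2', 'ai_key': ['org_a'], 'type': 'ORG', 'weight': 1}]
# --------------------------------------------------------------------------------
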
print(datetime.datetime.now())

index_name_i = "ai_mj_qa_section-v05"  # the Elasticsearch instance running on the GPU server
# index_name_o = 'mj_qa_test-v01'
# is_update_state = False
index_name_o = "ai_mj_qa_section-v07"
is_update_state = False
mapping_o = ""  # no explicit mapping; the output index is created with default settings

es = Elasticsearch(
    "http://127.0.0.1:6900",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
)

try:
    if not es.indices.exists(index=index_name_o):
        response = es.indices.create(index=index_name_o, body=mapping_o)
        # print out the response:
        print("create index response:", response)
except Exception as e:
    print("elastic error:", e)

counter = 0
total = 0
id = ""

try:
    # # records that previously failed are listed in the file below
    # address3 = os.getcwd() + '/Flair_NER/data/ner_reg_list_prev2.txt'
    # section_list_text = read_file_by_address(address3)
    # records = section_list_text.splitlines()
    section_list = es_iterate_all_documents(es, index_name_i)
except Exception as e:
    print(' reading from elastic error! ')
    save_error(0, e)

# punctuation/filler characters stripped from both ends of ORG values
stop_signs = [')', ']', '،', '؛', '»', ')،', '-', '»،', '.', '(', ')؛', '/', '\\',
              '.......................................................',
              '..........................................................',
              ':..............................................................',
              ]

# NER values that are known not to be real organizations
stop_orgs = ['و', '.', '0', 'از', 'است', 'هایایرانی', 'ای', 'یافته و', 'یافته',
             'یا دولتی شهرداری', 'گیری', 'وزارت.', 'وزارت خانه های', 'یافته،',
             'یافته و تروریسم', '', 'خانه', 'چمن', 'ذغال', 'یا شهرداری', 'یا دولتی']

# NER values (mostly Latin pest/pathogen names and place names) that must never
# be treated as organizations
stop_orgs2 = [
    "Acrobasis pyrivorella", "Aleurocanthus spiniferus", "Amauromyza maculosa", "سوسک شاخک بلند آسیایی", "Anoplophora glabripennis",
    "Araecerus fasciculatus", "Asterolecanium phoenicis", "Bactrocera spp", "fusca", "Cacoecimorpha pronubana",
    "Coccotrypes dactyliperda", "Coccus viridis", "Cryptophlebia leucotreta", "Cydia prunivora", "Daphorina citri",
    "مگس گالزای گل داوودی", "Epichoristodes acerbella", "Epochra canadensis", "Euphranta japonica", "Eutetranychus",
    "Eutetranychus carpini", "عنکبوتی", "Eutetranychus hirstii", "Gonipterus scutellanus", "Leptinotarsa desemlineata",
    "Mackiella phoenicls", "Mycetaspis personata", "Oligonychus mangiferus", "Oligonychus pratensis", "Omatissus binotatus",
    "Opogona sacchari", "Oryctes spp", "Pammene rhediella", "Petrobia latens", "Phoenicoccus marlani",
    "Phoracantha semipunctata", "Planococcus ficus", "Platypara poeciloptera", "Platyptilia carduidactyla", "Popillia japonica",
    "Premnotrypes latithorax", "Scaphytopius nitridus", "Scirtothrips", "سوسک پوستخوار", "Scolytus multistriatus",
    "Stenodiplosis sorghicola", "Simeotarsenemus", "Tetranychus", "Toxoptera citricida", "Trioza erytreae",
    "Trypodendron domesticum", "Tumescoptes truchycorp", "Zabrotes subfasciatus", "Ascochyta lentis", "Fusarium",
    "Cephalosporium maydis", "Ceratocystis fimbriata", "Cercospora kikuchii", "Claviceps purpurea", "Cochliobolus heterostrophus",
    "Colletotrichum acutatum", "Colletotrichum lagenarium", "Cryphonectria parasitica", "Cryptodiaporthe populea", "Didymella lycopersici",
    "oxysporum", "Gaeumannomyces graminis", "Glomerella gossypii", "Monilinia fructigena", "Monilinia fructicola",
    "Phaeoisariopsis griseola", "Phaeoramularia angolensis", "Phialophora cinerescens", "Phialophora gregata", "بلایت سیاه سیبزمینی",
    "andina", "Phyllosticta solitaria", "Phymatotrichopsis omnivora", "Plasmodiophora brassicae", "pittieriana",
    "Septoria lycopersici", "Setosphaeria turcica", "جرب پودری سیبزمینی", "Spongospora subterranea", "Stenocarpella macrospora",
    "Stenocarpella maydis", "گال (زگیل ) سیبزمینی", "Synchytrium endobioticum", "سیبزمینی", "Thecaphora solani",
    "سیاهک پاکوتاه گندم", "Urocystis cepulae", "Uromyces transversalis", "potato latent tymovirus", "Andean potato mottle comovirus",
    "Banana bract mosaic potyvirus", "Banana bunchy top nanavirus", "golden mosaic geminivirus", "برگ چغندر", "Beet leaf curl virus",
    "Cherry rasp leaf nepovirus", "Citrus leprosis nucleorhabdovirus", "Citrus tatter leaf capillovirus", "Citrus variegation ilarvirus", "Citrus",
    "Lettuce infectious yellows crinivirus", "Little cherry clostrovirus", "spindle tuber viroid", "dwarf nepovirus", "ویروس چروکیدگی توت فرنگی",
    "Tobacco ringspot nepovirus", "bushy stunt tombusvirus", "ringspot", "Burkholderia caryophylli", "Citrus greening bacterium",
    "michiganensis", "chrysanthemi", "Erwinia tracheiphila", "syringae", "Pseudomonas syringae",
    "Ralstonia solanacearum", "Xanthomonas fragariae", "Xanthomonas vesicatoria", "Xylella fastidiosa", "Lime witches",
    "Palm lethal yellowing phytoplasma", "Peach rosette phytoplasma", "Peach X-disease phytoplasma",
    "Peach yellows phytoplasma", "Pear decline phytoplasma", "Potato stolbur phytoplasma", "Anguina agrostis", "Anguina funesta",
    "Anguina graminis", "Aphelenchoides fragariae", "Globodera rostochiensis", "Heterodera trifolii", "articlla",
    "Meloidogyne fallax", "Pratylenchus coffee", "Pratylenchus fallax", "citrophilus", "Tylenchulus semipenetrans",
    "Aphis gossypii", "Bactrocera oleae", "Brevipalpus phoenicis", "spp", "Ceratitis capitata",
    "Epilachna chrysomelina", "Lasioderma serricorne", "Liriomyza trifolii", "Pectinophora gossypiella", "بید چغندر قند",
    "Phthorimaea ocellatella", "Phyllocoptruta oleivora", "Planococcus citri", "Polyphagotarsonemus latus", "Pseudococcus longispinus",
    "Toxoptera aurantii", "Trogoderma spp", "Viteus vitifoliae", "united states of america", "stone fruit",
    "states of america", "rosette", "petiolaris", "pensilvanicum", "peanut clump furovirus",
    "orientalis Waterhouse", "of america", "mop-top furovirus", "macrospora", "leucoloma Boheman",
    "leucoloma", "leaf roll nepovirus", "greening", "dwarf", "bacterium",
    "Thrips palmi Karny", "Thecaphora solani Barrus", "The Hague", "Sugarcane grassy shoot", "Striga lutea Lour",
    "Striga hermonthica Benth", "Striga euphrasioides", "Sternochetus mangiferae", "Spodoptera litura Fabricius", "Spodoptera frugiperda",
    "Spodoptera eridania Cramer", "Solanum elaeagnifolium Cav", "Solanum", "Sesbania macrocarpa", "Satsuma dwarf nepovirus",
    "Salvia lancifolia", "Raspberry ringspot nepovirus", "Raspberry leaf curl luteovirus", "Prostephanus truncatus Horn", "Potato yellow dwarf rhabdovirus",
    "Potato black ringspot", "Popillia", "Polygonum pensilvanicum", "Plum pox potyvirus", "Pissodes castaneus Degeer",
    "Phytophthora megasperma", "Phytophthora cinnamomi Rands", "Phymatotrichopsis omnivora Hennebert", "Phylloxera vastatrix Planch", "Peronosclerospora sorghi",
    "Peronosclerospora philippinensis", "Peronosclerospora maydis C", "Perkinsiella saccharicida Kirkaldy", "Pentalonia nigronervosa Coquerel", "Pectinophora scutigera Holdaway",
    "Peanut stripe potyvirus", "Pea early browning tobravirus", "Parasaissetia nigra Nietner", "Parabemisia myricae Kuwana", "Palm",
    "Otiorhynchus sulcatus Fabricius", "New York", "Mycosphaerella dearnessii Barr", "Meloidogyne fallax Karssen", "Massee",
    "Los Angeles", "Lime withes broom", "Jacquemontia tamnifolia Griseb", "Iridomyrmex", "Ipomoea hederacea",
    "High", "Helianthus petiolaris", "Helianthus lenticularis", "Helianthus ciliaris", "Haplothrips chinensis",
    "Gymnosporangium globosum", "Glomerella gossypii Edgerton", "Globodera tabacum", "Globodera pallida Behrens", "Gilpinia hercyniae Hartig",
    "Flower", "Euphorbia marginata Pursh", "Euphorbia", "Epitrix tuberis Gentner", "Emex australis",
    "Dysmicoccus brevipes", "Diatraea saccharalis", "Cuscuta europaea", "Cuscuta epithymum Murr", "Cuscuta epilinum Weiche",
    "Cryphonectria parasitica Barr", "Cotton leaf curl geminivirus", "Corporation", "Coccus viridis Green", "Citrus leaf rugose ilarvirus",
    "Ceratocystis fagacearum", "Ceratitis capitata Wiedemann", "Carposina niponensis", "Bursaphelenchus mucronatus", "Broad bean mottle bromovirus",
    "Bactrocera tryoni Froggatt", "Bactrocera oleae Gmelin", "Bactrocera dorsalis Hendel", "Bactrocera cucurbitae Coquillett", "Aspidiella",
    "Apple stem pitting virus", "Apple chlorotic leafspot trichovirus", "Anoplophora glabripennis)Motschulsky", "Anastrepha obliqua Macquart", "Amaranthus blitoides S.Wats",
]

novalid = -15000000000
all_orgs = []
all_orgs_text = ''
orgs_list = []

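# --- Filtering sketch (not executed; the sample value is made up) --------------
# The loop below keeps only ORG entities that survive the stop lists and that
# mention a ministry ("وزارت") or an organization ("سازمان"). Roughly:
#
#   value = '(وزارت نفت)'                       # hypothetical NER value
#   cleaned = remove_stop_signs(value.strip())   # strips the surrounding parentheses
#   keep = (cleaned not in stop_orgs and cleaned not in stop_orgs2
#           and not cleaned.startswith('ها')
#           and ('وزارت' in cleaned or 'سازمان' in cleaned))
#   if keep:
#       cleaned = normalize_content(cleaned)
# --------------------------------------------------------------------------------
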
for index, mentry in enumerate(section_list):
    try:
        # if index > 20000:
        #     break
        id = mentry["id"]
        # if not id in records:
        #     print(id + ' exists')
        #     continue
        entry = mentry["source"]
        content = entry.get("content", "")
        content_ai = entry["content_ai"]
        content_len = entry.get("content_len", "")
        qanon_id = entry.get("qanon_id", "")
        ners = entry['ners_v1']
        current_orgs = []
        for ner_item in ners:
            if ner_item['key'] == 'ORG':
                ner_temp = remove_stop_signs(ner_item['value'].strip())
                if ner_temp in stop_orgs:
                    continue
                if ner_temp in stop_orgs2:
                    continue
                if ner_temp.startswith('ها') or ner_temp.startswith('های'):
                    continue
                if ('سازمان های' in ner_temp or 'سازمان ها' in ner_temp
                        or 'وزارت خانها' in ner_temp or 'وزارت خانهها' in ner_temp
                        or 'وزارت خانههای' in ner_temp or 'وزارت خانه' in ner_temp
                        or 'وزارت خانه های' in ner_temp or 'وزارتین ' in ner_temp):
                    continue
                if not ('وزارت' in ner_temp or 'سازمان' in ner_temp):
                    continue
                ner_temp = normalize_content(ner_temp)
                current_orgs.append(ner_temp)
        if current_orgs:
            unique_orgs_list = []
            # remove duplicate values (order-preserving)
            [unique_orgs_list.append(x) for x in current_orgs if x not in unique_orgs_list]
            current_orgs = unique_orgs_list
            # current_org_list_text = ''
            # for c_org in current_orgs:
            #     current_org_list_text += c_org + '\n'
            all_orgs.append({
                "id": id,
                "qanon_id": qanon_id,
                "orgs": current_orgs,
                # "orgs_text": current_org_list_text,
            })
            # separator = "*"*100 + '\n'
            # all_orgs_text += f"id: {id}\norgs: {current_orgs}\n\norgs_text: \n{current_org_list_text}\n\ncontentai: {content_ai}\n{separator}"
            # for org in current_orgs:
            #     orgs_list.append(org)
        # all_orgs_relations = relation_finder(all_orgs)
        # print(len(all_orgs_relations))
        # # for i, section in enumerate(all_orgs_relations):
        # #     print('rel_task: ' + str(i))
        # data = {
        #     "qanon_id": qanon_id,
        #     "content_ai": content_ai,
        #     "ners_v1": ners,
        #     "relations": all_orgs_relations
        # }
        # eid = id
        # try:
        #     if is_update_state:
        #         resp = es.update(index=index_name_o, id=eid, doc=data)
        #     else:
        #         # write_to_json(data, './data/regulations_ner.json')
        #         resp = es.index(index=index_name_o, id=eid, document=data)
        # except Exception as e:
        #     save_error(id, e)
    except Exception as error:
        print(error)
    # print('relation task --------------> ' + str(count))
    # if count > 1000:
    #     break
    # if content_len == 0:
    #     continue

all_orgs_relations = relation_finder(all_orgs)
print('all orgs count: ' + str(len(all_orgs_relations)))
# section_list_temp = [c for i, c in enumerate(section_list)]
# count the number of records
# len_section_list_temp = len(section_list_temp)

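# --- Optional: restricting the write-back pass (sketch, not executed) ----------
# The second pass below re-scrolls the whole input index. If only a subset of
# sections needs refreshing, the currently unused es_iterate_some_documents
# helper accepts an explicit list of _id values, e.g. read from the same file
# used in the commented-out block near the top of the script:
#
#   records = read_file_by_address(os.getcwd() + '/Flair_NER/data/ner_reg_list_prev2.txt').splitlines()
#   section_list1 = es_iterate_some_documents(es, index_name_i, records)
# --------------------------------------------------------------------------------
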
count = 0
section_list1 = es_iterate_all_documents(es, index_name_i)
for mentry1 in section_list1:
    count += 1
    # if count > 10000:
    #     break
    # print('relation task for section: ' + str(count))
    if count % 100 == 0:
        # 273442 is a hard-coded approximate document count, used only for the progress display
        print(f"relation finder progress: {(count / 273442) * 100:.2f} %")
    id1 = mentry1["id"]
    entry1 = mentry1["source"]
    content1 = entry1.get("content", "")
    content_ai1 = entry1["content_ai"]
    content_len1 = entry1.get("content_len", "")
    qanon_id1 = entry1.get("qanon_id", "")
    ners1 = entry1['ners_v1']
    current_relation_list = []
    # iid = 'mj_qa_section_230085'
    # if id1 != iid:
    #     continue
    # if id1 == iid:
    #     x = 10
    current_relations = []
    for i, section in enumerate(all_orgs_relations):
        if section['id'] == id1:
            current_relations = section['related_sections']
    for rel_item in current_relations:
        current_relation_list.append(rel_item)
    current_relation_list = extract_weight(current_relation_list)
    data1 = {
        "qanon_id": qanon_id1,
        "content_ai": content_ai1,
        "ners_v1": ners1,
        "relations": current_relation_list
    }
    eid = id1
    try:
        if is_update_state:
            resp = es.update(index=index_name_o, id=eid, doc=data1)
        else:
            # write_to_json(data, './data/regulations_ner.json')
            resp = es.index(index=index_name_o, id=eid, document=data1)
            # pass
    except Exception as e:
        save_error(id1, e)

print(datetime.datetime.now())
print(" # # # relation extraction finished! # # # ")

# use Counter to count how many times each entity occurs
'''
counter = Counter(orgs_list)
unique_dict = dict(counter)
unique_list = list(unique_dict.items())
unique_list_sorted_by_key = sorted(unique_list, key=lambda x: x[1], reverse=True)
orgs_list = unique_list_sorted_by_key

org_list_text = ''
for org in orgs_list:
    org_list_text += str(org[1]) + ' *** ' + org[0] + '\n'

orgs_address = os.getcwd() + '/Flair_NER/data/all_orgs.txt'
save_to_file_by_address(orgs_address, org_list_text)

orgs_text_address = os.getcwd() + '/Flair_NER/data/all_orgs_text.txt'
save_to_file_by_address(orgs_text_address, all_orgs_text)

save_error(id, e)
'''
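
# --- Post-run sanity check (sketch, not executed) -------------------------------
# A single output document can be fetched to confirm that the "relations" field
# was written; the id below is only an example taken from the commented-out
# debugging code above:
#
#   doc = es.get(index=index_name_o, id='mj_qa_section_230085')
#   print(len(doc['_source'].get('relations', [])))
# --------------------------------------------------------------------------------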