from elasticsearch7 import Elasticsearch
from collections import Counter
from general_functions import save_error, normalize_content
from funcs import save_to_file_by_address, read_file_by_address, write_to_json
import datetime
import os

# ##################################
# Reads the content of law articles/sections (from the Parliament's Office of
# Laws) stored in the Elasticsearch index qanon_section-v02, extracts ORG
# entities from the NER fields, finds relations between sections that share
# organizations, and stores the result in the appropriate field of the target
# Elasticsearch index.
# Note: access to Elasticsearch is required.
# ##################################


def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="25m", **kwargs):
    """
    Helper to iterate ALL values from a single index.
    Yields all the documents.
    """
    global counter
    global total
    is_first = True
    while True:
        # Scroll next
        if is_first:  # Initialize scroll
            # result = es.search(index=index, scroll="2m", **kwargs, body={
            #     "size": pagesize
            # })
            result = es.search(
                index=index,
                scroll="2m",
                **kwargs,
                size=pagesize,
                body={
                    "query": {
                        "bool": {
                            "must_not": [
                                {"exists": {"field": "nlp_parser.type"}},
                                {"match": {"content_len": 0}},
                                {"match": {"parse_state": 1}},
                                {"match": {"parse_state": 2}}
                            ]
                        }
                    }
                }
            )
            total = result["hits"]["total"]["value"]
            print("total = %d" % total)
            is_first = False
        else:
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        if total:  # guard against division by zero when the query matches nothing
            print("progress -> %.2f %%" % ((counter / total) * 100))
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
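
# Note on the scroll pattern above: each es.search(..., scroll=...) opens a
# server-side scroll context that is kept alive for the given timeout. When a
# scan finishes early, releasing the context explicitly is good practice,
# e.g. (a sketch, not called anywhere in this script):
#     es.clear_scroll(scroll_id=scroll_id)
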
def es_iterate_some_documents(es, index, records, pagesize=250, scroll_timeout="25m", **kwargs):
    """
    Helper to iterate over the documents of a single index whose _id is in
    `records`. Yields all the documents.
    """
    global counter
    global total
    is_first = True
    query = {
        "query": {
            "terms": {
                "_id": records
            }
        }
    }
    while True:
        # Scroll next
        if is_first:  # Initialize scroll
            # result = es.search(index=index, scroll="2m", **kwargs, body={
            #     "size": pagesize
            # })
            result = es.search(
                index=index,
                scroll="2m",
                **kwargs,
                size=pagesize,
                body=query
            )
            total = result["hits"]["total"]["value"]
            print("total = %d" % total)
            is_first = False
        else:
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        if total:  # guard against division by zero when no id matches
            print("progress -> %.2f %%" % ((counter / total) * 100))
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
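
# Note: the terms query on _id above is subject to the index.max_terms_count
# setting (65536 by default), so a very large `records` list would need to be
# chunked before calling this helper.
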
def prepare_data(ner_obj_list):
    ner_data_list = []
    for ner_obj in ner_obj_list:
        ner_data = {
            "key": ner_obj['ner_key'],
            "value": ner_obj['ner_value'],
            "begin": ner_obj['ner_start_token'],
            "end": ner_obj['ner_end_token'],
            "score": ner_obj['ner_score']
        }
        ner_data_list.append(ner_data)
    return ner_data_list
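
# prepare_data maps raw NER records (ner_key/ner_value/...) onto the compact
# key/value/begin/end/score schema used downstream; it is defined here but not
# called anywhere in this script.
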
def remove_stop_signs(ner_value):
    ner_temp = ner_value
    for sign in stop_signs:
        ner_temp = ner_temp.lstrip(sign)
        ner_temp = ner_temp.rstrip(sign)
    ner_temp = ner_temp.strip()
    return ner_temp
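
# Caveat: str.lstrip()/str.rstrip() treat their argument as a *set of
# characters*, not as a prefix/suffix, so a multi-character sign such as ')،'
# strips any run of ')' and '،' from the ends. That is harmless for this
# cleanup, but str.removeprefix()/str.removesuffix() (Python 3.9+) would be
# the exact-match alternative.
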
def remove_stop_orgs(ner_value):
    # True if the value is a real organization (i.e. not in the stop list)
    return ner_value not in stop_orgs

def relation_finder(all_orgs):
    new_orgs = []

    for index, section in enumerate(all_orgs):
        if index % 1000 == 0:
            print(f"relation finder progress: {(index/len(all_orgs)) * 100:.2f} %")

        # if index > 1000:
        #     break
        related_sections = []
        orgs = section['orgs']

        for org in orgs:
            for compare_item in all_orgs:
                compare_item_orgs = compare_item['orgs']
                if section['id'] == compare_item['id']:  # do not relate a section to itself
                    continue
                if org in compare_item_orgs:
                    # related_sections.append(compare_item['id'])
                    related_sections.append({
                        'section_id': compare_item['id'],
                        'qanon_id': compare_item['qanon_id'],
                        'ai_key': org,
                        'type': 'ORG',
                        'weight': 1,
                    })

        # rel_unique_list_sorted_by_repitation = []
        # Find the relation strength by counting how often each relation repeats
        # rel_counter = Counter(related_sections)

        # rel_unique_list = list(rel_counter.items())
        # rel_unique_list_sorted_by_repitation = sorted(rel_unique_list, key=lambda x: x[1], reverse=True)
        # related_sections = rel_unique_list_sorted_by_repitation
        # related_sections_temp = []
        # for relation in related_sections:
        #     related_sections_temp.append({
        #         'related_section_id': relation[0],
        #         'weight': relation[1]
        #     })
        new_orgs.append({
            'id': section['id'],
            'qanon_id': section['qanon_id'],
            'orgs': section['orgs'],

            # 'orgs_text': section['orgs_text'],
            'related_sections': related_sections,
        })
    return new_orgs
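
# relation_finder above is O(n^2 * k): every org of every section is matched
# against every other section. A minimal sketch of an inverted index that
# would make the lookup roughly linear (an assumption, not used here):
#     from collections import defaultdict
#     org_index = defaultdict(list)
#     for sec in all_orgs:
#         for org in sec['orgs']:
#             org_index[org].append(sec['id'])
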

def extract_weight(relation_list):
    relation_list_temp = []
    for rel_item in relation_list:
        weight = 0
        rel_labels = []
        current_section_id = rel_item['section_id']
        for item in relation_list:
            if item['section_id'] == current_section_id:
                weight += 1
                rel_labels.append(item['ai_key'])
        for rel_item2 in relation_list_temp:
            if current_section_id == rel_item2['section_id']:
                break
        else:
            relation_list_temp.append({
                "section_id": current_section_id,
                "qanon_id": rel_item['qanon_id'],
                "ai_key": rel_labels,
                "type": rel_item['type'],
                "weight": weight
            })
    return relation_list_temp
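
# The double pass above is O(n^2); collections.Counter over the section_ids
# would yield the same weights in one pass, e.g. (a sketch):
#     weights = Counter(rel['section_id'] for rel in relation_list)
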

print(datetime.datetime.now())
index_name_i = "ai_mj_qa_section-v05"  # the Elasticsearch instance on the GPU server
# index_name_o = 'mj_qa_test-v01'
# is_update_state = False
index_name_o = "ai_mj_qa_section-v07"
is_update_state = False

mapping_o = ""

es = Elasticsearch(
    "http://127.0.0.1:6900",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
)
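
# Note: hard-coding credentials in source is risky; reading them from the
# environment, e.g. os.environ.get("ES_PASSWORD"), would be safer here.
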

try:
    if not es.indices.exists(index=index_name_o):
        response = es.indices.create(index=index_name_o, body=mapping_o)
        # print out the response:
        print("create index response:", response)
except Exception as e:
    print("elastic error:", e)

counter = 0  # updated globally by the iterator helpers above
total = 0    # idem
id = ""      # (note: shadows the id() builtin)


try:

    # # Records that previously failed are stored at the address below
    # address3 = os.getcwd() + '/Flair_NER/data/ner_reg_list_prev2.txt'
    # section_list_text = read_file_by_address(address3)
    # records = section_list_text.splitlines()

    section_list = es_iterate_all_documents(es, index_name_i)

except Exception as e:
    print(' reading from elastic error! ')
    save_error(0, e)

stop_signs = [')', ']', '،', '؛', '»', ')،', '-', '»،', '.', '(', ')؛', '/', '\\',
              '.......................................................',
              '..........................................................',
              ':..............................................................',
              ]
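
# (Python concatenates adjacent string literals, so a missing comma inside a
#  list like the one above silently merges two entries into one.)
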
stop_orgs = ['و', '.', '0', 'از', 'است', 'هایایرانی', 'ای', 'یافته و', 'یافته',
             'یا دولتی شهرداری', 'گیری', 'وزارت.', 'وزارت خانه های', 'یافته،',
             'یافته و تروریسم', '', 'خانه', 'چمن', 'ذغال', 'یا شهرداری', 'یا دولتی']

stop_orgs2 = [
    "Acrobasis pyrivorella", "Aleurocanthus spiniferus", "Amauromyza maculosa",
    "سوسک شاخک بلند آسیایی", "Anoplophora glabripennis", "Araecerus fasciculatus",
    "Asterolecanium phoenicis", "Bactrocera spp", "fusca",
    "Cacoecimorpha pronubana", "Coccotrypes dactyliperda", "Coccus viridis",
    "Cryptophlebia leucotreta", "Cydia prunivora", "Daphorina citri",
    "مگس گالزای گل داوودی", "Epichoristodes acerbella", "Epochra canadensis",
    "Euphranta japonica", "Eutetranychus", "Eutetranychus carpini",
    "عنکبوتی", "Eutetranychus hirstii", "Gonipterus scutellanus",
    "Leptinotarsa desemlineata", "Mackiella phoenicls", "Mycetaspis personata",
    "Oligonychus mangiferus", "Oligonychus pratensis", "Omatissus binotatus",
    "Opogona sacchari", "Oryctes spp", "Pammene rhediella",
    "Petrobia latens", "Phoenicoccus marlani", "Phoracantha semipunctata",
    "Planococcus ficus", "Platypara poeciloptera", "Platyptilia carduidactyla",
    "Popillia japonica", "Premnotrypes latithorax", "Scaphytopius nitridus",
    "Scirtothrips", "سوسک پوستخوار", "Scolytus multistriatus",
    "Stenodiplosis sorghicola", "Simeotarsenemus", "Tetranychus",
    "Toxoptera citricida", "Trioza erytreae", "Trypodendron domesticum",
    "Tumescoptes truchycorp", "Zabrotes subfasciatus", "Ascochyta lentis",
    "Fusarium", "Cephalosporium maydis", "Ceratocystis fimbriata",
    "Cercospora kikuchii", "Claviceps purpurea", "Cochliobolus heterostrophus",
    "Colletotrichum acutatum", "Colletotrichum lagenarium", "Cryphonectria parasitica",
    "Cryptodiaporthe populea", "Didymella lycopersici", "oxysporum",
    "Gaeumannomyces graminis", "Glomerella gossypii", "Monilinia fructigena",
    "Monilinia fructicola", "Phaeoisariopsis griseola", "Phaeoramularia angolensis",
    "Phialophora cinerescens", "Phialophora gregata", "بلایت سیاه سیبزمینی",
    "andina", "Phyllosticta solitaria", "Phymatotrichopsis omnivora",
    "Plasmodiophora brassicae", "pittieriana", "Septoria lycopersici",
    "Setosphaeria turcica", "جرب پودری سیبزمینی", "Spongospora subterranea",
    "Stenocarpella macrospora", "Stenocarpella maydis", "گال (زگیل ) سیبزمینی",
    "Synchytrium endobioticum", "سیبزمینی", "Thecaphora solani",
    "سیاهک پاکوتاه گندم", "Urocystis cepulae", "Uromyces transversalis",
    "potato latent tymovirus", "Andean potato mottle comovirus",
    "Banana bract mosaic potyvirus", "Banana bunchy top nanavirus",
    "golden mosaic geminivirus", "برگ چغندر", "Beet leaf curl virus",
    "Cherry rasp leaf nepovirus", "Citrus leprosis nucleorhabdovirus",
    "Citrus tatter leaf capillovirus", "Citrus variegation ilarvirus", "Citrus",
    "Lettuce infectious yellows crinivirus", "Little cherry clostrovirus",
    "spindle tuber viroid", "dwarf nepovirus", "ویروس چروکیدگی توت فرنگی",
    "Tobacco ringspot nepovirus", "bushy stunt tombusvirus", "ringspot",
    "Burkholderia caryophylli", "Citrus greening bacterium", "michiganensis",
    "chrysanthemi", "Erwinia tracheiphila", "syringae", "Pseudomonas syringae",
    "Ralstonia solanacearum", "Xanthomonas fragariae", "Xanthomonas vesicatoria",
    "Xylella fastidiosa", "Lime witches", "Palm lethal yellowing phytoplasma",
    "Peach rosette phytoplasma", "Peach X-disease phytoplasma",
    "Peach yellows phytoplasma", "Pear decline phytoplasma",
    "Potato stolbur phytoplasma", "Anguina agrostis", "Anguina funesta",
    "Anguina graminis", "Aphelenchoides fragariae", "Globodera rostochiensis",
    "Heterodera trifolii", "articlla", "Meloidogyne fallax",
    "Pratylenchus coffee", "Pratylenchus fallax", "citrophilus",
    "Tylenchulus semipenetrans", "Aphis gossypii", "Bactrocera oleae",
    "Brevipalpus phoenicis", "spp", "Ceratitis capitata",
    "Epilachna chrysomelina", "Lasioderma serricorne", "Liriomyza trifolii",
    "Pectinophora gossypiella", "بید چغندر قند", "Phthorimaea ocellatella",
    "Phyllocoptruta oleivora", "Planococcus citri", "Polyphagotarsonemus latus",
    "Pseudococcus longispinus", "Toxoptera aurantii", "Trogoderma spp",
    "Viteus vitifoliae", "united states of america", "stone fruit",
    "states of america", "rosette", "petiolaris", "pensilvanicum",
    "peanut clump furovirus", "orientalis Waterhouse", "of america",
    "mop-top furovirus", "macrospora", "leucoloma Boheman", "leucoloma",
    "leaf roll nepovirus", "greening", "dwarf", "bacterium",
    "Thrips palmi Karny", "Thecaphora solani Barrus", "The Hague",
    "Sugarcane grassy shoot", "Striga lutea Lour", "Striga hermonthica Benth",
    "Striga euphrasioides", "Sternochetus mangiferae",
    "Spodoptera litura Fabricius", "Spodoptera frugiperda",
    "Spodoptera eridania Cramer", "Solanum elaeagnifolium Cav", "Solanum",
    "Sesbania macrocarpa", "Satsuma dwarf nepovirus", "Salvia lancifolia",
    "Raspberry ringspot nepovirus", "Raspberry leaf curl luteovirus",
    "Prostephanus truncatus Horn", "Potato yellow dwarf rhabdovirus",
    "Potato black ringspot", "Popillia", "Polygonum pensilvanicum",
    "Plum pox potyvirus", "Pissodes castaneus Degeer",
    "Phytophthora megasperma", "Phytophthora cinnamomi Rands",
    "Phymatotrichopsis omnivora Hennebert", "Phylloxera vastatrix Planch",
    "Peronosclerospora sorghi", "Peronosclerospora philippinensis",
    "Peronosclerospora maydis C", "Perkinsiella saccharicida Kirkaldy",
    "Pentalonia nigronervosa Coquerel", "Pectinophora scutigera Holdaway",
    "Peanut stripe potyvirus", "Pea early browning tobravirus",
    "Parasaissetia nigra Nietner", "Parabemisia myricae Kuwana", "Palm",
    "Otiorhynchus sulcatus Fabricius", "New York",
    "Mycosphaerella dearnessii Barr", "Meloidogyne fallax Karssen", "Massee",
    "Los Angeles", "Lime withes broom", "Jacquemontia tamnifolia Griseb",
    "Iridomyrmex", "Ipomoea hederacea", "High", "Helianthus petiolaris",
    "Helianthus lenticularis", "Helianthus ciliaris", "Haplothrips chinensis",
    "Gymnosporangium globosum", "Glomerella gossypii Edgerton",
    "Globodera tabacum", "Globodera pallida Behrens",
    "Gilpinia hercyniae Hartig", "Flower", "Euphorbia marginata Pursh",
    "Euphorbia", "Epitrix tuberis Gentner", "Emex australis",
    "Dysmicoccus brevipes", "Diatraea saccharalis", "Cuscuta europaea",
    "Cuscuta epithymum Murr", "Cuscuta epilinum Weiche",
    "Cryphonectria parasitica Barr", "Cotton leaf curl geminivirus",
    "Corporation", "Coccus viridis Green", "Citrus leaf rugose ilarvirus",
    "Ceratocystis fagacearum", "Ceratitis capitata Wiedemann",
    "Carposina niponensis", "Bursaphelenchus mucronatus",
    "Broad bean mottle bromovirus", "Bactrocera tryoni Froggatt",
    "Bactrocera oleae Gmelin", "Bactrocera dorsalis Hendel",
    "Bactrocera cucurbitae Coquillett", "Aspidiella",
    "Apple stem pitting virus", "Apple chlorotic leafspot trichovirus",
    "Anoplophora glabripennis)Motschulsky", "Anastrepha obliqua Macquart",
    "Amaranthus blitoides S.Wats",
]
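
# stop_orgs2 is scanned with `in` for every ORG candidate below; converting it
# once to a set (stop_orgs2 = set(stop_orgs2)) would make those membership
# tests O(1) instead of O(n). It is left as a list to keep the original shape.
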
novalid = -15000000000
all_orgs = []
all_orgs_text = ''
orgs_list = []

for index, mentry in enumerate(section_list):
    try:
        # if index > 20000:
        #     break
        id = mentry["id"]
        # if not id in records:
        #     print(id + ' exists')
        #     continue
        entry = mentry["source"]
        content = entry.get("content", "")
        content_ai = entry["content_ai"]
        content_len = entry.get("content_len", "")
        qanon_id = entry.get("qanon_id", "")
        ners = entry['ners_v1']
        current_orgs = []

        for ner_item in ners:
            if ner_item['key'] == 'ORG':
                ner_temp = remove_stop_signs(ner_item['value'].strip())
                if ner_temp in stop_orgs:
                    continue
                if ner_temp in stop_orgs2:
                    continue
                if ner_temp.startswith('ها') or ner_temp.startswith('های'):
                    continue
                if ('سازمان های' in ner_temp
                        or 'سازمان ها' in ner_temp
                        or 'وزارت خانها' in ner_temp
                        or 'وزارت خانهها' in ner_temp
                        or 'وزارت خانههای' in ner_temp
                        or 'وزارت خانه' in ner_temp
                        or 'وزارت خانه های' in ner_temp
                        or 'وزارتین ' in ner_temp
                        ):
                    continue
                if not ('وزارت' in ner_temp or 'سازمان' in ner_temp):
                    continue
                ner_temp = normalize_content(ner_temp)
                current_orgs.append(ner_temp)

        if current_orgs:
            unique_orgs_list = []
            # remove duplicate values while preserving order
            [unique_orgs_list.append(x) for x in current_orgs if x not in unique_orgs_list]
            current_orgs = unique_orgs_list
            # current_org_list_text = ''
            # for c_org in current_orgs:
            #     current_org_list_text += c_org + '\n'
            all_orgs.append({
                "id": id,
                "qanon_id": qanon_id,
                "orgs": current_orgs,
                # "orgs_text": current_org_list_text,
            })
            # separator = "*"*100 + '\n'
            # all_orgs_text += f"id: {id}\norgs: {current_orgs}\n\norgs_text: \n{current_org_list_text}\n\ncontentai: {content_ai}\n{separator}"
            # for org in current_orgs:
            #     orgs_list.append(org)

        # all_orgs_relations = relation_finder(all_orgs)
        # print(len(all_orgs_relations))
        # # for i, section in enumerate(all_orgs_relations):
        # #     print('rel_task: ' + str(i))
        # data = {
        #     "qanon_id": qanon_id,
        #     "content_ai": content_ai,
        #     "ners_v1": ners,
        #     "relations": all_orgs_relations
        # }

        # eid = id

        # try:
        #     if is_update_state:
        #         resp = es.update(index=index_name_o, id=eid, doc=data)
        #     else:
        #         # write_to_json(data, './data/regulations_ner.json')
        #         resp = es.index(index=index_name_o, id=eid, document=data)

        # except Exception as e:
        #     save_error(id, e)
    except Exception as error:
        print(error)
    # print('relation task --------------> ' + str(count))
    # if count > 1000:
    #     break

# if content_len == 0:
#     continue
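
# (In the loop above, list(dict.fromkeys(current_orgs)) would be the idiomatic
#  order-preserving dedupe instead of the append-inside-comprehension pattern.)
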

all_orgs_relations = relation_finder(all_orgs)
print('all orgs count: ' + str(len(all_orgs_relations)))

# section_list_temp = [c for i, c in enumerate(section_list)]
# count the number of records
# len_section_list_temp = len(section_list_temp)
count = 0
section_list1 = es_iterate_all_documents(es, index_name_i)
for mentry1 in section_list1:
    count += 1
    # if count > 10000:
    #     break
    # print('relation task for section: ' + str(count))
    if count % 100 == 0:
        # 273442: hard-coded record count of the source index
        print(f"relation finder progress: {(count / 273442) * 100:.2f} %")
    id1 = mentry1["id"]

    entry1 = mentry1["source"]
    content1 = entry1.get("content", "")
    content_ai1 = entry1["content_ai"]
    content_len1 = entry1.get("content_len", "")
    qanon_id1 = entry1.get("qanon_id", "")
    ners1 = entry1['ners_v1']
    current_relation_list = []
    # iid = 'mj_qa_section_230085'

    # if id1 != iid:
    #     continue
    # if id1 == iid:
    #     x = 10
    current_relations = []
    for i, section in enumerate(all_orgs_relations):
        if section['id'] == id1:
            current_relations = section['related_sections']
            for rel_item in current_relations:
                current_relation_list.append(rel_item)
    current_relation_list = extract_weight(current_relation_list)
    data1 = {
        "qanon_id": qanon_id1,
        "content_ai": content_ai1,
        "ners_v1": ners1,
        "relations": current_relation_list
    }

    eid = id1

    try:
        if is_update_state:
            resp = es.update(index=index_name_o, id=eid, doc=data1)
        else:
            # write_to_json(data, './data/regulations_ner.json')
            resp = es.index(index=index_name_o, id=eid, document=data1)
            # pass

    except Exception as e:
        save_error(id1, e)

print(datetime.datetime.now())
print(" # # # relation extraction finished! # # # ")
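
# (A sketch of a less brittle progress line for the loop above, assuming the
#  global `total` set by es_iterate_all_documents is the intended denominator
#  instead of the magic 273442:
#      print(f"relation finder progress: {(count / total) * 100:.2f} %")  )
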

# Using Counter to count how many times each entity occurs
'''
counter = Counter(orgs_list)
unique_dict = dict(counter)
unique_list = list(unique_dict.items())
unique_list_sorted_by_key = sorted(unique_list, key=lambda x: x[1], reverse=True)
orgs_list = unique_list_sorted_by_key
org_list_text = ''
for org in orgs_list:
    org_list_text += str(org[1]) + ' *** ' + org[0] + '\n'
orgs_address = os.getcwd() + '/Flair_NER/data/all_orgs.txt'
save_to_file_by_address(orgs_address, org_list_text)
orgs_text_address = os.getcwd() + '/Flair_NER/data/all_orgs_text.txt'
save_to_file_by_address(orgs_text_address, all_orgs_text)

save_error(id, e)
'''