# Flair_NER/relation.py

from elasticsearch7 import Elasticsearch
from collections import Counter
from general_functions import save_error, normalize_content
from funcs import save_to_file_by_address, read_file_by_address, write_to_json
import datetime
import os
# ##################################
# Works on the content of law articles and provisions from the Parliament's
# Office of Laws that is stored in the Elasticsearch index qanon_section-v02.
# It analyzes some of the fields, extracts their dates, converts them to a
# specific timestamp format, and stores the result in the appropriate field
# of the same Elasticsearch index.
# Note: access to Elasticsearch is required.
# ##################################
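# Overview of the processing below:
#   1) Scroll every document of the input index and collect the ORG entities
#      found in its "ners_v1" field (after stop-sign / stop-word filtering).
#   2) For every section, find other sections that share an ORG entity
#      (relation_finder), merge duplicates into weighted relations
#      (extract_weight), and index the result into the output index.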
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="25m", **kwargs):
"""
Helper to iterate ALL values from a single index
Yields all the documents.
"""
global counter
global total
is_first = True
while True:
# Scroll next
if is_first: # Initialize scroll
# result = es.search(index=index, scroll="2m", **kwargs, body={
# "size": pagesize
# })
result = es.search(
index=index,
scroll="2m",
**kwargs,
size=pagesize,
body={
"query": {
"bool": {
"must_not": [
{"exists": {"field": "nlp_parser.type"}},
{"match": {"content_len": 0}},
{"match": {"parse_state": 1}},
{"match": {"parse_state": 2}}
]
}
}
}
)
total = result["hits"]["total"]["value"]
print("total = %d" % total)
is_first = False
else:
result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
scroll_id = result["_scroll_id"]
hits = result["hits"]["hits"]
counter += len(hits)
print("progress -> %.2f %%" % ((counter / total) * 100))
# Stop after no more docs
if not hits:
break
# Yield each entry
yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
def es_iterate_some_documents(es, index, records, pagesize=250, scroll_timeout="25m", **kwargs):
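    """
    Helper to iterate only the documents whose _id appears in `records`,
    using a `terms` query on _id and the scroll API.
    Yields dicts of the form {"source": hit["_source"], "id": hit["_id"]}.
    """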
    global counter
    global total
    is_first = True
    query = {
        "query": {
            "terms": {
                "_id": records
            }
        }
    }
    while True:
        # Scroll next
        if is_first:  # Initialize scroll
            # result = es.search(index=index, scroll="2m", **kwargs, body={
            #     "size": pagesize
            # })
            result = es.search(
                index=index,
                scroll="2m",
                **kwargs,
                size=pagesize,
                body=query
            )
            total = result["hits"]["total"]["value"]
            print("total = %d" % total)
            is_first = False
        else:
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        print("progress -> %.2f %%" % ((counter / total) * 100))
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
def prepare_data(ner_obj_list):
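    """
    Map raw NER objects (ner_key, ner_value, ner_start_token, ner_end_token,
    ner_score) to the flat {key, value, begin, end, score} dicts stored in
    Elasticsearch.
    """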
    ner_data_list = []
    for ner_obj in ner_obj_list:
        ner_data = {
            "key": ner_obj['ner_key'],
            "value": ner_obj['ner_value'],
            "begin": ner_obj['ner_start_token'],
            "end": ner_obj['ner_end_token'],
            "score": ner_obj['ner_score']
        }
        ner_data_list.append(ner_data)
    return ner_data_list
def remove_stop_signs(ner_value):
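    """
    Strip the characters listed in stop_signs from both ends of an NER value.
    Note that str.lstrip/str.rstrip treat their argument as a set of
    characters, so the multi-character entries act as character sets rather
    than whole substrings.
    """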
    ner_temp = ner_value
    for sign in stop_signs:
        ner_temp = ner_temp.lstrip(sign)
        ner_temp = ner_temp.rstrip(sign)
    ner_temp = ner_temp.strip()
    return ner_temp
def remove_stop_orgs(ner_value):
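    """Return False if the value is in the stop_orgs black-list, True otherwise."""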
    if ner_value in stop_orgs:
        return False
    return True
def relation_finder(all_orgs):
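    """
    For every section in all_orgs, find the other sections that mention at
    least one of the same ORG entities. Each shared mention is recorded as a
    separate relation item (section_id, qanon_id, ai_key, type='ORG',
    weight=1); duplicates pointing to the same section are merged later by
    extract_weight.
    """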
    new_orgs = []
    for index, section in enumerate(all_orgs):
        if index % 1000 == 0:
            print(f"relation finder progress: {(index/len(all_orgs)) * 100:.2f} %")
        # if index > 1000:
        #     break
        related_sections = []
        orgs = section['orgs']
        for org in orgs:
            for compare_item in all_orgs:
                compare_item_orgs = compare_item['orgs']
                if section['id'] == compare_item['id']:  # prevent a section from being related to itself
                    continue
                if org in compare_item_orgs:
                    # related_sections.append(compare_item['id'])
                    related_sections.append({
                        'section_id': compare_item['id'],
                        'qanon_id': compare_item['qanon_id'],
                        'ai_key': org,
                        'type': 'ORG',
                        'weight': 1,
                    })
        # rel_unique_list_sorted_by_repitation = []
        # find the relation level by counting how many times each relation repeats
        # rel_counter = Counter(related_sections)
        # rel_unique_list = list(rel_counter.items())
        # rel_unique_list_sorted_by_repitation = sorted(rel_unique_list, key=lambda x: x[1], reverse=True)
        # related_sections = rel_unique_list_sorted_by_repitation
        # related_sections_temp = []
        # for relation in related_sections:
        #     related_sections_temp.append({
        #         'related_section_id': relation[0],
        #         'weight': relation[1]
        #     })
        new_orgs.append({
            'id': section['id'],
            'qanon_id': section['qanon_id'],
            'orgs': section['orgs'],
            # 'orgs_text': section['orgs_text'],
            'related_sections': related_sections,
        })
    return new_orgs
def extract_weight(relation_list):
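    """
    Merge duplicate relations that point to the same section into a single
    entry: weight is the number of shared ORG mentions and ai_key becomes the
    list of those mentions.
    """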
    relation_list_temp = []
    for rel_item in relation_list:
        weight = 0
        rel_labels = []
        current_section_id = rel_item['section_id']
        for item in relation_list:
            if item['section_id'] == current_section_id:
                weight += 1
                rel_labels.append(item['ai_key'])
        for rel_item2 in relation_list_temp:
            if current_section_id == rel_item2['section_id']:
                break
        else:
            relation_list_temp.append({
                "section_id": current_section_id,
                "qanon_id": rel_item['qanon_id'],
                "ai_key": rel_labels,
                "type": rel_item['type'],
                "weight": weight
            })
    return relation_list_temp
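# Illustrative example (not executed, values are made up): two relation items
# that point to the same section collapse into one weighted entry, e.g.
#   extract_weight([
#       {"section_id": "s1", "qanon_id": "q1", "ai_key": "org A", "type": "ORG", "weight": 1},
#       {"section_id": "s1", "qanon_id": "q1", "ai_key": "org B", "type": "ORG", "weight": 1},
#   ])
#   == [{"section_id": "s1", "qanon_id": "q1", "ai_key": ["org A", "org B"], "type": "ORG", "weight": 2}]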
print(datetime.datetime.now())
index_name_i = "ai_mj_qa_section-v05"  # the Elasticsearch instance running on the GPU server
# index_name_o = 'mj_qa_test-v01'
# is_update_state = False
index_name_o = "ai_mj_qa_section-v07"
is_update_state = False
mapping_o = ""
es = Elasticsearch(
    "http://127.0.0.1:6900",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
)
try:
    if not es.indices.exists(index=index_name_o):
        response = es.indices.create(index=index_name_o, body=mapping_o)
        # print out the response:
        print("create index response:", response)
except:
    print("elastic error")
counter = 0
total = 0
id = ""
try:
    # # Records that previously failed with an error are listed in the file below
    # address3 = os.getcwd() + '/Flair_NER/data/ner_reg_list_prev2.txt'
    # section_list_text = read_file_by_address(address3)
    # records = section_list_text.splitlines()
    section_list = es_iterate_all_documents(es, index_name_i)
except Exception as e:
    print(' reading from elastic error! ')
    save_error(0, e)
stop_signs = [')', ']', '،', '؛', '»', '', '-', '»،', '.', '(', '', '/', '\\',
              '.......................................................',
              '..........................................................',
              ':..............................................................',
              ]
stop_orgs = ['و','.','0','از','است','هایایرانی','ای','یافته و','یافته',
'یا دولتی شهرداری','گیری','وزارت.','وزارت خانه های','یافته،',
'یافته و تروریسم','','خانه','چمن','ذغال','یا شهرداری','یا دولتی']
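# stop_orgs2: a second black-list of values to skip; it appears to consist
# mostly of Latin scientific names of plant pests and pathogens (plus a few
# place names and generic terms) that the NER model tags as ORG.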
stop_orgs2 = ["Acrobasis pyrivorella",
"Aleurocanthus spiniferus",
"Amauromyza maculosa",
"سوسک شاخک بلند آسیایی",
"Anoplophora glabripennis",
"Araecerus fasciculatus",
"Asterolecanium phoenicis",
"Bactrocera spp",
"fusca",
"Cacoecimorpha pronubana",
"Coccotrypes dactyliperda",
"Coccus viridis",
"Cryptophlebia leucotreta",
"Cydia prunivora",
"Daphorina citri",
"مگس گالزای گل داوودی",
"Epichoristodes acerbella",
"Epochra canadensis",
"Euphranta japonica",
"Eutetranychus",
"Eutetranychus carpini",
"عنکبوتی",
"Eutetranychus hirstii",
"Gonipterus scutellanus",
"Leptinotarsa desemlineata",
"Mackiella phoenicls",
"Mycetaspis personata",
"Oligonychus mangiferus",
"Oligonychus pratensis",
"Omatissus binotatus",
"Opogona sacchari",
"Oryctes spp",
"Pammene rhediella",
"Petrobia latens",
"Phoenicoccus marlani",
"Phoracantha semipunctata",
"Planococcus ficus",
"Platypara poeciloptera",
"Platyptilia carduidactyla",
"Popillia japonica",
"Premnotrypes latithorax",
"Scaphytopius nitridus",
"Scirtothrips",
"سوسک پوستخوار",
"Scolytus multistriatus",
"Stenodiplosis sorghicola",
"Simeotarsenemus",
"Tetranychus",
"Toxoptera citricida",
"Trioza erytreae",
"Trypodendron domesticum",
"Tumescoptes truchycorp",
"Zabrotes subfasciatus",
"Ascochyta lentis",
"Fusarium",
"Cephalosporium maydis",
"Ceratocystis fimbriata",
"Cercospora kikuchii",
"Claviceps purpurea",
"Cochliobolus heterostrophus",
"Colletotrichum acutatum",
"Colletotrichum lagenarium",
"Cryphonectria parasitica",
"Cryptodiaporthe populea",
"Didymella lycopersici",
"oxysporum",
"Gaeumannomyces graminis",
"Glomerella gossypii",
"Monilinia fructigena",
"Monilinia fructicola",
"Phaeoisariopsis griseola",
"Phaeoramularia angolensis",
"Phialophora cinerescens",
"Phialophora gregata",
"بلایت سیاه سیبزمینی",
"andina",
"Phyllosticta solitaria",
"Phymatotrichopsis omnivora",
"Plasmodiophora brassicae",
"pittieriana",
"Septoria lycopersici",
"Setosphaeria turcica",
"جرب پودری سیبزمینی",
"Spongospora subterranea",
"Stenocarpella macrospora",
"Stenocarpella maydis",
"گال (زگیل ) سیبزمینی",
"Synchytrium endobioticum",
"سیبزمینی",
"Thecaphora solani",
"سیاهک پاکوتاه گندم",
"Urocystis cepulae",
"Uromyces transversalis",
"potato latent tymovirus",
"Andean potato mottle comovirus",
"Banana bract mosaic potyvirus",
"Banana bunchy top nanavirus",
"golden mosaic geminivirus",
"برگ چغندر",
"Beet leaf curl virus",
"Cherry rasp leaf nepovirus",
"Citrus leprosis nucleorhabdovirus",
"Citrus tatter leaf capillovirus",
"Citrus variegation ilarvirus",
"Citrus",
"Lettuce infectious yellows crinivirus",
"Little cherry clostrovirus",
"spindle tuber viroid",
"dwarf nepovirus",
"ویروس چروکیدگی توت فرنگی",
"Tobacco ringspot nepovirus",
"bushy stunt tombusvirus",
"ringspot",
"Burkholderia caryophylli",
"Citrus greening bacterium",
"michiganensis",
"chrysanthemi",
"Erwinia tracheiphila",
"syringae",
"Pseudomonas syringae",
"Ralstonia solanacearum",
"Xanthomonas fragariae",
"Xanthomonas vesicatoria",
"Xylella fastidiosa",
"Lime witches",
"Palm lethal yellowing phytoplasma",
"Peach rosette phytoplasma",
"Peach X-disease phytoplasma",
"Peach yellows phytoplasma",
"Pear decline phytoplasma",
"Potato stolbur phytoplasma",
"Anguina agrostis",
"Anguina funesta",
"Anguina graminis",
"Aphelenchoides fragariae",
"Globodera rostochiensis",
"Heterodera trifolii",
"articlla",
"Meloidogyne fallax",
"Pratylenchus coffee",
"Pratylenchus fallax",
"citrophilus",
"Tylenchulus semipenetrans",
"Aphis gossypii",
"Bactrocera oleae",
"Brevipalpus phoenicis",
"spp",
"Ceratitis capitata",
"Epilachna chrysomelina",
"Lasioderma serricorne",
"Liriomyza trifolii",
"Pectinophora gossypiella",
"بید چغندر قند",
"Phthorimaea ocellatella",
"Phyllocoptruta oleivora",
"Planococcus citri",
"Polyphagotarsonemus latus",
"Pseudococcus longispinus",
"Toxoptera aurantii",
"Trogoderma spp",
"Viteus vitifoliae",
"united states of america",
"stone fruit",
"states of america",
"rosette",
"petiolaris",
"pensilvanicum",
"peanut clump furovirus",
"orientalis Waterhouse",
"of america",
"mop-top furovirus",
"macrospora",
"leucoloma Boheman",
"leucoloma",
"leaf roll nepovirus",
"greening",
"dwarf",
"bacterium",
"Thrips palmi Karny",
"Thecaphora solani Barrus",
"The Hague",
"Sugarcane grassy shoot",
"Striga lutea Lour",
"Striga hermonthica Benth",
"Striga euphrasioides",
"Sternochetus mangiferae",
"Spodoptera litura Fabricius",
"Spodoptera frugiperda",
"Spodoptera eridania Cramer",
"Solanum elaeagnifolium Cav",
"Solanum",
"Sesbania macrocarpa",
"Satsuma dwarf nepovirus",
"Salvia lancifolia",
"Raspberry ringspot nepovirus",
"Raspberry leaf curl luteovirus",
"Prostephanus truncatus Horn",
"Potato yellow dwarf rhabdovirus",
"Potato black ringspot",
"Popillia",
"Polygonum pensilvanicum",
"Plum pox potyvirus",
"Pissodes castaneus Degeer",
"Phytophthora megasperma",
"Phytophthora cinnamomi Rands",
"Phymatotrichopsis omnivora Hennebert",
"Phylloxera vastatrix Planch",
"Peronosclerospora sorghi",
"Peronosclerospora philippinensis",
"Peronosclerospora maydis C",
"Perkinsiella saccharicida Kirkaldy",
"Pentalonia nigronervosa Coquerel",
"Pectinophora scutigera Holdaway",
"Peanut stripe potyvirus",
"Pea early browning tobravirus",
"Parasaissetia nigra Nietner",
"Parabemisia myricae Kuwana",
"Palm",
"Otiorhynchus sulcatus Fabricius",
"New York",
"Mycosphaerella dearnessii Barr",
"Meloidogyne fallax Karssen",
"Massee",
"Los Angeles",
"Lime withes broom",
"Jacquemontia tamnifolia Griseb",
"Iridomyrmex",
"Ipomoea hederacea",
"High",
"Helianthus petiolaris",
"Helianthus lenticularis",
"Helianthus ciliaris",
"Haplothrips chinensis",
"Gymnosporangium globosum",
"Glomerella gossypii Edgerton",
"Globodera tabacum",
"Globodera pallida Behrens",
"Gilpinia hercyniae Hartig",
"Flower",
"Euphorbia marginata Pursh",
"Euphorbia",
"Epitrix tuberis Gentner",
"Emex australis",
"Dysmicoccus brevipes",
"Diatraea saccharalis",
"Cuscuta europaea",
"Cuscuta epithymum Murr",
"Cuscuta epilinum Weiche",
"Cryphonectria parasitica Barr",
"Cotton leaf curl geminivirus",
"Corporation",
"Coccus viridis Green",
"Citrus leaf rugose ilarvirus",
"Ceratocystis fagacearum",
"Ceratitis capitata Wiedemann",
"Carposina niponensis",
"Bursaphelenchus mucronatus",
"Broad bean mottle bromovirus",
"Bactrocera tryoni Froggatt",
"Bactrocera oleae Gmelin",
"Bactrocera dorsalis Hendel",
"Bactrocera cucurbitae Coquillett",
"Aspidiella",
"Apple stem pitting virus",
"Apple chlorotic leafspot trichovirus",
"Anoplophora glabripennis)Motschulsky",
"Anastrepha obliqua Macquart",
"Amaranthus blitoides S.Wats",
]
novalid = -15000000000
all_orgs = []
all_orgs_text = ''
orgs_list = []
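# First pass: scan every section from the input index and collect its
# filtered, normalized ORG entities into all_orgs.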
for index, mentry in enumerate(section_list):
    try:
        # if index > 20000:
        #     break
        id = mentry["id"]
        # if not id in records:
        #     print(id + ' exists')
        #     continue
        entry = mentry["source"]
        content = entry.get("content", "")
        content_ai = entry["content_ai"]
        content_len = entry.get("content_len", "")
        qanon_id = entry.get("qanon_id", "")
        ners = entry['ners_v1']
        current_orgs = []
        for ner_item in ners:
            if ner_item['key'] == 'ORG':
                ner_temp = remove_stop_signs(ner_item['value'].strip())
                if ner_temp in stop_orgs:
                    continue
                if ner_temp in stop_orgs2:
                    continue
                if ner_temp.startswith('ها') or ner_temp.startswith('های'):
                    continue
                if ('سازمان های' in ner_temp
                        or 'سازمان ها' in ner_temp
                        or 'وزارت خانها' in ner_temp
                        or 'وزارت خانهها' in ner_temp
                        or 'وزارت خانههای' in ner_temp
                        or 'وزارت خانه' in ner_temp
                        or 'وزارت خانه های' in ner_temp
                        or 'وزارتین ' in ner_temp
                        ):
                    continue
                if not ('وزارت' in ner_temp or 'سازمان' in ner_temp):
                    continue
                ner_temp = normalize_content(ner_temp)
                current_orgs.append(ner_temp)
        if current_orgs:
            unique_orgs_list = []
            # remove duplicate values
            [unique_orgs_list.append(x) for x in current_orgs if x not in unique_orgs_list]
            current_orgs = unique_orgs_list
            # current_org_list_text = ''
            # for c_org in current_orgs:
            #     current_org_list_text += c_org + '\n'
            all_orgs.append({
                "id": id,
                "qanon_id": qanon_id,
                "orgs": current_orgs,
                # "orgs_text": current_org_list_text,
            })
            # separator = "*"*100 + '\n'
            # all_orgs_text += f"id: {id}\norgs: {current_orgs}\n\norgs_text: \n{current_org_list_text}\n\ncontentai: {content_ai}\n{separator}"
            # for org in current_orgs:
            #     orgs_list.append(org)
        # all_orgs_relations = relation_finder(all_orgs)
        # print(len(all_orgs_relations))
        # # for i, section in enumerate(all_orgs_relations):
        # #     print('rel_task: ' + str(i))
        # data = {
        #     "qanon_id": qanon_id,
        #     "content_ai": content_ai,
        #     "ners_v1": ners,
        #     "relations": all_orgs_relations
        # }
        # eid = id
        # try:
        #     if is_update_state:
        #         resp = es.update(index=index_name_o, id=eid, doc=data)
        #     else:
        #         # write_to_json(data, './data/regulations_ner.json')
        #         resp = es.index(index=index_name_o, id=eid, document=data)
        # except Exception as e:
        #     save_error(id, e)
    except Exception as error:
        print(error)
    # print('relation task --------------> ' + str(count))
    # if count > 1000:
    #     break
    # if content_len == 0:
    #     continue
all_orgs_relations = relation_finder(all_orgs)
print('all orgs count: ' + str(len(all_orgs_relations)))
# section_list_temp = [c for i, c in enumerate(section_list)]
# count the number of records
# len_section_list_temp = len(section_list_temp)
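# Second pass: re-scroll the input index; for every document, look up its
# pre-computed related sections, collapse them into weighted relations with
# extract_weight, and write the document (with its relations) to the output index.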
count = 0
section_list1 = es_iterate_all_documents(es, index_name_i)
for mentry1 in section_list1:
    count += 1
    # if count > 10000:
    #     break
    # print('relation task for section: ' + str(count))
    if count % 100 == 0:
        print(f"relation finder progress: {(count / 273442) * 100:.2f} %")
    id1 = mentry1["id"]
    entry1 = mentry1["source"]
    content1 = entry1.get("content", "")
    content_ai1 = entry1["content_ai"]
    content_len1 = entry1.get("content_len", "")
    qanon_id1 = entry1.get("qanon_id", "")
    ners1 = entry1['ners_v1']
    current_relation_list = []
    # iid = 'mj_qa_section_230085'
    # if id1 != iid:
    #     continue
    # if id1 == iid:
    #     x = 10
    current_relations = []
    for i, section in enumerate(all_orgs_relations):
        if section['id'] == id1:
            current_relations = section['related_sections']
    for rel_item in current_relations:
        current_relation_list.append(rel_item)
    current_relation_list = extract_weight(current_relation_list)
    data1 = {
        "qanon_id": qanon_id1,
        "content_ai": content_ai1,
        "ners_v1": ners1,
        "relations": current_relation_list
    }
    eid = id1
    try:
        if is_update_state:
            resp = es.update(index=index_name_o, id=eid, doc=data1)
        else:
            # write_to_json(data, './data/regulations_ner.json')
            resp = es.index(index=index_name_o, id=eid, document=data1)
            # pass
    except Exception as e:
        save_error(eid, e)
print(datetime.datetime.now())
print(" # # # relation extraction finished! # # # ")
# use Counter to count how many times each entity is repeated
'''
counter = Counter(orgs_list)
unique_dict = dict(counter)
unique_list = list(unique_dict.items())
unique_list_sorted_by_key = sorted(unique_list, key=lambda x: x[1], reverse=True)
orgs_list = unique_list_sorted_by_key
org_list_text = ''
for org in orgs_list:
    org_list_text += str(org[1]) + ' *** ' + org[0] + '\n'
orgs_address = os.getcwd() + '/Flair_NER/data/all_orgs.txt'
save_to_file_by_address(orgs_address, org_list_text)
orgs_text_address = os.getcwd() + '/Flair_NER/data/all_orgs_text.txt'
save_to_file_by_address(orgs_text_address, all_orgs_text)
save_error(id, e)
'''