272 lines
9.1 KiB
Python
272 lines
9.1 KiB
Python
from elasticsearch7 import Elasticsearch
|
|
from collections import Counter
|
|
from general_functions import save_error, normalize_content
|
|
from funcs import read_from_json
|
|
import datetime
|
|
import os
|
|
|
|
# ##################################
|
|
# برای محتوای مواد و احکام قانون که از معاونت قوانین مجلس در ایندکس الاستیک ذخیره شده است
|
|
# qanon_section-v02
|
|
# تحلیل روی بعضی فیلدها می کند و تاریخ های آن را استخراج و تبدیل به فرمت خاص تایم استمپ می کند
|
|
# و در فیدل مناسب در همان ایندکس الاستیک ذخیره میکند
|
|
# توجه : دسترسی به الاستیک باید باشد
|
|
# ##################################
|
|
|
|
|
|
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="25m", **kwargs):
|
|
|
|
global counter
|
|
global total
|
|
is_first = True
|
|
while True:
|
|
# Scroll next
|
|
if is_first: # Initialize scroll
|
|
# result = es.search(index=index, scroll="2m", **kwargs, body={
|
|
# "size": pagesize
|
|
# })
|
|
result = es.search(
|
|
index=index,
|
|
scroll="2m",
|
|
**kwargs,
|
|
size=pagesize,
|
|
body={
|
|
"query": {
|
|
"bool": {
|
|
"must_not": [
|
|
{"exists": {"field": "nlp_parser.type"}},
|
|
{"match": {"content_len": 0}},
|
|
{"match": {"parse_state": 1}},
|
|
{"match": {"parse_state": 2}}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
)
|
|
total = result["hits"]["total"]["value"]
|
|
print("total = %d" % total)
|
|
is_first = False
|
|
else:
|
|
result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
|
|
scroll_id = result["_scroll_id"]
|
|
hits = result["hits"]["hits"]
|
|
counter += len(hits)
|
|
print("progress -> %.2f %%" % ((counter / total) * 100))
|
|
# Stop after no more docs
|
|
if not hits:
|
|
break
|
|
# Yield each entry
|
|
yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
|
|
|
|
def es_iterate_some_documents(es, index, records, pagesize=250, scroll_timeout="25m", **kwargs):
|
|
|
|
global counter
|
|
global total
|
|
is_first = True
|
|
query = {
|
|
"query": {
|
|
"terms": {
|
|
"_id": records
|
|
}
|
|
}
|
|
}
|
|
while True:
|
|
# Scroll next
|
|
if is_first: # Initialize scroll
|
|
# result = es.search(index=index, scroll="2m", **kwargs, body={
|
|
# "size": pagesize
|
|
# })
|
|
result = es.search(
|
|
index=index,
|
|
scroll="2m",
|
|
**kwargs,
|
|
size=pagesize,
|
|
body= query
|
|
)
|
|
total = result["hits"]["total"]["value"]
|
|
print("total = %d" % total)
|
|
is_first = False
|
|
else:
|
|
result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
|
|
scroll_id = result["_scroll_id"]
|
|
hits = result["hits"]["hits"]
|
|
counter += len(hits)
|
|
print("progress -> %.2f %%" % ((counter / total) * 100))
|
|
# Stop after no more docs
|
|
if not hits:
|
|
break
|
|
# Yield each entry
|
|
yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
|
|
|
|
def prepare_data(ner_obj_list):
|
|
ner_data_list = []
|
|
for ner_obj in ner_obj_list:
|
|
ner_data = {
|
|
"key" :ner_obj['ner_key'],
|
|
"value" :ner_obj['ner_value'],
|
|
"begin" :ner_obj['ner_start_token'],
|
|
"end" :ner_obj['ner_end_token'],
|
|
"score" :ner_obj['ner_score']
|
|
}
|
|
|
|
ner_data_list.append(ner_data)
|
|
return ner_data_list
|
|
|
|
|
|
def relation_finder(all_orgs):
|
|
new_orgs = []
|
|
|
|
for index, section in enumerate(all_orgs):
|
|
if index % 1000 == 0:
|
|
print(f"relation finder progress: {(index/len(all_orgs)) * 100:.2f} %")
|
|
|
|
# if index > 1000:
|
|
# break
|
|
related_sections = []
|
|
orgs = section['ai_key']
|
|
|
|
for org in orgs:
|
|
for compare_item in all_orgs:
|
|
compare_item_orgs = compare_item['ai_key']
|
|
if section['section_id'] == compare_item['section_id']:# جلوگیری از ارتباط یک مقرره با خودش
|
|
continue
|
|
if org in compare_item_orgs:
|
|
# related_sections.append(compare_item['id'])
|
|
related_sections.append({
|
|
'section_id': compare_item['section_id'],
|
|
'qanon_id': compare_item['qanon_id'],
|
|
'ai_key': org,
|
|
'type': 'ORG',
|
|
'weight': 1,
|
|
})
|
|
|
|
new_orgs.append({
|
|
'id': section['section_id'],
|
|
'qanon_id': section['qanon_id'],
|
|
'ai_key': section['orgs'],
|
|
|
|
# 'orgs_text': section['orgs_text'],
|
|
'relations': related_sections,
|
|
})
|
|
return new_orgs
|
|
|
|
def extract_weight(relation_list):
|
|
relation_list_temp = []
|
|
for rel_item in relation_list:
|
|
weight = 0
|
|
rel_labels = []
|
|
current_section_id = rel_item['section_id']
|
|
for item in relation_list:
|
|
if item['section_id'] == current_section_id:
|
|
weight += 1
|
|
rel_labels.append(item['ai_key'])
|
|
|
|
for rel_item2 in relation_list_temp:
|
|
if current_section_id == rel_item2['section_id']:
|
|
break
|
|
else:
|
|
relation_list_temp.append({
|
|
"section_id": current_section_id,
|
|
"qanon_id": rel_item['qanon_id'],
|
|
"ai_key": rel_labels,
|
|
"type": rel_item['type'],
|
|
"weight": weight
|
|
})
|
|
break
|
|
if len(relation_list_temp) == 0:
|
|
relation_list_temp.append({
|
|
"section_id": current_section_id,
|
|
"qanon_id": rel_item['qanon_id'],
|
|
"ai_key": rel_labels,
|
|
"type": rel_item['type'],
|
|
"weight": weight
|
|
})
|
|
return relation_list_temp
|
|
|
|
print(datetime.datetime.now())
|
|
index_name_i = "ai_mj_qa_section-v07" # الاستیک موجود روی جی پی یو
|
|
# index_name_o = 'mj_qa_test-v01'
|
|
# is_update_state = False
|
|
index_name_o = "ai_mj_qa_section-v03"
|
|
is_update_state = False
|
|
|
|
mapping_o = ""
|
|
|
|
es = Elasticsearch(
|
|
"http://127.0.0.1:6900",
|
|
basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
|
|
)
|
|
|
|
try:
|
|
if not es.indices.exists(index=index_name_o):
|
|
response = es.indices.create(index=index_name_o, body=mapping_o)
|
|
# print out the response:
|
|
print("create index response:", response)
|
|
except:
|
|
print("elastic error")
|
|
|
|
counter = 0
|
|
total = 0
|
|
id = ""
|
|
|
|
novalid = -15000000000
|
|
all_orgs = []
|
|
all_orgs_text = ''
|
|
orgs_list = []
|
|
main_9k_sections = read_from_json("/data/selected_sections_9K.json")
|
|
main_9k_sections_list = [item00["id"] for item00 in main_9k_sections]
|
|
section_list1 = es_iterate_all_documents(es, index_name_i)
|
|
count = 0
|
|
for mentry1 in section_list1:
|
|
count += 1
|
|
id1 = mentry1["id"]
|
|
# اگر این سکشن جزء سکشن های اصلی موجود در داده های فایل 9 هزارتایی نبود، آن را بررسی نکن
|
|
if not id1 in main_9k_sections_list:
|
|
continue
|
|
# if count > 10000:
|
|
# break
|
|
# print('relation task for section: ' + str(count))
|
|
if count % 100 == 0:
|
|
print(f"relation finder progress: {(count / 273442) * 100:.2f} %")
|
|
|
|
entry1 = mentry1["source"]
|
|
content1 = entry1.get("content", "")
|
|
content_ai1 = entry1["content_ai"]
|
|
content_len1 = entry1.get("content_len", "")
|
|
qanon_id1 = entry1.get("qanon_id", "")
|
|
ners1 = entry1['ners_v1']
|
|
relations = entry1['relations']
|
|
|
|
temp = relations
|
|
before = len(relations)
|
|
relations = extract_weight(relations)
|
|
after = len(relations)
|
|
if before != after:
|
|
print()
|
|
print("id: "+ id1 +" merged find! diffrence = "+str(before-after))
|
|
print()
|
|
|
|
data1 = {
|
|
"qanon_id" : qanon_id1,
|
|
"content_ai": content_ai1,
|
|
"ners_v1": ners1,
|
|
"relations": relations
|
|
}
|
|
|
|
eid = id1
|
|
|
|
try:
|
|
if is_update_state:
|
|
resp = es.update(index=index_name_o, id=eid, doc=data1)
|
|
else:
|
|
#write_to_json(data, './data/regulations_ner.json')
|
|
resp = es.index(index=index_name_o, id=eid, document=data1)
|
|
#pass
|
|
|
|
|
|
except Exception as e:
|
|
save_error(id, e)
|
|
|
|
print(datetime.datetime.now())
|
|
print(" # # # relation update finished! # # # ")
|