Flair_NER/do_ner_reg.py

231 lines
7.8 KiB
Python

from elasticsearch7 import Elasticsearch
from general_functions import save_error
from ner_proccess import inference_main
import os
from funcs import save_to_file_by_address, read_file_by_address, write_to_json
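# ##################################
# For the content of law articles and provisions stored in the Elasticsearch
# index by the Parliament's Deputy of Laws
# qanon_section-v02
# Analyzes some fields, extracts their dates and converts them to a specific
# timestamp format
# and stores them in the appropriate field of the same Elasticsearch index
# Note: access to Elasticsearch is required
# NOTE(review): this header mentions date extraction, but the code below runs
# NER (inference_main) and writes the result to `ners_v1`; the header looks
# copied from another script - confirm and update.
# ##################################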
# Module configuration and Elasticsearch bootstrap.
index_name_i = "semantic_search-v09"  # source index (Elasticsearch instance on the GPU machine)
# Alternative output target kept for reference:
# index_name_o = 'mj_qa_test-v01'
# is_update_state = False
index_name_o = "ai_mj_qa_section-v05"  # index that receives the NER results
is_update_state = False  # True -> es.update on existing docs, False -> es.index
mapping_o = ""  # body passed to indices.create when the output index is missing

# NOTE(review): credentials are hard-coded in source; move them to environment
# variables or a secret store.
es = Elasticsearch(
    "http://127.0.0.1:6900",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm"),
)

try:
    if not es.indices.exists(index=index_name_o):
        response = es.indices.create(index=index_name_o, body=mapping_o)
        # print out the response:
        print("create index response:", response)
except Exception as e:
    # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate,
    # and the cause is reported instead of being silently dropped.
    print("elastic error", e)

# Progress state shared (via `global`) with the scroll iterators below.
counter = 0
total = 0
# Id of the document currently being processed; read by the error handlers of
# the main loop. (Shadows the builtin `id`; kept because the main loop uses it.)
id = ""
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="25m", **kwargs):
    """
    Iterate over the not-yet-processed documents of a single index.

    The search excludes documents that have an ``nlp_parser.type`` field,
    whose ``content_len`` is 0, or whose ``parse_state`` is 1 or 2.

    Args:
        es: Elasticsearch client exposing ``search``, ``scroll`` and
            ``clear_scroll``.
        index: name of the index to read.
        pagesize: number of hits requested per page.
        scroll_timeout: keep-alive of the scroll context, used for the
            initial search and for every following scroll request.
        **kwargs: extra keyword arguments forwarded to ``es.search``.

    Yields:
        dict: ``{"source": <_source of the hit>, "id": <_id of the hit>}``.

    Side effects:
        Updates the module-level ``counter`` and ``total`` and prints the
        progress after each page.
    """
    global counter
    global total

    query = {
        "query": {
            "bool": {
                "must_not": [
                    {"exists": {"field": "nlp_parser.type"}},
                    {"match": {"content_len": 0}},
                    {"match": {"parse_state": 1}},
                    {"match": {"parse_state": 2}},
                ]
            }
        }
    }

    scroll_id = None
    try:
        # Initialize scroll. The caller-supplied keep-alive is used here too:
        # the consumer may need a long time to work through the first page.
        result = es.search(
            index=index,
            scroll=scroll_timeout,
            **kwargs,
            size=pagesize,
            body=query,
        )
        total = result["hits"]["total"]["value"]
        print("total = %d" % total)

        while True:
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            counter += len(hits)
            # Guard against division by zero when the query matches nothing.
            if total:
                print("progress -> %.2f %%" % ((counter / total) * 100))
            # Stop after no more docs
            if not hits:
                break
            # Yield each entry
            yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
            # Scroll next
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
    finally:
        # Release the server-side scroll context instead of letting it linger
        # until the keep-alive expires. Best effort: a failure here must not
        # mask an error raised by the iteration itself.
        if scroll_id is not None:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception as e:
                print("clear scroll error", e)
def es_iterate_some_documents(es, index, records, pagesize=250, scroll_timeout="25m", **kwargs):
    """
    Iterate over the documents of a single index whose ``_id`` is in ``records``.

    Args:
        es: Elasticsearch client exposing ``search``, ``scroll`` and
            ``clear_scroll``.
        index: name of the index to read.
        records: list of document ids used in a ``terms`` query on ``_id``.
        pagesize: number of hits requested per page.
        scroll_timeout: keep-alive of the scroll context, used for the
            initial search and for every following scroll request.
        **kwargs: extra keyword arguments forwarded to ``es.search``.

    Yields:
        dict: ``{"source": <_source of the hit>, "id": <_id of the hit>}``.

    Side effects:
        Updates the module-level ``counter`` and ``total`` and prints the
        progress after each page.
    """
    global counter
    global total

    query = {
        "query": {
            "terms": {
                "_id": records
            }
        }
    }

    scroll_id = None
    try:
        # Initialize scroll. The caller-supplied keep-alive is used here too:
        # the consumer may need a long time to work through the first page.
        result = es.search(
            index=index,
            scroll=scroll_timeout,
            **kwargs,
            size=pagesize,
            body=query,
        )
        total = result["hits"]["total"]["value"]
        print("total = %d" % total)

        while True:
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            counter += len(hits)
            # Guard against division by zero when the query matches nothing.
            if total:
                print("progress -> %.2f %%" % ((counter / total) * 100))
            # Stop after no more docs
            if not hits:
                break
            # Yield each entry
            yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
            # Scroll next
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
    finally:
        # Release the server-side scroll context instead of letting it linger
        # until the keep-alive expires. Best effort: a failure here must not
        # mask an error raised by the iteration itself.
        if scroll_id is not None:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception as e:
                print("clear scroll error", e)
def prepare_data(ner_obj_list):
    """
    Convert NER result entries to the short-key form.

    Each input entry must provide ``ner_key``, ``ner_value``,
    ``ner_start_token``, ``ner_end_token`` and ``ner_score``; they are copied
    to ``key``, ``value``, ``begin``, ``end`` and ``score`` respectively.

    Args:
        ner_obj_list: iterable of mappings with the ``ner_*`` keys above.

    Returns:
        list[dict]: one converted dict per input entry, in input order.

    Raises:
        KeyError: if an entry lacks one of the expected ``ner_*`` keys.
    """
    # (output key, input key) pairs, in the order of the resulting dict.
    field_map = (
        ("key", 'ner_key'),
        ("value", 'ner_value'),
        ("begin", 'ner_start_token'),
        ("end", 'ner_end_token'),
        ("score", 'ner_score'),
    )
    return [
        {target: item[source] for target, source in field_map}
        for item in ner_obj_list
    ]
# Main script body: run NER over documents read from `index_name_i` and store
# the results in `index_name_o`.
section_id = ""  # id of the document in progress; reported if the run aborts
try:
    try:
        # Ids of the records that previously failed are listed in this file.
        # address3 = os.getcwd() + '/data/ner_reg_error_ids.txt'
        section_list_text = read_file_by_address('/data/ner_reg_error_ids.txt')
        # A set keeps the per-document membership test O(1).
        records = set(section_list_text.splitlines())
        documents = es_iterate_all_documents(es, index_name_i)
    except Exception as e:
        print(' reading from elastic error! ')
        save_error(0, e)
        # Without the id list and the iterator there is nothing to process.
        records = set()
        documents = []

    count = 0
    novalid = -15000000000  # unused in this section; kept from the original

    for mentry in documents:
        count += 1
        try:
            section_id = mentry["id"]
            # NOTE(review): only ids listed in the file are processed and every
            # other document is skipped with the message "exists" - confirm the
            # condition is not inverted.
            if section_id not in records:
                print(section_id + ' exists')
                continue
            entry = mentry["source"]
            content = entry.get("content", "")
            content_len = entry.get("content_len", "")
            qanon_id = entry.get("qanon_id", "")
        except Exception as e:
            # Previously `except: pass`, which let the loop continue with the
            # previous document's fields. Record the error and skip instead.
            save_error(section_id, e)
            continue

        print('ner task --------------> ' + str(count))

        if content_len == 0:
            continue

        try:
            # model_name = 'orgcatorg/xlm-v-base-ner *** learning_rate=0.5e-4 # mini_batch_size = 10 # max_epochs = 10'
            ner_obj_list, content_ai, ner_result = inference_main('orgcatorg/xlm-v-base-ner', content)
            if not ner_result[0]:
                # Store the law id and the id of the current section.
                separator = '*' * 70
                error = f"\nsection_id= {section_id}\nlaw_id= {qanon_id}\nerror_msg= {ner_result[1]}\ncontent= {content}\n{separator}"
                # Details of the sections whose NER step failed.
                save_to_file_by_address("/data/ner_reg_errors.txt", error)
                # Ids of the sections whose NER step failed.
                save_to_file_by_address("/data/ner_reg_list.txt", section_id + '\n')
                continue
            ner_data_list = prepare_data(ner_obj_list)
        except Exception as e:
            save_error(section_id, e)
            # Skip the write: otherwise the previous document's `content_ai`
            # and `ner_data_list` would be indexed under this document's id.
            continue

        data = {
            "qanon_id": qanon_id,
            "content_ai": content_ai,
            "ners_v1": ner_data_list,
        }

        try:
            if is_update_state:
                resp = es.update(index=index_name_o, id=section_id, doc=data)
            else:
                # write_to_json(data, './data/regulations_ner.json')
                resp = es.index(index=index_name_o, id=section_id, document=data)
        except Exception as e:
            save_error(section_id, e)
except Exception as e:
    # Top-level boundary: record whatever aborted the run.
    save_error(section_id, e)

print(" # # # regulations NER finished! # # # ")