Flair_NER/ner_api/do_ner_reg.py

192 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import datetime
from funcs import save_to_file_by_address, read_file_by_address
from elasticsearch7 import Elasticsearch
from ner_proccess import inference_main
# ##################################
# This script takes the name of the Elasticsearch index that holds the source data and the name of the new index where the processed data is stored, and extracts the named entities of every single legal text.
# ##################################
# Start time of the run; printed so the log shows when processing began.
date = datetime.datetime.now()
print(date)

# Source index (the Elasticsearch instance available on the GPU machine).
index_name_i = "semantic_search-v09"
# index_name_o = 'mj_qa_test-v01'
# is_update_state = False
# Destination index that receives the extracted entities.
index_name_o = "ai_mj_qa_section-v05"
# When True the main loop calls es.update() on existing documents instead of
# es.index().
is_update_state = False
# Body passed to indices.create() when the destination index is missing.
mapping_o = ""

# NOTE(review): the credentials are hard-coded in the source; consider loading
# them from the environment or a secrets store.
es = Elasticsearch(
    "http://127.0.0.1:6900",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm"),
)

# Create the destination index on first use. A failure here is reported but
# does not stop the script.
try:
    if not es.indices.exists(index=index_name_o):
        response = es.indices.create(index=index_name_o, body=mapping_o)
        # print out the response:
        print("create index response:", response)
except Exception as exc:
    # Only ordinary errors are caught, so Ctrl-C / SystemExit still stop the
    # script; the cause is printed next to the original message.
    print("elastic error", exc)

# Progress bookkeeping shared with es_iterate_all_documents() through `global`.
counter = 0
total = 0
# Module-level default for the current document id (shadows the builtin `id`).
id = ""
def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="25m", **kwargs):
    """
    Helper to iterate ALL matching documents of a single index via the scroll API.

    The query skips documents that have an ``nlp_parser.type`` field, whose
    ``content_len`` is 0, or whose ``parse_state`` is 1 or 2.

    Args:
        es: Elasticsearch client exposing ``search``, ``scroll`` and
            ``clear_scroll``.
        index: name of the index to scan.
        pagesize: number of hits requested per page.
        scroll_timeout: keep-alive applied to every scroll request, including
            the initial search.
        **kwargs: extra keyword arguments forwarded to the initial ``search``.

    Yields:
        dict: ``{"source": <_source of the hit>, "id": <_id of the hit>}``.

    Side effects:
        Updates the module-level ``counter`` and ``total`` and prints the
        total hit count and a progress percentage for every page.
    """
    global counter
    global total

    query_body = {
        "query": {
            "bool": {
                "must_not": [
                    {"exists": {"field": "nlp_parser.type"}},
                    {"match": {"content_len": 0}},
                    {"match": {"parse_state": 1}},
                    {"match": {"parse_state": 2}}
                ]
            }
        }
    }

    # Progress is counted per scan, so a fresh scan starts from zero.
    counter = 0
    scroll_id = None
    try:
        # The first request uses the same keep-alive as the follow-up scroll
        # calls; a shorter window can expire while the consumer is still busy
        # with the first page.
        result = es.search(
            index=index,
            scroll=scroll_timeout,
            **kwargs,
            size=pagesize,
            body=query_body
        )
        total = result["hits"]["total"]["value"]
        print("total = %d" % total)

        while True:
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            counter += len(hits)
            # Guard against an empty result set (total == 0).
            # NOTE: Elasticsearch may report a capped total unless
            # track_total_hits is requested, so the value can exceed 100.
            percent = (counter / total) * 100 if total else 100.0
            print("progress -> %.2f %%" % percent)
            # Stop after no more docs
            if not hits:
                break
            # Yield each entry
            yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
            # Scroll next
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
    finally:
        # Release the server-side scroll context instead of waiting for the
        # keep-alive to run out; failures here are reported but not fatal.
        if scroll_id is not None:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception as exc:
                print("clear scroll error", exc)
def prepare_data(ner_obj_list):
    """
    Convert raw NER result objects into the compact records stored in the index.

    Each input mapping must provide ``ner_key``, ``ner_value``,
    ``ner_start_token``, ``ner_end_token`` and ``ner_score``; the output holds
    the same values under ``key``, ``value``, ``begin``, ``end`` and ``score``.
    A missing input key raises ``KeyError``.
    """
    # (output field, input field) pairs, in the order the output is built.
    field_pairs = (
        ("key", "ner_key"),
        ("value", "ner_value"),
        ("begin", "ner_start_token"),
        ("end", "ner_end_token"),
        ("score", "ner_score"),
    )
    return [
        {target: item[source] for target, source in field_pairs}
        for item in ner_obj_list
    ]
def _log_exception(exc):
    """Append a timestamped description of ``exc`` to /data/errors.txt."""
    stamp = datetime.datetime.now()
    # The writer is given a ready-made string, like the other call sites of
    # save_to_file_by_address in this script, rather than the exception object.
    message = f"error:\ndate: {stamp}\nerror_message: {exc}\n{'#' * 70}\n"
    save_to_file_by_address("/data/errors.txt", message)


# Main run: stream documents from the source index, extract named entities
# for each one and store the result in the destination index.
try:
    try:
        # Ids of the records that previously hit an error are stored in the file below.
        section_list_text = read_file_by_address('/data/ner_reg_error_ids.txt')
        # A set keeps the per-document membership test cheap.
        records = set(section_list_text.splitlines())
        documents = es_iterate_all_documents(es, index_name_i)
    except Exception as e:
        print(' reading from elastic error! ')
        _log_exception(e)
        # Without the id list or the document stream there is nothing to
        # process, so fall back to an empty run.
        records = set()
        documents = ()

    count = 0
    for mentry in documents:
        count += 1
        try:
            doc_id = mentry["id"]
            # NOTE(review): only ids listed in the error-id file are processed;
            # every other id is skipped with the message below - confirm that
            # this is the intended direction of the check.
            if doc_id not in records:
                print(doc_id + ' exists')
                continue
            entry = mentry["source"]
            content = entry.get("content", "")
            content_len = entry.get("content_len", "")
            qanon_id = entry.get("qanon_id", "")
        except Exception as e:
            # A malformed hit is skipped so that values left over from the
            # previous document are never reused.
            _log_exception(e)
            continue

        print('ner task --------------> ' + str(count))
        # if count > 1000 :
        #     break
        if content_len == 0:
            continue

        try:
            ner_obj_list, content_ai, ner_result = inference_main('orgcatorg/xlm-v-base-ner', content)
            if not ner_result[0]:
                # Record the law id and the id of the current section.
                separator = '*'*70
                error = f"\nsection_id= {doc_id}\nlaw_id= {qanon_id}\nerror_msg= {ner_result[1]}\ncontent= {content}\n{separator}"
                # List of the sections that failed while being added.
                save_to_file_by_address("/data/ner_reg_errors.txt", error)
                save_to_file_by_address("/data/ner_reg_list.txt", doc_id + '\n')
                continue
            ner_data_list = prepare_data(ner_obj_list)
            # parse_state = 1
        except Exception as e:
            # Inference failed: skip the document instead of indexing the
            # previous document's entities under this id.
            _log_exception(e)
            continue

        data = {
            "qanon_id": qanon_id,
            "content_ai": content_ai,
            "ners_v1": ner_data_list
        }
        try:
            if is_update_state:
                es.update(index=index_name_o, id=doc_id, doc=data)
            else:
                es.index(index=index_name_o, id=doc_id, document=data)
        except Exception as e:
            _log_exception(e)
except Exception as e:
    # Anything unexpected (for example a failure while scrolling) ends the run
    # but is still written to the error log.
    _log_exception(e)

date = datetime.datetime.now()
print(date)
print(" # # # regulations NER finished! # # # ")