import datetime

from funcs import save_to_file_by_address, read_file_by_address
from elasticsearch7 import Elasticsearch
from ner_proccess import inference_main

# ##################################
# This script takes the name of the Elasticsearch index that holds the data
# and the name of a new index where the records are stored after processing;
# for each legal text, the named entities are extracted.
# ##################################
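# Overview of the steps below:
#   1. read the ids of previously failed sections from /data/ner_reg_error_ids.txt
#   2. scroll over the not-yet-parsed documents of index_name_i
#   3. run NER inference (inference_main) on each section's content
#   4. store the extracted entities in index_name_o (index or update)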

date = datetime.datetime.now()
print(date)

index_name_i = "semantic_search-v09"  # the Elasticsearch instance available on the GPU machine
# index_name_o = 'mj_qa_test-v01'
# is_update_state = False
index_name_o = "ai_mj_qa_section-v05"
is_update_state = False  # False: index full documents; True: update existing ones in place

# empty body: the output index is created with the default dynamic mapping
mapping_o = ""

es = Elasticsearch(
    "http://127.0.0.1:6900",
    basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
)

try:
    if not es.indices.exists(index=index_name_o):
        response = es.indices.create(index=index_name_o, body=mapping_o)
        # print out the response:
        print("create index response:", response)
except Exception as e:
    print("elastic error:", e)

counter = 0
total = 0
id = ""  # id of the section currently being processed

def es_iterate_all_documents(es, index, pagesize=250, scroll_timeout="25m", **kwargs):
    """
    Helper to iterate over ALL documents of a single index.
    Yields each document together with its id.
    """
    global counter
    global total
    is_first = True
    while True:
        if is_first:  # Initialize scroll
            # only fetch sections that still need parsing: no nlp_parser.type
            # field yet, non-empty content, and parse_state not in {1, 2}
            result = es.search(
                index=index,
                scroll="2m",
                **kwargs,
                size=pagesize,
                body={
                    "query": {
                        "bool": {
                            "must_not": [
                                {"exists": {"field": "nlp_parser.type"}},
                                {"match": {"content_len": 0}},
                                {"match": {"parse_state": 1}},
                                {"match": {"parse_state": 2}}
                            ]
                        }
                    }
                }
            )
            total = result["hits"]["total"]["value"]
            print("total = %d" % total)
            is_first = False
        else:
            # Scroll to the next page
            result = es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        counter += len(hits)
        print("progress -> %.2f %%" % ((counter / total) * 100))
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)

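# A sketch of one item yielded by es_iterate_all_documents (shape assumed from
# the fields consumed in the main loop below; the values are hypothetical):
#   {"id": "q123_s45",
#    "source": {"content": "...", "content_len": 1240, "qanon_id": "q123"}}
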
def prepare_data(ner_obj_list):
    """Convert the NER objects returned by inference_main into the flat dicts stored in Elasticsearch."""
    ner_data_list = []
    for ner_obj in ner_obj_list:
        ner_data = {
            "key": ner_obj['ner_key'],
            "value": ner_obj['ner_value'],
            "begin": ner_obj['ner_start_token'],
            "end": ner_obj['ner_end_token'],
            "score": ner_obj['ner_score']
        }
        ner_data_list.append(ner_data)
    return ner_data_list

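# Assumed shape of each ner_obj handled by prepare_data (inferred from the keys
# read above; the values are hypothetical):
#   {'ner_key': 'ORG', 'ner_value': '...', 'ner_start_token': 12,
#    'ner_end_token': 14, 'ner_score': 0.97}
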
try:
    try:
        # ids of the records that previously failed are stored in the file below
        section_list_text = read_file_by_address('/data/ner_reg_error_ids.txt')
        records = section_list_text.splitlines()

        documents = es_iterate_all_documents(es, index_name_i)

    except Exception as e:
        print('reading from elastic error!')
        date = datetime.datetime.now()
        error = f"error:\ndate: {date}\nerror_message: {e}\n{'#'*70}\n"
        save_to_file_by_address("/data/errors.txt", error)

    count = 0
    novalid = -15000000000

    for mentry in documents:
        try:
            count += 1
            id = mentry["id"]
            if id not in records:
                # only the ids listed in the error file are reprocessed
                print(id + ' is not in the retry list')
                continue
            entry = mentry["source"]
            content = entry.get("content", "")
            content_len = entry.get("content_len", "")
            qanon_id = entry.get("qanon_id", "")
        except Exception:
            # skip records that cannot be read rather than processing stale data
            continue

        print('ner task --------------> ' + str(count))
        # if count > 1000:
        #     break

        if content_len == 0:
            continue

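        # Assumed contract of inference_main (inferred from its use below): it
        # returns (ner_obj_list, content_ai, ner_result), where ner_result is a
        # (success_flag, error_message) pair and 'orgcatorg/xlm-v-base-ner' is
        # the model identifier passed to the NER pipeline.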
        try:
            ner_obj_list, content_ai, ner_result = inference_main('orgcatorg/xlm-v-base-ner', content)
            if not ner_result[0]:
                # save the current law id and regulation (section) id
                separator = '*' * 70
                error = f"\nsection_id= {id}\nlaw_id= {qanon_id}\nerror_msg= {ner_result[1]}\ncontent= {content}\n{separator}"
                # list of the regulations that failed to be added
                save_to_file_by_address("/data/ner_reg_errors.txt", error)
                save_to_file_by_address("/data/ner_reg_list.txt", id + '\n')
                continue
            ner_data_list = prepare_data(ner_obj_list)

            # parse_state = 1
        except Exception as e:
            date = datetime.datetime.now()
            error = f"error:\ndate: {date}\nerror_message: {e}\n{'#'*70}\n"
            save_to_file_by_address("/data/errors.txt", error)
            # ner_data_list / content_ai are undefined on failure, so skip this record
            continue

        data = {
            "qanon_id": qanon_id,
            "content_ai": content_ai,
            "ners_v1": ner_data_list
        }

        eid = id

        try:
            if is_update_state:
                # partially update an existing document in the output index
                resp = es.update(index=index_name_o, id=eid, doc=data)
            else:
                # index (or overwrite) the whole document
                resp = es.index(index=index_name_o, id=eid, document=data)

        except Exception as e:
            date = datetime.datetime.now()
            error = f"error:\ndate: {date}\nerror_message: {e}\n{'#'*70}\n"
            save_to_file_by_address("/data/errors.txt", error)

except Exception as e:
    date = datetime.datetime.now()
    error = f"error:\ndate: {date}\nerror_message: {e}\n{'#'*70}\n"
    save_to_file_by_address("/data/errors.txt", error)

date = datetime.datetime.now()
print(date)
print(" # # # regulations NER finished! # # # ")