Compare commits
1 Commit

Author | SHA1 | Date
---|---|---
| a431551d3f |
67 ner_dataset/REF_finder.py Normal file
@@ -0,0 +1,67 @@
# In the name of God

import json

from elastic_helper import ElasticHelper

# Load the reference dataset and the zipped Elasticsearch section export.
Read = open(".\\data\\DATASET140402_no_arefـoutput.json", "r", encoding='utf8')
RefList = json.loads(Read.read())
path = ".\\data\\mj_qa_section-v02.zip"
eh_obj = ElasticHelper()
sections = eh_obj.iterateJsonFile(path, True)

all_ref_list = []
find_refs_list = []
not_find_refs_list = []

# Collect (id, content) pairs for every section in the export.
for index, item in enumerate(sections):
    ref_id = item['id']
    source = item['source']
    content = source['content']
    all_ref_list.append([ref_id, content.strip()])

n = 1
for item in RefList:
    refID2, Content2, ner_list = item['id'], item['content'].strip(), item['ner']
    x = 0
    for refID1, Content in all_ref_list:
        if len(ner_list) == 0:
            x = 1
            continue
        else:
            if Content2 == Content and x == 0:
                find_refs_list.append([refID1, refID2, Content])
                print(f"REF ID {refID2} Found ! ... ")
                x = 1

    if x == 0:
        not_find_refs_list.append(refID2)

    print(f"{n} OF {len(RefList)} searched ...")
    n += 1

with open("foundfind_refs_list.json", "w", encoding="utf8") as f:
    json.dump(find_refs_list, f, indent=4, ensure_ascii=False)

txt = ''
for id_ in not_find_refs_list:
    txt += f"{id_}\n"

with open("not_found_ids.txt", "w", encoding="utf8") as file:
    # write the data to the file
    file.write(txt)

print("finish!")
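
For context (not part of the commit): REF_finder.py reads item['id'], item['content'], and item['ner'] from each dataset record, so the JSON it loads is presumably a list of objects shaped roughly as in the sketch below; the concrete values are purely illustrative.

    # Hypothetical record shape inferred from the fields REF_finder.py accesses.
    sample_ref = {
        "id": 12345,       # dataset reference id (illustrative value)
        "content": "...",  # Persian section text compared against the export
        "ner": [],         # an empty NER list makes the script set x = 1 and skip matching
    }
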
BIN ner_dataset/__pycache__/elastic_helper.cpython-313.pyc Normal file
Binary file not shown.
BIN ner_dataset/data/mj_qa_section-v02.zip Normal file
Binary file not shown.
677 ner_dataset/elastic_helper.py Normal file
@@ -0,0 +1,677 @@
import zipfile
import sys
import os
import json
from time import sleep
from elasticsearch import Elasticsearch, helpers


class ElasticHelper():

    counter = 0
    total = 0
    id = ""
    path_mappings = os.getcwd() + '/repo/_other/'

    # def __init__(self, es_url="http://127.0.0.1:6900", es_pass="", es_user="elastic", path_mappings = ""):
    #
    #     if path_mappings :
    #         self.path_mappings = path_mappings
    #
    #     if es_pass == '' :
    #         self.es = Elasticsearch(es_url)
    #     else:
    #         self.es = Elasticsearch(
    #             es_url,
    #             http_auth=(es_user, es_pass),
    #         )
    #
    #     print(es_url)
    #     print(self.es)
    #
    #     self.success_connect = False
    #     for a in range(0, 10):
    #         try:
    #             if not self.es.ping():
    #                 print('elastic not ping, sleep 30 s : ', a)
    #                 sleep(5)
    #                 continue
    #             else:
    #                 self.success_connect = True
    #                 break
    #         except Exception as e:
    #             break
    #     if not self.success_connect:
    #         print('******', 'not access to elastic service')
    #         return
    #
    #     self.counter = 0
    #     self.total = 0
    #     self.id = ""

    def get_doctument(self, index_name, id):
        res = self.es.get(index=index_name, id=id)
        return res

    def exist_doctument(self, index_name, id):
        res = self.es.exists(index=index_name, id=id)
        return res

    def update_index_doc(self, is_update_state, index_name_o, eid, data):
        if is_update_state:
            resp = self.es.update(index=index_name_o, id=eid, doc=data)
            # resp = self.es.update(index=index_name_o, id=eid, body={'doc': data})
        else:
            resp = self.es.index(index=index_name_o, id=eid, document=data)
        return resp

    def exportToJsonForAI(self, path_back, index_name, out_name='', body={}, fields=[]):
        print('*' * 50, ' start backup -->', index_name)
        self.counter = 0
        sid = None

        out = out_name
        if out_name == '':
            out = index_name

        fout = open(path_back + "/" + out + '.json', 'a+', encoding='utf-8')

        s_res = self.es.search(
            index=index_name,
            scroll='5m',
            size=1000,
            body=body
        )
        self.total = s_res["hits"]["total"]['value']

        print('start index = %s' % index_name)
        print('total = %d' % self.total)

        sid = s_res['_scroll_id']
        scroll_size = len(s_res['hits']['hits'])
        file_count = 1
        out_json = []
        while scroll_size > 0:
            "Scrolling..."
            self.counter += scroll_size
            print("progress -> %.2f %%" % ((self.counter / self.total) * 100))
            #############################
            for item in s_res['hits']['hits']:

                if fields:
                    item2 = {}
                    item2['id'] = item['_id']
                    for kf in fields:
                        # print(kf)
                        if kf in item['_source']:
                            # print(item['_source'][kf])
                            item2[kf] = item['_source'][kf]
                            # exit()
                else:
                    item2 = item

                out_json.append(item2)

            s_res = self.es.scroll(scroll_id=sid, scroll='2m', request_timeout=100000)
            sid = s_res['_scroll_id']
            scroll_size = len(s_res['hits']['hits'])

        sid = None
        text = json.dumps(out_json, ensure_ascii=False)
        fout.write(text)

        ##############################

    def backupIndexToZipfile(self, path_back, index_name, out_name='', body={}, byzip=True, fields=[], noFields=[]):
        print('*' * 50, ' start backup -->', index_name)
        self.counter = 0
        sid = None

        out = out_name
        if out_name == '':
            out = index_name

        if body == {}:
            s_res = self.es.search(
                index=index_name,
                scroll='5m',
                size=1000
            )
        else:
            s_res = self.es.search(
                index=index_name,
                scroll='5m',
                size=1000,
                body=body
            )

        self.total = s_res["hits"]["total"]['value']
        if self.total == 0:
            print('total index_name by query = %d' % self.total)
            return False

        if byzip:
            fout = zipfile.ZipFile(path_back + "/" + out + '.zip', 'w')
        else:
            fout = open(path_back + "/" + out + '.json', 'a+', encoding='utf-8')

        print('start index = %s' % index_name)
        print('total = %d' % self.total)

        sid = s_res['_scroll_id']
        scroll_size = len(s_res['hits']['hits'])
        file_count = 1
        while scroll_size > 0:
            "Scrolling..."
            self.counter += scroll_size
            print("progress -> %.2f %%" % ((self.counter / self.total) * 100))
            #############################
            out_json = []
            for item in s_res['hits']['hits']:
                if fields:
                    item2 = {}
                    item2['id'] = item['_id']
                    item2['_source'] = {}
                    for kf in fields:
                        if kf in item['_source']:
                            item2['_source'][kf] = item['_source'][kf]
                else:
                    item2 = item

                if noFields:
                    for kf in noFields:
                        if kf in item2['_source']:
                            del item2['_source'][kf]

                out_json.append(item2)

            text = json.dumps(out_json, ensure_ascii=False)
            out_json = []
            if byzip:
                filename = out + str(file_count) + '.json'
                file_count += 1
                fout.writestr(filename, text.encode('utf-8'), zipfile.ZIP_DEFLATED)
            else:
                fout.write(text)

            ##############################
            s_res = self.es.scroll(scroll_id=sid, scroll='2m', request_timeout=100000)
            sid = s_res['_scroll_id']
            scroll_size = len(s_res['hits']['hits'])
        sid = None
        fout.close()

    def restorFileToElastic(self, path_back, index_name, app_key='', queryDelete=True, map_name=''):
        if not os.path.exists(path_back):
            print(' **** error *** path not exist: ', path_back)
            return False

        file_path = path_back + '/' + index_name + '.zip'
        if not os.path.exists(file_path):
            return False

        if queryDelete:
            # if the index already exists, ask the user whether to delete it
            if self.deleteIndex(index_name):
                self.createIndex(index_name, app_key, map_name)
                self.zipFileToElastic(file_path, index_name)
        else:  # if it already exists, it is skipped and nothing is done
            self.createIndex(index_name, app_key, map_name)
            self.zipFileToElastic(file_path, index_name)

    def restorFileToElastic2(self, path_file, index_name, app_key='', queryDelete=True, map_name=''):
        if not os.path.exists(path_file):
            print(' **** error *** path not exist: ', path_file)
            return False

        file_path = path_file
        if not os.path.exists(file_path):
            return False

        if queryDelete:
            # if the index already exists, ask the user whether to delete it
            if self.deleteIndex(index_name):
                self.createIndex(index_name, app_key, map_name)
                self.zipFileToElastic(file_path, index_name)
        else:  # if it already exists, it is skipped and nothing is done
            self.createIndex(index_name, app_key, map_name)
            self.zipFileToElastic(file_path, index_name)

    def renameElasticIndex(self, index_name_i, index_name_o, app_key='', map_name=''):

        if self.createIndex(index_name_o, app_key, map_name):
            res = self.es.reindex(
                body={
                    "source": {"index": index_name_i},
                    "dest": {"index": index_name_o}
                },
                wait_for_completion=False)

            print(type(res))
            print(res)

            taskid = res["task"] if res["task"] else ""
            # tasks = client.TasksClient(self.es)
            tasks = self.es.tasks
            while True:
                res = tasks.get(task_id=taskid)
                if res["completed"]:
                    break

                # print(res["task"])
                print('----', index_name_o, ' imported : ', res["task"]["status"]["total"], ' / ', res["task"]["status"]["created"])
                sleep(1)
            print('----', index_name_o, ' completed')

    def deleteIndex(self, index_name):
        if not self.es.indices.exists(index=index_name):
            print(' ' * 10, " for delete NOT exist index :", index_name)
            return True

        question = 'Is DELETE elastic index (' + index_name + ') ? '
        if self.query_yes_no(question):
            self.es.indices.delete(index=index_name)
            print('%' * 10, " Finish DELETE index :", index_name)
            return True
        else:
            return False

    def query_yes_no(self, question, default="no"):
        valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
        if default is None:
            prompt = " [y/n] "
        elif default == "yes":
            prompt = " [Y/n] "
        elif default == "no":
            prompt = " [y/N] "
        else:
            raise ValueError("invalid default answer: '%s'" % default)

        while True:
            print('%' * 10, ' question ', '%' * 10, '\n')
            sys.stdout.write(question + prompt)
            choice = input().lower()
            if default is not None and choice == "":
                return valid[default]
            elif choice in valid:
                return valid[choice]
            else:
                sys.stdout.write("Please enter one of the following: 'yes' or 'no' (or 'y' or 'n').\n")

    def createIndexIfNotExist(self, index_name_o, mapping_o=""):
        try:
            if not self.es.indices.exists(index=index_name_o):
                response = self.es.indices.create(index=index_name_o, body=mapping_o)
                # print out the response:
                print("create index response:", response)
        except:
            print("....... index exist ! ... not created")

    def createIndex(self, index_name, app_key='', map_name=''):

        path_base = self.path_mappings
        path_mapping1 = path_base + 'general/'
        if app_key == '':
            app_key = 'tavasi'
        path_mapping2 = path_base + app_key + '/'

        if map_name == '':
            map_name = index_name

        if self.es.indices.exists(index=index_name):
            print("============== exist index :", index_name)
            return True

        if map_name == 'mj_rg_section' or map_name == 'semantic_search':
            map_name = 'mj_qa_section'
        elif map_name[-3:] == '_ai':
            # strip the trailing '_ai' suffix to fall back to the base mapping name
            map_name = map_name[:-3]
        print(map_name)

        mapping_file_path = path_mapping1 + map_name + '.json'
        print("mapping_file_path : ", mapping_file_path)
        if not os.path.isfile(mapping_file_path):
            mapping_file_path = path_mapping2 + map_name + '.json'

        print("mapping_file_path : ", mapping_file_path)

        # Create Index With Mapping
        if os.path.isfile(mapping_file_path):
            mapping_file = open(mapping_file_path, 'r', encoding='utf-8')
            mapping_file_read = mapping_file.read()
            mapping_data = json.loads(mapping_file_read)
            mapping_file.close()
            if self.es.indices.exists(index=index_name):
                print("============== exist index :", index_name)
            else:
                self.es.indices.create(index=index_name, body=mapping_data)
            return True
        else:
            print('*** error not find mapping file elastic : *******', mapping_file_path)
            return False

    def updateBulkList(self, listData, index_name):
        chunk_size = 1000
        raise_on_error = False
        raise_on_exception = False
        stats_only = True
        yield_ok = False

        actions = []
        for item in listData:
            actions.append({
                "_op_type": "update",
                "_index": index_name,
                "_id": item['_id'],
                "doc": item['_source']
            })
        helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok)

    def importBulkList(self, listData, index_name):
        chunk_size = 100000
        raise_on_error = False
        raise_on_exception = False
        stats_only = True
        yield_ok = False

        for item in listData:
            actions = [{
                "_op_type": "index",
                "_index": index_name,
                "_id": item['_id'],
                "_source": item['_source']
            }]
            helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok)

    def importJsonDataToElastic(self, jsonData, index_name, fields=[]):
        chunk_size = 1000
        raise_on_error = False
        raise_on_exception = False
        stats_only = True
        yield_ok = False

        actions = []

        for item in jsonData:
            id = item['_id'] if item['_id'] else item['id']
            source = item['_source']
            if fields:
                source = {}
                for col in fields:
                    if col in item['_source']:
                        source[col] = item['_source'][col]

            actions.append({
                "_op_type": "index",
                "_index": index_name,
                "_id": id,
                "_source": source
            })
        helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok)

    def fileToElastic(self, file_path, index_name, limit_pack=-1, fields=[]):
        if not os.path.exists(file_path):
            print("file zip:", file_path, " not exist")
            return
        print("index:", index_name, '=>', file_path)
        self.counter = 0
        with open(file_path) as file:
            data = json.loads(file.read())
            self.importJsonDataToElastic(data, index_name, fields)

        self.es.indices.refresh(index=index_name)
        print(self.es.cat.count(index=index_name, format="json"))

    def zipFileToElastic(self, file_path, index_name, limit_pack=-1, fields=[]):
        if not os.path.exists(file_path):
            print("file zip:", file_path, " not exist for import to elastic : ", index_name)
            return

        fileNo = 0
        with zipfile.ZipFile(file_path, 'r') as zObject:
            fileNo += 1
            print("=" * 10, " zip fileNo: ", fileNo, " - ( ", index_name, " ) | File Numbers:", len(zObject.namelist()), "=" * 10)

            packNo = 0
            self.counter = 0
            for filename in zObject.namelist():
                packNo += 1
                if limit_pack != -1:
                    if packNo > limit_pack:
                        print('limit_data ', index_name, ' ', limit_pack)
                        break

                print("index:", index_name, '=>', filename)
                with zObject.open(filename) as file:
                    data = json.loads(file.read())
                    self.importJsonDataToElastic(data, index_name, fields)

        self.es.indices.refresh(index=index_name)
        print(self.es.cat.count(index=index_name, format="json"))
        print(" END Of Import to elastic ", index_name, "\n")

    def iterateJsonFile(self, file_path, isZip=True, limit_pack=-1):
        if not os.path.exists(file_path):
            print("file zip:", file_path, " not exist iterateJsonFile ")
            return

        if isZip:
            fileNo = 0
            with zipfile.ZipFile(file_path, 'r') as zObject:
                fileNo += 1
                print("=" * 10, " zip fileNo: ", fileNo, " iterateJsonFile - | File Numbers:", len(zObject.namelist()), "=" * 10)

                packNo = 0
                self.counter = 0
                for filename in zObject.namelist():
                    packNo += 1
                    if limit_pack != -1:
                        if packNo > limit_pack:
                            print('limit_data iterateJsonFile ', limit_pack)
                            break

                    print("index iterateJsonFile :", '=>', filename)
                    with zObject.open(filename) as file:
                        data = json.loads(file.read())
                        # Yield each entry
                        # yield data
                        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in data)
        else:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.loads(file.read())
                # Yield each entry
                # yield from (hit for hit in data)
                # return data
                yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in data)

    def es_iterate_all_documents(self, index, body="", pagesize=250, scroll_timeout="25m", **kwargs):
        """
        Helper to iterate ALL values from a single index
        Yields all the documents.
        """
        is_first = True
        while True:
            # Scroll next
            if is_first:  # Initialize scroll
                # result = self.es.search(index=index, scroll="2m", **kwargs, body={
                #     "size": pagesize
                # })
                if body:
                    result = self.es.search(
                        index=index,
                        scroll=scroll_timeout,
                        **kwargs,
                        size=pagesize,
                        body=body
                    )
                else:
                    result = self.es.search(
                        index=index,
                        scroll=scroll_timeout,
                        **kwargs,
                        size=pagesize
                    )

                self.total = result["hits"]["total"]["value"]
                if self.total > 0:
                    print("total = %d" % self.total)
                is_first = False
            else:
                # result = es.scroll(body={
                #     "scroll_id": scroll_id,
                #     "scroll": scroll_timeout
                # })
                result = self.es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)

            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            self.counter += len(hits)
            if self.total > 0:
                print("progress -> %.2f %%" % ((self.counter / self.total) * 100))
            # Stop after no more docs
            if not hits:
                break
            # Yield each entry
            yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)

    def moveCustomFileds(self, index_name_i, index_name_o, fields=[], renameFileds={}):
        try:
            body = {}
            list = []
            try:
                list = self.es_iterate_all_documents(index_name_i)
            except Exception as e:
                print(e)

            count = 0
            for mentry in list:
                count += 1

                entry = mentry["source"]
                id = mentry["id"]
                # print(id)
                eid = id

                if (count % 100) == 0:
                    print("%s -> %.2f " % (id, (count / self.total) if self.total > 0 else 0))

                data_filled = False
                data = {}
                for col in fields:

                    if '.' in col:
                        cols = col.split('.')
                        subsource = entry
                        for sub in cols:
                            dCol = subsource.get(sub, None)
                            if dCol:
                                subsource = dCol
                            else:
                                break
                    else:
                        dCol = entry.get(col, None)

                    if dCol is None:
                        continue

                    if col in renameFileds:
                        data[renameFileds[col]] = dCol
                    else:
                        data[col] = dCol

                    data_filled = True

                if not data_filled:
                    continue

                try:
                    resp = self.update_index_doc(True, index_name_o, eid, data)
                except Exception as e:
                    print(e)
                    # save_error(id, e)

        except Exception as e:
            # print("1111")
            print(e)
            # save_error(id, e)

    def mappingIndex(self, index_name_i):
        # Mappings can only be changed through Kibana;
        # it cannot be done from Python.
        # Instead, create a new index with the desired mapping and reindex into it.
        pass

    def updateByQueryIndex(self, index_name_i, body):
        ## sample
        # body = {
        #     "script": {
        #         "inline": "ctx._source.Device='Test'",
        #         "lang": "painless"
        #     },
        #     "query": {
        #         "match": {
        #             "Device": "Boiler"
        #         }
        #     }
        # }
        try:
            self.es.update_by_query(body=body, index=index_name_i)

        except Exception as e:
            print(e)
            # save_error(id, e)

    def deleteByQueryIndex(self, index_name_i, body):
        ## sample
        # body = {
        #     "query": {
        #         "match": {
        #             "Device": "Boiler"
        #         }
        #     }
        # }
        try:
            self.es.delete_by_query(index=index_name_i, body=body)

        except Exception as e:
            print(e)
            # save_error(id, e)

    def delete_by_ids(self, index_name_i, ids):
        try:
            # ids = ['test1', 'test2', 'test3']

            query = {"query": {"terms": {"_id": ids}}}
            res = self.es.delete_by_query(index=index_name_i, body=query)
            print(res)

        except Exception as e:
            print(e)
            # save_error(id, e)
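
Usage note (not part of the commit): iterateJsonFile is the only ElasticHelper method the scripts in this compare actually call, and it never touches self.es, so it works even though the constructor that connects to Elasticsearch is commented out. A minimal sketch of streaming records out of the bundled zip export, assuming the zip sits at the path used by REF_finder.py:

    from elastic_helper import ElasticHelper

    eh = ElasticHelper()
    # Each yielded item has the shape {"id": ..., "source": {...}}.
    for item in eh.iterateJsonFile(".\\data\\mj_qa_section-v02.zip", True):
        print(item["id"], len(item["source"]["content"]))
        break  # peek at the first record only in this sketch
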
334 ner_dataset/find_law.py Normal file
@@ -0,0 +1,334 @@
# In the name of God


from elastic_helper import ElasticHelper
from thefuzz import fuzz
import json


Read = open('.\\data\\DATASET140402_no_arefـoutput.json', "r", encoding='utf8')
RefList = json.loads(Read.read())
path = ".\\data\\mj_qa_section-v02.zip"
eh_obj = ElasticHelper()
sections = eh_obj.iterateJsonFile(path, True)

# The ids file apparently stores one id on every other line, so read the alternating lines.
no_found_id = []
txt_file = open(".\\no_find_txt.txt", "r", encoding="utf8")
n = 0
for line in txt_file:
    if n != 0:
        no_found_id.append(int(line.strip()))
        n = 0
        continue
    n = 1

all_law_dict = []
for index, item in enumerate(sections):
    ref_id = item['id']
    source = item['source']
    content = source['content'].strip()
    all_law_dict.append({"id": ref_id, "caption": content, "approve_date": source['ts_date']})


def law_dict_saver(law_id, start_token_index, end_token_index, found_law_list, law_captions, matched_string, original_string, multi_flag):

    dict = {
        "law_id": law_id,
        "start_token_index": start_token_index,
        "end_token_index": end_token_index,
        "found_law_list": found_law_list,
        "law_captions": law_captions,
        "matched_string": matched_string,
        "original_string": original_string,
        "multi_flag": multi_flag
    }
    return dict


def remove_latest_added_token(text):
    temp = text.strip().split(' ')
    temp.pop()
    text = ''
    for token in temp:
        text = text + ' ' + token

    return text.strip()


def law_recognizer(text, law_dict):

    i = 0

    normalized_content = text
    text_token_list = normalized_content.strip().split()
    matched_token_index_list = []

    # collect possible law titles in the text based on the keyword "قانون" (law)
    for index, token in enumerate(text_token_list):
        if 'قانون' in token:
            matched_token_index_list.append(index)

    content_token_list = []
    law_token_list = []
    for index, item in enumerate(matched_token_index_list):
        # if the item is not the last element of the array ...

        end = 12  # how many tokens to inspect, i.e. how many times the matching loop runs

        if item < len(text_token_list):
            # store the next nine tokens as possible complementary phrases of the law title
            if item + end < len(text_token_list):
                for i in range(end):
                    if item + (i + 1) >= len(text_token_list):
                        break
                    content_token_list.append(text_token_list[item + (i + 1)])
                i = 0
            # otherwise store the remaining tokens (fewer than nine) up to the end of the array
            else:
                j = 0
                while j < len(text_token_list) - index:
                    if item + (j + 1) >= len(text_token_list) - index:
                        break
                    content_token_list.append(text_token_list[item + (j + 1)])
                    j += 1
                j = 0
            law_token_list.append({
                'start_token_index': item,
                'law_token': content_token_list
            })

            if len(content_token_list) < end:  # if the number of words chosen for inspection exceeds the sentence length
                end = len(content_token_list)  # shrink the inspection window to the number of available tokens

            content_token_list = []
    matched_law_list = []
    c = 0

    for key, law_value in enumerate(law_token_list):
        c += 1
        law_token = law_value['law_token']
        start_token_index = law_value['start_token_index']
        end_token_index = 0
        found_law_list_1 = []
        found_law_list_2 = []
        found_law_list_3 = []
        # if the number of tokens of the candidate law title is zero,
        # leave the loop and move on to checking the next law title
        if len(law_token) < 1:
            break

        # first, look up the first token of the candidate phrase in the law captions stored in the database;
        # in the following steps, up to nine rounds, tokens are appended one by one and compared against the captions
        law_section = law_token[0]
        for index, value in enumerate(law_dict):
            # the law caption we are currently comparing the text against
            id = value['id']
            current_caption = value['caption']
            current_approve_date = value['approve_date']
            # check whether the candidate phrase occurs in the law caption
            if current_caption.__contains__(law_section):
                # get the first token of the law caption
                current_law_first_token = current_caption.strip().split(' ')[0]
                # if the first token of the caption is the word "قانون" (law), skip it,
                # because the candidate token lists do not include the word "قانون" itself
                if current_law_first_token == 'قانون':
                    current_law_first_token = current_caption.strip().split(' ')[1]
                if law_section == current_law_first_token:
                    # if the substring occurs in the caption, store that caption in a list;
                    # in the next step the candidate text is compared against this narrower list
                    found_law_list_1.append({"id": id, "caption": current_caption, "approve_date": current_approve_date})
            else:
                continue


        X = 0
        FoundLawList = []
        OldFoundLawList = []
        NewFoundLawList = []
        while X < end - 1:
            # for x in range(end):


            X += 1
            if X == 1:  # the first token is handled by this branch
                if len(found_law_list_1) == 0:
                    # X = X+1
                    continue
                else:
                    # X = X+1
                    if len(found_law_list_1) == 1:
                        found_law = []
                        found_law.append(found_law_list_1.pop())
                        k = 0
                        matched_string = ''
                        found_law_caption = found_law[0]['caption'].strip()
                        if found_law_caption.startswith('قانون'):
                            found_law_caption = found_law_caption[5:]
                        found_law_caption_tokens = found_law_caption.strip().split()
                        for k in range(len(law_token)):
                            if k >= len(found_law_caption_tokens):
                                break
                            if law_token[k] == found_law_caption_tokens[k]:
                                matched_string += law_token[k] + ' '
                            else:
                                end_token_index = start_token_index + len(matched_string.strip().split())
                                found_law_dict = law_dict_saver(found_law[0]['id'], start_token_index, end_token_index, found_law, found_law[0]['caption'], matched_string.strip(), law_token, False)
                                matched_law_list.append(found_law_dict)

                                break

                        end_token_index = start_token_index + len(matched_string.strip().split())
                        found_law_dict = law_dict_saver(found_law[0]['id'], start_token_index, end_token_index, found_law, found_law[0]['caption'], matched_string.strip(), law_token, False)
                        matched_law_list.append(found_law_dict)

                        continue

                    if len(law_token) < 2:
                        continue

                    law_section = law_token[0] + ' ' + law_token[1]
                    for value in found_law_list_1:
                        id = value['id']
                        current_caption = value['caption']
                        current_approve_date = value['approve_date']
                        rate = fuzz.token_set_ratio(current_caption, law_section)
                        if rate == 100:
                            found_law_list_2.append({"id": id, "caption": current_caption, "approve_date": current_approve_date})

                    FoundLawList = found_law_list_1
                    NewFoundLawList = found_law_list_2
                    continue

            OldFoundLawList = FoundLawList
            FoundLawList = NewFoundLawList
            NewFoundLawList = []

            if X == int(end - 1):  # the last word is handled by this branch
                if len(FoundLawList) == 0:
                    # more than one match was found in the previous step, but now the number of matches has dropped to zero
                    if len(OldFoundLawList) > 1 and len(OldFoundLawList) < 6:
                        # review this case carefully
                        # sort from the oldest to the newest approval date
                        sorted_found_law_list = sorted(OldFoundLawList, key=lambda x: x['approve_date'])
                        found_law = sorted_found_law_list.pop()
                        end_token_index = start_token_index + len(law_section.strip().split())
                        # the most recently appended token has to be dropped again,
                        # because no law caption was found once that token is included
                        law_section = remove_latest_added_token(law_section)
                        found_law_dict = law_dict_saver(found_law['id'], start_token_index, end_token_index, sorted_found_law_list, found_law['caption'], law_section, law_token, True)
                        matched_law_list.append(found_law_dict)
                    continue
                else:
                    if len(FoundLawList) == 1:
                        sorted_found_law_list = sorted(FoundLawList, key=lambda x: x['approve_date'])
                        found_law = []
                        found_law.append(FoundLawList.pop())
                        end_token_index = start_token_index + len(law_section.strip().split())
                        found_law_dict = law_dict_saver(found_law[0]['id'], start_token_index, end_token_index, found_law, found_law[0]['caption'], law_section, law_token, False)
                        matched_law_list.append(found_law_dict)

                    elif len(FoundLawList) > 1 and len(FoundLawList) < 6:
                        sorted_found_law_list = sorted(OldFoundLawList, key=lambda x: x['approve_date'])
                        found_law = sorted_found_law_list.pop()
                        end_token_index = start_token_index + len(law_section.strip().split())
                        found_law_dict = law_dict_saver(found_law['id'], start_token_index, end_token_index, FoundLawList, found_law['caption'], law_section, law_token, True)
                        matched_law_list.append(found_law_dict)
                    break

            if len(FoundLawList) == 0:
                # more than one match was found in the previous step, but now the number of matches has dropped to zero
                if len(OldFoundLawList) > 1 and len(OldFoundLawList) < 6:
                    # review this case carefully
                    # sort from the oldest to the newest approval date
                    sorted_found_law_list = sorted(OldFoundLawList, key=lambda x: x['approve_date'])
                    found_law = sorted_found_law_list.pop()
                    end_token_index = start_token_index + len(law_section.strip().split())
                    # the most recently appended token has to be dropped again,
                    # because no law caption was found once that token is included
                    law_section = remove_latest_added_token(law_section)
                    found_law_dict = law_dict_saver(found_law['id'], start_token_index, end_token_index, sorted_found_law_list, found_law['caption'], law_section, law_token, True)
                    matched_law_list.append(found_law_dict)
                continue
            else:
                if len(FoundLawList) == 1:
                    found_law = []
                    found_law.append(FoundLawList.pop())  # = found_law_list_2.pop()
                    end_token_index = start_token_index + len(law_section.strip().split())
                    found_law_dict = law_dict_saver(found_law[0]['id'], start_token_index, end_token_index, found_law, found_law[0]['caption'], law_section, law_token, False)
                    matched_law_list.append(found_law_dict)
                    # if the caption search has already narrowed down to a single unique match, only this one caption is stored
                    continue

            if len(law_token) < X + 1:
                continue
            law_section += ' ' + law_token[X]  # X = new token
            # law_section = List_Law_tokens
            for value in FoundLawList:
                id = value['id']
                current_caption = value['caption']
                current_approve_date = value['approve_date']
                rate = fuzz.token_set_ratio(current_caption, law_section)

                if rate == 100:

                    NewFoundLawList.append({"id": id, "caption": current_caption, "approve_date": current_approve_date})

            # OldFoundLawList = FoundLawList
            # FoundLawList = NewFoundLawList
            # NewFoundLawList = []

    if matched_law_list:
        for law_item in matched_law_list:
            temp_list = []
            found_list = law_item['found_law_list']
            for item in found_list:
                temp_list.append(item['caption'] + '#' + str(item['id']) + '#' + item['approve_date'])

            law_item['found_law_list'] = temp_list
    return matched_law_list, law_token_list


n = 0
not_found_ids = []
all_laws_founded = []
for section in RefList:

    refID, Content, ner_list = section['id'], section['content'].strip(), section['ner']
    print(f"ID {refID} is searching... ")
    if refID in no_found_id:
        matched_law_list, law_token_list = law_recognizer(Content, all_law_dict)
        matched_law_list_ids = []
        matched_law_list_content = []
        for law in matched_law_list:
            matched_law_list_ids.append(law['law_id'])
            matched_law_list_content.append(law['law_captions'])
        if len(matched_law_list) != 0:
            n += 1
            all_laws_founded.append({"dataset-REF": {"id": refID, "content": Content},
                                     "All-REF": {"id": matched_law_list_ids, "content": matched_law_list_content}})

        else:
            not_found_ids.append(refID)


txt = ''
for id_ in not_found_ids:
    txt += f"{id_}\n"

with open("not_found_idsX.txt", "w", encoding="utf8") as file:
    # write the data to the file
    file.write(txt)

with open("founded_lawsX.json", "w", encoding="utf8") as f:
    json.dump(all_laws_founded, f, indent=4, ensure_ascii=False)

print(f"{n} Laws Found ! ")
print(f"{len(not_found_ids)} Laws Not Found ! ")
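
Side note on the matching criterion (not part of the commit): law_recognizer treats fuzz.token_set_ratio(...) == 100 as "the growing candidate phrase is still contained in the law caption", which works because thefuzz's token_set_ratio returns 100 whenever the token set of one string is a subset of the other's. A small illustration with a hypothetical caption:

    from thefuzz import fuzz

    caption = "قانون مدیریت خدمات کشوری"   # hypothetical law caption
    phrase = "مدیریت خدمات"                # candidate phrase built token by token

    print(fuzz.token_set_ratio(caption, phrase))           # 100: phrase tokens are a subset of the caption tokens
    print(fuzz.token_set_ratio(caption, "مدیریت بحران"))   # below 100: "بحران" does not appear in the caption
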
1413 ner_dataset/normalizer.py Normal file
File diff suppressed because it is too large