ai_dataset/ner_dataset/REF_finder.py
2025-08-20 06:27:36 +03:30

67 lines
1.5 KiB
Python

# In the name of God (translated from the original Arabic header comment)
"""Match NER-dataset references to Elasticsearch section contents by exact text.

Reads the NER dataset JSON, streams section documents from a zipped
Elasticsearch export, and pairs each dataset entry with the first section
whose stripped content is identical to the entry's stripped content.
Matches are written to ``foundfind_refs_list.json``; ids of unmatched
entries go to ``not_found_ids.txt``.
"""
import json

# Raw strings so the Windows-style backslashes are not parsed as escape
# sequences (the originals relied on "\d"/"\m" being invalid escapes).
DATASET_PATH = r".\data\DATASET140402_no_arefـoutput.json"
SECTIONS_PATH = r".\data\mj_qa_section-v02.zip"


def load_ref_list(path):
    """Return the NER dataset: a list of dicts with id/content/ner keys."""
    # Context manager replaces the original never-closed file handle.
    with open(path, "r", encoding="utf8") as fh:
        return json.load(fh)


def collect_sections(path):
    """Return ``[(section_id, stripped_content), ...]`` from the zipped export."""
    # Imported here (not at module level) so the pure matching logic stays
    # importable without the project-local elastic_helper dependency.
    from elastic_helper import ElasticHelper

    eh_obj = ElasticHelper()
    sections = eh_obj.iterateJsonFile(path, True)
    return [(item['id'], item['source']['content'].strip()) for item in sections]


def match_refs(ref_list, sections):
    """Match each dataset entry to the first section with identical content.

    Entries whose ``ner`` list is empty are skipped entirely — they end up
    in neither result list, mirroring the original behaviour.

    Args:
        ref_list: list of dicts with ``id``, ``content`` and ``ner`` keys.
        sections: list of ``(section_id, stripped_content)`` pairs.

    Returns:
        ``(found, not_found)`` where ``found`` is a list of
        ``[section_id, ref_id, content]`` triples and ``not_found`` is a
        list of dataset ids with no matching section.
    """
    # First occurrence wins, exactly like the original linear scan, but a
    # dict lookup makes each search O(1) instead of O(len(sections)).
    content_to_id = {}
    for section_id, content in sections:
        content_to_id.setdefault(content, section_id)

    found, not_found = [], []
    total = len(ref_list)
    for n, item in enumerate(ref_list, start=1):
        ref_id = item['id']
        content = item['content'].strip()
        if item['ner']:  # empty-ner entries are deliberately ignored
            section_id = content_to_id.get(content)
            if section_id is not None:
                found.append([section_id, ref_id, content])
                print(f"REF ID {ref_id} Found ! ... ")
            else:
                not_found.append(ref_id)
        print(f"{n} OF {total} searched ...")
    return found, not_found


def write_outputs(found, not_found):
    """Persist the match triples (JSON) and the unmatched ids (text)."""
    with open("foundfind_refs_list.json", "w", encoding="utf8") as f:
        json.dump(found, f, indent=4, ensure_ascii=False)
    with open("not_found_ids.txt", "w", encoding="utf8") as fh:
        # Write the data to the file (translated from the original comment);
        # writelines replaces the original quadratic string concatenation.
        fh.writelines(f"{id_}\n" for id_ in not_found)


def main():
    """Run the end-to-end reference-matching pipeline."""
    ref_list = load_ref_list(DATASET_PATH)
    sections = collect_sections(SECTIONS_PATH)
    found, not_found = match_refs(ref_list, sections)
    write_outputs(found, not_found)
    print("finish!")


if __name__ == "__main__":
    main()