# ai_dataset/import_data/convert_ner_to_dataset.py
"""
در این فایل، متن قانون به همراه آبجکتی از موجودیت های نامدار که قبلا مدل تشخیص داده است به عنوان
ورودی در نظر گرفته می شود و خروجی آن، یک دیتاست با فرمت
NER
خواهد بود
"""
import numpy as np
from funcs import save_to_file_by_address, read_from_json
def create_dataset_part(content, ners):
    """Convert one section's text plus its entities into IOB-tagged tokens.

    Parameters
    ----------
    content : str
        Raw text of the law section; tokenized by whitespace.
    ners : list[dict]
        Entities with token-index fields ``begin`` (inclusive) and ``end``
        (exclusive for the ``I-`` tags) and a raw ``key`` label.

    Returns
    -------
    numpy.ndarray
        Object array of ``[token, tag]`` pairs with IOB tags
        ("O", "B-<Key>", "I-<Key>").
    """
    # Canonical label names for the raw keys produced by the model.
    key_map = {
        "H_REF": "Href",
        "REF": "Ref",
        "ORG": "Org",
        "DATE": "Date",
        "DATE2": "Date",
        "DATE3": "Date",
    }
    tokens = content.split()
    # dtype=object is essential: the default fixed-width string dtype is
    # sized by the longest initial string, so assigning a longer tag
    # (e.g. "B-Href" into a '<U1' array) would be silently truncated.
    np_tokens = np.array([[token, "O"] for token in tokens], dtype=object)
    for ner in ners:
        begin = ner["begin"]
        end = ner["end"]
        # Fall back to the raw key for any label not in the mapping.
        key = key_map.get(ner["key"], ner["key"])
        np_tokens[begin][1] = f"B-{key}"
        for index in range(begin + 1, end):
            np_tokens[index][1] = f"I-{key}"
    return np_tokens
def create_dataset(all_dataset_parts):
    """Render tagged sentences as CoNLL-style dataset text.

    Parameters
    ----------
    all_dataset_parts : iterable
        Each element is a sequence of ``[token, tag]`` pairs (one sentence).

    Returns
    -------
    str
        One ``"token tag"`` line per token; each sentence block is followed
        by a blank line (including the last one — the caller strips it).
    """
    lines = []
    for sentence_tokens in all_dataset_parts:
        for token, tag in sentence_tokens:
            lines.append(f"{token} {tag}\n")
        # A blank line marks the sentence boundary in the NER format.
        lines.append("\n")
    # Single join instead of repeated += concatenation (quadratic).
    return "".join(lines)
# --- Script entry point: read annotated sections, emit the NER dataset ---
sections_110_address = "./data/sections_110_ner.json"
sections_list = read_from_json(sections_110_address)

all_sections = []
for section in sections_list:
    # "ners_v1" holds the entities predicted for this section.
    np_tokens = create_dataset_part(
        content=section["content"], ners=section["ners_v1"]
    )
    all_sections.append(np_tokens)

# strip() drops the trailing blank line emitted after the final sentence.
final_dataset = create_dataset(all_sections).strip()
path = "./data/ner_dataset_110.txt"
save_to_file_by_address(path, final_dataset)
print(' operation finished! ')