ai_dataset/import_data/convert_ner_to_dataset.py

79 lines
2.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
This file takes a law's text, together with an object of named entities that
the recognition model has previously detected, as input, and produces a
dataset in NER format as output.
"""
import numpy as np
from funcs import save_to_file_by_address, read_from_json
def create_dataset_part(content, ners):
    """Convert one section's text plus its recognized entities into BIO-tagged tokens.

    Parameters
    ----------
    content : str
        Section text; tokens are produced by whitespace splitting.
    ners : list[dict]
        Each dict carries "begin"/"end" token indices (end appears to be
        exclusive, per the range() below — TODO confirm against the model's
        output convention) and a "key" entity-type label.

    Returns
    -------
    numpy.ndarray
        Object array of [token, tag] pairs; tag is "O", "B-<Type>" or "I-<Type>".
    """
    # Normalize the model's entity-type labels to the dataset's naming.
    # Unknown keys fall through unchanged, matching the original if/elif chain.
    key_map = {
        "H_REF": "Href",
        "REF": "Ref",
        "ORG": "Org",
        "DATE": "Date",
        "DATE2": "Date",
        "DATE3": "Date",
    }
    tokens = content.split()
    # dtype=object is essential: a plain np.array of strings gets a fixed-width
    # "<U" dtype sized to the longest token, which silently TRUNCATES tags such
    # as "B-Href" when they are longer than that width.
    np_tokens = np.array([[token, "O"] for token in tokens], dtype=object)
    for ner in ners:
        begin = ner["begin"]
        end = ner["end"]
        key = key_map.get(ner["key"], ner["key"])
        np_tokens[begin][1] = f"B-{key}"
        # Interior tokens of the entity span get the I- (inside) tag.
        for index in range(begin + 1, end):
            np_tokens[index][1] = f"I-{key}"
    return np_tokens
def create_dataset(all_dataset_parts):
    """Render BIO-tagged sentences as CoNLL-style dataset text.

    Parameters
    ----------
    all_dataset_parts : iterable
        Sequences of [token, tag] pairs, one sequence per sentence.

    Returns
    -------
    str
        One "token tag" pair per line; every sentence block (including the
        last) is followed by a blank separator line.
    """
    # Accumulate pieces and join once — the original built the string with
    # repeated += concatenation, which is quadratic on large datasets.
    pieces = []
    for sentence_tokens in all_dataset_parts:
        for token, tag in sentence_tokens:
            pieces.append(f"{token} {tag}\n")
        pieces.append("\n")
    return "".join(pieces)
def main():
    """Read the NER-annotated sections, build the BIO dataset, and write it to disk."""
    # Fixed typo in the original local name ("addresss"); the path itself is unchanged.
    sections_address = "./data/sections_110_ner.json"
    sections_list = read_from_json(sections_address)
    all_sections = []
    for section in sections_list:
        # section["id"] was read but never used in the original; dropped.
        np_tokens = create_dataset_part(content=section["content"], ners=section["ners_v1"])
        all_sections.append(np_tokens)
    # strip() removes the trailing blank separator line after the last sentence.
    final_dataset = create_dataset(all_sections).strip()
    path = "./data/ner_dataset_110.txt"
    save_to_file_by_address(path, final_dataset)
    print(' operation finished! ')


if __name__ == "__main__":
    main()