"""
در این فایل، متن قانون به همراه آبجکتی از موجودیت های نامدار که قبلا مدل تشخیص داده است به عنوان
ورودی در نظر گرفته می شود و خروجی آن، یک دیتاست با فرمت NER خواهد بود

(In this file, the law text together with the named entities previously
detected by the model is taken as input, and the output is a dataset in
NER format.)
"""

import numpy as np

from funcs import save_to_file_by_address, read_from_json

def create_dataset_part(content, ners):
|
||
tokens = content.split()
|
||
token_list = [[item,"O"] for item in tokens]
|
||
np_tokens = np.array(token_list)
|
||
|
||
for ner in ners:
|
||
pass
|
||
begin = ner["begin"]
|
||
end = ner["end"]
|
||
|
||
# تصحیح عنوان نوع موجودیت ها
|
||
key = ner["key"]
|
||
if key == "H_REF":
|
||
key = "Href"
|
||
elif key == "REF":
|
||
key = "Ref"
|
||
elif key == "ORG":
|
||
key = "Org"
|
||
elif key == "DATE":
|
||
key = "Date"
|
||
elif key == "DATE2":
|
||
key = "Date"
|
||
elif key == "DATE3":
|
||
key = "Date"
|
||
|
||
np_tokens[begin][1] = f"B-{key}"
|
||
for index in range(begin+1, end):
|
||
np_tokens[index][1] = f"I-{key}"
|
||
|
||
return np_tokens
|
||
|
||
def create_dataset(all_dataset_parts):
    """Serialize labelled sentences into CoNLL-style ``token label`` text.

    Each sentence becomes one ``"token label"`` line per token; sentences
    are separated by a blank line, so the returned text carries a trailing
    blank line after the last sentence.

    Args:
        all_dataset_parts: Iterable of sentences, each an iterable of
            ``(token, label)`` pairs.

    Returns:
        The whole dataset as a single string.
    """
    lines = []
    for sentence_tokens in all_dataset_parts:
        for token, label in sentence_tokens:
            lines.append(f"{token} {label}\n")
        # Blank line marks the sentence boundary.
        lines.append("\n")
    # One join instead of repeated += avoids quadratic concatenation cost.
    return "".join(lines)
|
||
|
||
# --- Script body: build the NER dataset for the section-110 corpus. ---

sections_110_address = "./data/sections_110_ner.json"
sections_list = read_from_json(sections_110_address)

# Label every section's tokens with its previously-detected entities
# (stored by the model under "ners_v1").
all_sections = [
    create_dataset_part(content=section["content"], ners=section["ners_v1"])
    for section in sections_list
]

# strip() drops the trailing blank line left after the last sentence.
final_dataset = create_dataset(all_sections).strip()

path = "./data/ner_dataset_110.txt"

save_to_file_by_address(path, final_dataset)

print(' operation finished! ')