79 lines
2.1 KiB
Python
79 lines
2.1 KiB
Python
|
"""
|
|||
|
در این فایل، متن قانون به همراه آبجکتی از موجودیت های نامدار که قبلا مدل تشخیص داده است به عنوان
|
|||
|
ورودی در نظر گرفته می شود و خروجی آن، یک دیتاست با فرمت
|
|||
|
NER
|
|||
|
خواهد بود
|
|||
|
"""
|
|||
|
|
|||
|
import numpy as np
|
|||
|
from funcs import save_to_file_by_address, read_from_json
|
|||
|
|
|||
|
def create_dataset_part(content, ners):
|
|||
|
tokens = content.split()
|
|||
|
token_list = [[item,"O"] for item in tokens]
|
|||
|
np_tokens = np.array(token_list)
|
|||
|
|
|||
|
for ner in ners:
|
|||
|
pass
|
|||
|
begin = ner["begin"]
|
|||
|
end = ner["end"]
|
|||
|
|
|||
|
# تصحیح عنوان نوع موجودیت ها
|
|||
|
key = ner["key"]
|
|||
|
if key == "H_REF":
|
|||
|
key = "Href"
|
|||
|
elif key == "REF":
|
|||
|
key = "Ref"
|
|||
|
elif key == "ORG":
|
|||
|
key = "Org"
|
|||
|
elif key == "DATE":
|
|||
|
key = "Date"
|
|||
|
elif key == "DATE2":
|
|||
|
key = "Date"
|
|||
|
elif key == "DATE3":
|
|||
|
key = "Date"
|
|||
|
|
|||
|
np_tokens[begin][1] = f"B-{key}"
|
|||
|
for index in range(begin+1, end):
|
|||
|
np_tokens[index][1] = f"I-{key}"
|
|||
|
|
|||
|
return np_tokens
|
|||
|
|
|||
|
def create_dataset(all_dataset_parts):
    """Serialize labelled token tables into one dataset string.

    Each token is written on its own line as ``"<token> <label>"`` and the
    tokens of every part are followed by one empty line.

    Args:
        all_dataset_parts: Iterable of token tables; each table is an
            iterable of pairs where index 0 is the token and index 1 is its
            label.

    Returns:
        The dataset text. An empty input produces an empty string.
    """
    # Collect the pieces and join once instead of growing a string in a loop.
    lines = []
    for sentence_tokens in all_dataset_parts:
        lines.extend(f"{token[0]} {token[1]}\n" for token in sentence_tokens)
        # Blank line that separates this part from the next one.
        lines.append("\n")

    return "".join(lines)
|
|||
|
|
|||
|
# Input file with the sections and the entities detected for them.
sections_110_addresss = "./data/sections_110_ner.json"
sections_list = read_from_json(sections_110_addresss)

# One labelled token table per section, built from its "content" text and
# its "ners_v1" entity list.
all_sections = [
    create_dataset_part(content=section["content"], ners=section["ners_v1"])
    for section in sections_list
]

# strip() removes leading/trailing whitespace, including the blank line that
# follows the last section.
final_dataset = create_dataset(all_sections).strip()

path = "./data/ner_dataset_110.txt"
save_to_file_by_address(path, final_dataset)

print(' operation finished! ')
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|