"""Build an NER training dataset from law text.

Input: each law section's text together with the named entities a model
previously detected for it (read from a JSON file).  Output: one text
file in CoNLL-style token-per-line format ("<token> <BIO-tag>"), with a
blank line separating sections.
"""
import numpy as np

from funcs import save_to_file_by_address, read_from_json

# Normalize the raw entity-type labels to the tag names used in the dataset.
# DATE/DATE2/DATE3 all collapse into the single "Date" type; labels not
# listed here pass through unchanged (same behavior as the original chain).
_KEY_MAP = {
    "H_REF": "Href",
    "REF": "Ref",
    "ORG": "Org",
    "DATE": "Date",
    "DATE2": "Date",
    "DATE3": "Date",
}


def create_dataset_part(content, ners):
    """Tokenize one section and tag its tokens in BIO format.

    Args:
        content: the section's raw text; tokens are whitespace-separated.
        ners: iterable of dicts with "begin"/"end" token indices and a
            "key" entity-type label.  The loop below treats "begin" as
            inclusive and "end" as exclusive (token at ``begin`` gets
            ``B-<key>``, tokens ``begin+1 .. end-1`` get ``I-<key>``) --
            assumed from the original code; confirm against the
            annotation producer.

    Returns:
        A 2-D numpy object array of ``[token, tag]`` rows; tokens not
        covered by any entity keep the "O" tag.
    """
    token_list = [[token, "O"] for token in content.split()]
    # dtype=object is essential here: the default fixed-width unicode
    # dtype is sized to the longest *initial* string, so assigning a
    # longer tag such as "B-Href" below would be silently truncated.
    np_tokens = np.array(token_list, dtype=object)
    for ner in ners:
        begin = ner["begin"]
        end = ner["end"]
        key = _KEY_MAP.get(ner["key"], ner["key"])
        np_tokens[begin][1] = f"B-{key}"
        for index in range(begin + 1, end):
            np_tokens[index][1] = f"I-{key}"
    return np_tokens


def create_dataset(all_dataset_parts):
    """Render the tagged sections as one CoNLL-style text blob.

    Each ``[token, tag]`` row becomes a "<token> <tag>" line; a blank
    line separates consecutive sections.
    """
    # Collect lines and join once -- repeated "+=" on a growing string
    # is quadratic in the total output size.
    lines = []
    for sentence_tokens in all_dataset_parts:
        for token, tag in sentence_tokens:
            lines.append(f"{token} {tag}\n")
        lines.append("\n")
    return "".join(lines)


def main():
    """Read the annotated sections, convert them, and write the dataset."""
    sections_110_address = "./data/sections_110_ner.json"
    sections_list = read_from_json(sections_110_address)

    all_sections = []
    for section in sections_list:
        np_tokens = create_dataset_part(
            content=section["content"], ners=section["ners_v1"]
        )
        all_sections.append(np_tokens)

    final_dataset = create_dataset(all_sections).strip()
    path = "./data/ner_dataset_110.txt"
    save_to_file_by_address(path, final_dataset)
    print(' operation finished! ')


if __name__ == "__main__":
    main()