From 18db5b6fbc577b47408434c2c946ff7d1ba63534 Mon Sep 17 00:00:00 2001
From: ajokar
Date: Tue, 17 Sep 2024 16:45:26 +0000
Subject: [PATCH] Upload files to "import_data"

---
 import_data/hamta_dataset.py        |  175 ++++
 import_data/label_tokens.py         |   67 ++
 import_data/normalizer.py           | 1367 +++++++++++++++++++++++++++
 import_data/peyma_dataset.py        |  177 ++++
 import_data/qavanin_dataset_test.py |  172 ++++
 5 files changed, 1958 insertions(+)
 create mode 100644 import_data/hamta_dataset.py
 create mode 100644 import_data/label_tokens.py
 create mode 100644 import_data/normalizer.py
 create mode 100644 import_data/peyma_dataset.py
 create mode 100644 import_data/qavanin_dataset_test.py

diff --git a/import_data/hamta_dataset.py b/import_data/hamta_dataset.py
new file mode 100644
index 0000000..74dfb44
--- /dev/null
+++ b/import_data/hamta_dataset.py
@@ -0,0 +1,175 @@
+import json
+import requests
+from decimal import Decimal
+import os
+
+TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg0NTE3MTUsImp0aSI6InNWaDljNkdlRWdEK0IzM1N2UHNzbXkybkxUK0tWWnBjIiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcxOTc1MTcxNCwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.frkLaR1HYyfZ8sFhLtiuEZaBPwDPWKSxKDQpql6aOZc'
+ACCEPT = "application/json"
+HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
+
+# NOTE: the target endpoint is left empty here; set it before running this script.
+url = ""
+headers = HEADERS
+
+address = os.getcwd()
+if 'import_data' in address:
+    address += '/data/arman_train.txt'
+else:
+    address += '/import_data/data/arman_train.txt'
+
+# open the input text file (Arman training data in BIO format)
+with open(address, 'r', encoding='utf-8') as file:
+    input_text = file.read()
+
+# split the text into a list of lines
+lines = input_text.strip().split('\n')
+
+key = ''
+begin = -1
+end = -1
+tokenNumber = -1
+content = ''
+result_token = []
+
+class JSONEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, Decimal):
+            return float(obj)
+        return json.JSONEncoder.default(self, obj)
+
+
+# def createIndex(content, result_token):
+#     output = {
+#         "content": content,
+#         "domain": "پیکره آرمان",
+#         "ref_id": "",
+#         "ref_url": "",
+#         "result_token": result_token,
+#     }
+#     # print(output)
+#     # print(json.dumps(output, indent=4, ensure_ascii=False))
+#     return output
+
+def createIndex(content, result_token):
+    result_objects = [{
+        "task": "ner",
+        "key": "qavanin_keyword",
+        "label": "خروجی تیم همتا",
+        "values": result_token
+    }]
+    output = {
+        "content": content,
+        "domain": "پیکره قوانین(کلیدواژه)",
+        "ref_id": "",
+        "ref_url": "",
+        "result_objects": result_objects,
+    }
+    # print(output)
+    # print(json.dumps(output, indent=4, ensure_ascii=False))
+    return output
+
+
+def appendResultToken(text, key, begin, tokenNumber, result_token):
+    end = -1
+    # if key == 'HALFREFERENCE':
+    #     key = 'H_REF'
+    # elif key == 'REFERENCE':
+    #     key = 'REF'
+    if key:
+        # map the corpus tag names onto the label set used by the dataset service
+        if key == 'org':
+            key = 'ORG'
+        elif key == 'loc':
+            key = 'LOC'
+        elif key == 'Facility':
+            key = 'fac'
+        elif key == 'event':
+            key = 'EVENT'
+        elif key == 'pro':
+            key = 'PRO'
+        elif key == 'pers':
+            key = 'PER'
+
+        end = tokenNumber - 1
+        result_token.append({
+            "begin": begin,
+            "end": end,
+            "result_key": key,
+            "text": text
+        })
+    begin = -1
+    end = -1
+    key = ''
+    return key, begin, end, result_token
+
+bulk_data = []
+bulk_count = 1
+count = 0
+text = ''
+
+for i, line in enumerate(lines):
+    print('line: ' + str(i))
+    count += 1
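+    # tokenNumber is the index of the current token within the current sentence
+    # block; it is reset to -1 whenever a blank line closes the block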
tokenNumber = tokenNumber + 1 + + if line.strip() == '' : + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + + data = createIndex(content, result_token) + tokenNumber = -1 + content = '' + result_token = [] + + bulk_data.append(data) + bulk_count +=1 + if bulk_data.__len__() > 100: + print('=' * 30 ) + print('count ' + str(count)) + payload = json.dumps(bulk_data, cls=JSONEncoder) #Works! + response = requests.request("POST", url, headers=headers, data=payload) + print(response) + bulk_data = [] + bulk_count = 1 + + continue + + + + parts = line.split() + if len(parts) != 2: + continue + + + content += ' ' + parts[0] + + result_key = parts[1] + if result_key.startswith('I-'): + text += ' ' + parts[0] + continue + + if result_key == 'O' : + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + + if result_key.startswith('B-'): + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + + text = parts[0] + begin = tokenNumber + end = -1 + key = result_key.replace('B-', '') + + +if content != '' : + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + data = createIndex(content, result_token) + bulk_data.append(data) + bulk_count +=1 + + +if bulk_data.__len__() > 0: + print(bulk_count) + payload = json.dumps(bulk_data, cls=JSONEncoder) #Works! + response = requests.request("POST", url, headers=headers, data=payload) + print(response.text) + +# نمایش دیکشنری خروجی به صورت JSON +print("***************** end ") \ No newline at end of file diff --git a/import_data/label_tokens.py b/import_data/label_tokens.py new file mode 100644 index 0000000..4839882 --- /dev/null +++ b/import_data/label_tokens.py @@ -0,0 +1,67 @@ +from docx import Document +import re + +def extract_key_phrases(file_path): + doc = Document(file_path) + key_phrases = [] + pattern = re.compile(r'!(.*?)!') + + for paragraph in doc.paragraphs: + matches = pattern.findall(paragraph.text) + key_phrases.extend(matches) + + return key_phrases + +def extract_text_from_docx(file_path): + doc = Document(file_path) + text = '' + + for paragraph in doc.paragraphs: + text += paragraph.text + ' ' + + return text.strip() + +def tokenize_and_tag(text, key_phrases): + + key_phrases_sorted = sorted(key_phrases, key=len, reverse=True) + for key_phrase in key_phrases_sorted: + + text = text.replace(f'!{key_phrase}!', f' {key_phrase} ') + + tokens = re.findall(r'\S+|[^\s\w]', text) + tagged_tokens = [] + i = 0 + + while i < len(tokens): + token = tokens[i] + matched = False + + + for key_phrase in key_phrases_sorted: + key_tokens = key_phrase.split() + if tokens[i:i + len(key_tokens)] == key_tokens: + tagged_tokens.append((key_tokens[0], 'B-key')) + for key_token in key_tokens[1:]: + tagged_tokens.append((key_token, 'I-key')) + i += len(key_tokens) + matched = True + break + + if not matched: + tagged_tokens.append((token, 'O')) + i += 1 + + return tagged_tokens + +def write_tokens_to_file(tagged_tokens, output_file): + with open(output_file, 'w', encoding='utf-8') as f: + for token, tag in tagged_tokens: + f.write(f"{token}\t{tag}\n") + +file_path = 'D:/project/output.docx' +output_file = 'output.txt' + +text = extract_text_from_docx(file_path) +key_phrases = extract_key_phrases(file_path) +tagged_tokens = tokenize_and_tag(text, key_phrases) +write_tokens_to_file(tagged_tokens, output_file) \ No newline at end of file diff --git a/import_data/normalizer.py 
b/import_data/normalizer.py new file mode 100644 index 0000000..863fe34 --- /dev/null +++ b/import_data/normalizer.py @@ -0,0 +1,1367 @@ +from re import sub +import copy +import os +from tokenizer import Tokenizer +from data_helper import DataHelper +import general_functions as gf +import traceback,sys + +class Normalizer(): + + def __init__(self, + half_space_char='\u200c', + date_normalizing_needed=False, + pinglish_conversion_needed=False, + train_file_path="resource/tokenizer/Bijan_khan_chunk.txt", + token_merger_path="resource/tokenizer/TokenMerger.pckl"): + self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/" + + self.dic1_path = self.dir_path + 'resource/normalizer/Dic1_new.txt' + self.dic2_path = self.dir_path + 'resource/normalizer/Dic2_new.txt' + self.dic3_path = self.dir_path + 'resource/normalizer/Dic3_new.txt' + self.dic1 = self.load_dictionary(self.dic1_path) + self.dic2 = self.load_dictionary(self.dic2_path) + self.dic3 = self.load_dictionary(self.dic3_path) + + self.date_normalizing_needed = date_normalizing_needed + self.pinglish_conversion_needed = pinglish_conversion_needed + self.data_helper = DataHelper() + + + if self.date_normalizing_needed or self.pinglish_conversion_needed: + self.tokenizer = Tokenizer() + self.date_normalizer = DateNormalizer() + self.pinglish_conversion = PinglishNormalizer() + + + + def load_dictionary(self, file_path): + dict = {} + with open(file_path, 'r', encoding='utf-8') as f: + g = f.readlines() + for Wrds in g: + wrd = Wrds.split(' ') + dict[wrd[0].strip()] = sub('\n', '', wrd[1].strip()) + return dict + + def sub_alphabets(self, doc_string): + # try: + # doc_string = doc_string.decode('utf-8') + # except UnicodeEncodeError: + # pass + a0 = "ء" + b0 = "ئ" + c0 = sub(a0, b0, doc_string) + a1 = r"ٲ|ٱ|إ|ﺍ|أ" + a11 = r"ﺁ|آ" + b1 = r"ا" + b11 = r"آ" + c11 = sub(a11, b11, c0) + c1 = sub(a1, b1, c11) + a2 = r"ﺐ|ﺏ|ﺑ" + b2 = r"ب" + c2 = sub(a2, b2, c1) + a3 = r"ﭖ|ﭗ|ﭙ|ﺒ|ﭘ" + b3 = r"پ" + c3 = sub(a3, b3, c2) + a4 = r"ﭡ|ٺ|ٹ|ﭞ|ٿ|ټ|ﺕ|ﺗ|ﺖ|ﺘ" + b4 = r"ت" + c4 = sub(a4, b4, c3) + a5 = r"ﺙ|ﺛ" + b5 = r"ث" + c5 = sub(a5, b5, c4) + a6 = r"ﺝ|ڃ|ﺠ|ﺟ" + b6 = r"ج" + c6 = sub(a6, b6, c5) + a7 = r"ڃ|ﭽ|ﭼ" + b7 = r"چ" + c7 = sub(a7, b7, c6) + a8 = r"ﺢ|ﺤ|څ|ځ|ﺣ" + b8 = r"ح" + c8 = sub(a8, b8, c7) + a9 = r"ﺥ|ﺦ|ﺨ|ﺧ" + b9 = r"خ" + c9 = sub(a9, b9, c8) + a10 = r"ڏ|ډ|ﺪ|ﺩ" + b10 = r"د" + c10 = sub(a10, b10, c9) + a11 = r"ﺫ|ﺬ|ﻧ" + b11 = r"ذ" + c11 = sub(a11, b11, c10) + a12 = r"ڙ|ڗ|ڒ|ڑ|ڕ|ﺭ|ﺮ" + b12 = r"ر" + c12 = sub(a12, b12, c11) + a13 = r"ﺰ|ﺯ" + b13 = r"ز" + c13 = sub(a13, b13, c12) + a14 = r"ﮊ" + b14 = r"ژ" + c14 = sub(a14, b14, c13) + a15 = r"ݭ|ݜ|ﺱ|ﺲ|ښ|ﺴ|ﺳ" + b15 = r"س" + c15 = sub(a15, b15, c14) + a16 = r"ﺵ|ﺶ|ﺸ|ﺷ" + b16 = r"ش" + c16 = sub(a16, b16, c15) + a17 = r"ﺺ|ﺼ|ﺻ" + b17 = r"ص" + c17 = sub(a17, b17, c16) + a18 = r"ﺽ|ﺾ|ﺿ|ﻀ" + b18 = r"ض" + c18 = sub(a18, b18, c17) + a19 = r"ﻁ|ﻂ|ﻃ|ﻄ" + b19 = r"ط" + c19 = sub(a19, b19, c18) + a20 = r"ﻆ|ﻇ|ﻈ" + b20 = r"ظ" + c20 = sub(a20, b20, c19) + a21 = r"ڠ|ﻉ|ﻊ|ﻋ" + b21 = r"ع" + c21 = sub(a21, b21, c20) + a22 = r"ﻎ|ۼ|ﻍ|ﻐ|ﻏ" + b22 = r"غ" + c22 = sub(a22, b22, c21) + a23 = r"ﻒ|ﻑ|ﻔ|ﻓ" + b23 = r"ف" + c23 = sub(a23, b23, c22) + a24 = r"ﻕ|ڤ|ﻖ|ﻗ" + b24 = r"ق" + c24 = sub(a24, b24, c23) + a25 = r"ڭ|ﻚ|ﮎ|ﻜ|ﮏ|ګ|ﻛ|ﮑ|ﮐ|ڪ|ك" + b25 = r"ک" + c25 = sub(a25, b25, c24) + a26 = r"ﮚ|ﮒ|ﮓ|ﮕ|ﮔ" + b26 = r"گ" + c26 = sub(a26, b26, c25) + a27 = r"ﻝ|ﻞ|ﻠ|ڵ" + b27 = r"ل" + c27 = sub(a27, b27, c26) + a28 = r"ﻡ|ﻤ|ﻢ|ﻣ" + b28 = r"م" + c28 = sub(a28, b28, c27) + a29 = r"ڼ|ﻦ|ﻥ|ﻨ" + b29 = r"ن" + c29 = sub(a29, b29, c28) + a30 = r"ވ|ﯙ|ۈ|ۋ|ﺆ|ۊ|ۇ|ۏ|ۅ|ۉ|ﻭ|ﻮ|ؤ" + 
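+        # fold every waw variant (Arabic, Urdu, Unicode presentation forms) onto the plain Persian letter و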
b30 = r"و" + c30 = sub(a30, b30, c29) + a31 = r"ﺔ|ﻬ|ھ|ﻩ|ﻫ|ﻪ|ۀ|ە|ة|ہ" + b31 = r"ه" + c31 = sub(a31, b31, c30) + a32 = r"ﭛ|ﻯ|ۍ|ﻰ|ﻱ|ﻲ|ں|ﻳ|ﻴ|ﯼ|ې|ﯽ|ﯾ|ﯿ|ێ|ے|ى|ي" + b32 = r"ی" + c32 = sub(a32, b32, c31) + a33 = r'¬' + b33 = r'‌' + c33 = sub(a33, b33, c32) + pa0 = r'•|·|●|·|・|∙|。|ⴰ' + pb0 = r'.' + pc0 = sub(pa0, pb0, c33) + pa1 = r',|٬|٫|‚|,' + pb1 = r'،' + pc1 = sub(pa1, pb1, pc0) + pa2 = r'ʕ' + pb2 = r'؟' + pc2 = sub(pa2, pb2, pc1) + na0 = r'۰|٠' + nb0 = r'0' + nc0 = sub(na0, nb0, pc2) + na1 = r'۱|١' + nb1 = r'1' + nc1 = sub(na1, nb1, nc0) + na2 = r'۲|٢' + nb2 = r'2' + nc2 = sub(na2, nb2, nc1) + na3 = r'۳|٣' + nb3 = r'3' + nc3 = sub(na3, nb3, nc2) + na4 = r'۴|٤' + nb4 = r'4' + nc4 = sub(na4, nb4, nc3) + na5 = r'۵' + nb5 = r'5' + nc5 = sub(na5, nb5, nc4) + na6 = r'۶|٦' + nb6 = r'6' + nc6 = sub(na6, nb6, nc5) + na7 = r'۷|٧' + nb7 = r'7' + nc7 = sub(na7, nb7, nc6) + na8 = r'۸|٨' + nb8 = r'8' + nc8 = sub(na8, nb8, nc7) + na9 = r'۹|٩' + nb9 = r'9' + nc9 = sub(na9, nb9, nc8) + np2 = r'²' + nm2 = r'2' + ng2 = sub(np2, nm2, nc9) + ea1 = r'|ِ|ُ|َ|ٍ|ٌ|ً|' + eb1 = r'' + ec1 = sub(ea1, eb1, ng2) + ea1 = r'ـ' + eb1 = r'_' + ec2 = sub(ea1, eb1, ec1) + Sa1 = r'( )+' + Sb1 = r' ' + Sc1 = sub(Sa1, Sb1, ec2) + Sa2 = r'(\n)+' + Sb2 = r'\n' + Sc2 = sub(Sa2, Sb2, Sc1) + return Sc2 + + def space_correction(self, doc_string): + a00 = r'^(بی|می|نمی)( )' + b00 = r'\1‌' + c00 = sub(a00, b00, doc_string) + a0 = r'( )(می|نمی|بی)( )' + b0 = r'\1\2‌' + c0 = sub(a0, b0, c00) + a1 = r'( )(هایی|ها|های|ایی|هایم|هایت|هایش|هایمان|هایتان|هایشان|ات|ان|ین' \ + r'|انی|بان|ام|ای|یم|ید|اید|اند|بودم|بودی|بود|بودیم|بودید|بودند|ست)( )' + b1 = r'‌\2\3' + c1 = sub(a1, b1, c0) + a2 = r'( )(شده|نشده)( )' + b2 = r'‌\2‌' + c2 = sub(a2, b2, c1) + a3 = r'( )(طلبان|طلب|گرایی|گرایان|شناس|شناسی|گذاری|گذار|گذاران|شناسان|گیری|پذیری|بندی|آوری|سازی|' \ + r'بندی|کننده|کنندگان|گیری|پرداز|پردازی|پردازان|آمیز|سنجی|ریزی|داری|دهنده|آمیز|پذیری' \ + r'|پذیر|پذیران|گر|ریز|ریزی|رسانی|یاب|یابی|گانه|گانه‌ای|انگاری|گا|بند|رسانی|دهندگان|دار)( )' + b3 = r'‌\2\3' + c3 = sub(a3, b3, c2) + return c3 + + def space_correction_plus1(self, doc_string): + out_sentences = '' + for wrd in doc_string.split(' '): + try: + out_sentences = out_sentences + ' ' + self.dic1[wrd] + except KeyError: + out_sentences = out_sentences + ' ' + wrd + return out_sentences + + def space_correction_plus2(self, doc_string): + out_sentences = '' + wrds = doc_string.split(' ') + L = wrds.__len__() + if L < 2: + return doc_string + cnt = 1 + for i in range(0, L - 1): + w = wrds[i] + wrds[i + 1] + try: + out_sentences = out_sentences + ' ' + self.dic2[w] + cnt = 0 + except KeyError: + if cnt == 1: + out_sentences = out_sentences + ' ' + wrds[i] + cnt = 1 + if cnt == 1: + out_sentences = out_sentences + ' ' + wrds[i + 1] + return out_sentences + + def space_correction_plus3(self, doc_string): + # Dict = {'گفتوگو': 'گفت‌وگو'} + out_sentences = '' + wrds = doc_string.split(' ') + L = wrds.__len__() + if L < 3: + return doc_string + cnt = 1 + cnt2 = 0 + for i in range(0, L - 2): + w = wrds[i] + wrds[i + 1] + wrds[i + 2] + try: + out_sentences = out_sentences + ' ' + self.dic3[w] + cnt = 0 + cnt2 = 2 + except KeyError: + if cnt == 1 and cnt2 == 0: + out_sentences = out_sentences + ' ' + wrds[i] + else: + cnt2 -= 1 + cnt = 1 + if cnt == 1 and cnt2 == 0: + out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] + elif cnt == 1 and cnt2 == 1: + out_sentences = out_sentences + ' ' + wrds[i + 2] + return out_sentences + + def normalize(self, doc_string, new_line_elimination=False, return_dates = 
False):
+        normalized_string = gf.normalYehKe(doc_string)
+        normalized_string = self.sub_alphabets(normalized_string)
+        #normalized_string = self.data_helper.clean_text(normalized_string, new_line_elimination).strip()
+
+        #normalized_string = self.space_correction(self.space_correction_plus1(self.space_correction_plus2(self.space_correction_plus3(normalized_string)))).strip()
+
+        # !!! merges two tokens into one, e.g. آئین نامه becomes آئین‌نامه
+        #normalized_string = (self.space_correction_plus1(self.space_correction_plus2(self.space_correction_plus3(normalized_string)))).strip()
+
+        #if self.pinglish_conversion_needed:
+            #normalized_string = self.pinglish_conversion.pingilish2persian(self.tokenizer.tokenize_words(normalized_string))
+
+        if self.date_normalizing_needed:
+            token_list = self.tokenizer.tokenize_words(normalized_string)
+            # normalize numbers written out in words, etc.
+            normalized_string, additional_token_index_array = self.date_normalizer.normalize_numbers2(token_list)
+            # normalize and recognize dates
+            normalized_string, dates, recognized_dates = self.date_normalizer.general_normalize_date(normalized_string, additional_token_index_array)
+            #normalized_string_list = normalized_string.strip().split()
+            #normalized_string, dates, recognized_dates = self.date_normalizer.normalize_dates(normalized_string_list, additional_token_index_array, token_list_len= len(normalized_string_list), previous_slice_index_array=[0,])
+            # handle runs of numbers that follow each other joined by the conjunction و
+            normalized_string, recognized_numbers = self.date_normalizer.handle_continuous_numbers(normalized_string)
+            # in cases like وسیصد, where the conjunction و is attached to the number سیصد,
+            # the two are first split apart to recover the real number; that split produces
+            # an extra token, and this method removes those extra tokens again
+            normalized_string = self.date_normalizer.remove_additional_tokens(normalized_string.split(), additional_token_index_array)
+            if return_dates:
+                # sort the recognized dates by the index of the first token of each date expression
+                recognized_dates.sort(key=lambda x: int(x['start_date_token_index']), reverse=False)
+
+                return normalized_string, dates, recognized_dates, recognized_numbers
+        return normalized_string
+
+
+class DateNormalizer():
+    def __init__(self):
+
+        self.month_dict = {"فروردین": 1, "فروردینماه": 1, 'فروردین\u200cماه': 1,
+                           "اردیبهشت": 2, "اردیبهشتماه": 2, 'اردیبهشت\u200cماه': 2,
+                           "خرداد": 3, "خردادماه": 3,
+                           "تیر": 4, "تیرماه": 4,
+                           "مرداد": 5, "مردادماه": 5,
+                           "شهریور": 6, "شهریورماه": 6,
+                           "مهر": 7, "مهرماه": 7,
+                           "آبان": 8, "آبانماه": 8, 'آبان\u200cماه': 8,
+                           "آذر": 9, "آذرماه": 9,
+                           "دی": 10, "دیماه": 10, 'دی\u200cماه': 10,
+                           "بهمن": 11, "بهمنماه": 11, 'بهمن\u200cماه': 11,
+                           "اسفند": 12, "اسفندماه": 12}
+        self.num_dict = {"صد": 100, "وصد": 100, "یکصد": 100, "ویکصد": 100, "هزار": 1000, "وهزار": 1000,
+                         "یکهزار": 1000, "ویکهزار": 1000,
+                         "میلیون": 1000000, "ملیون": 1000000, "ومیلیون": 1000000, "وملیون": 1000000,
+                         "یکمیلیون": 1000000, "یکملیون": 1000000,
+                         "ویکمیلیون": 1000000, "ویکملیون": 1000000, "دویست": 200, "ودویست": 200,
+                         "ده": 10, "وده": 10, "نه": 9, "ونه": 9, "هشت": 8, "وهشت": 8, "هفت": 7, "وهفت": 7,
+                         "شش": 6, "وشش": 6, "پنج": 5, "وپنج": 5, "چهار": 4, "وچهار": 4, "سه": 3, "وسه": 3,
+                         "دو": 2, "ودو": 2, "یک": 1, "ویک": 1, "یازده": 11, "ویازده": 11, "سیزده": 13, "وسیزده": 13,
+                         "چهارده": 14, "دوازده": 12, "پانزده": 15, "شانزده": 16, "هفده": 17, "هیفده": 17,
+                         "وچهارده": 14, "ودوازده": 12, "وپانزده": 15, "وشانزده": 16, "وهفده": 17, "وهیفده": 17,
+                         "هجده": 18, "هیجده": 18, "نوزده": 19, "بیست": 20, "سی": 30, "چهل": 40, "پنجاه":
50, + "وهجده": 18,"وهیجده": 18, "ونوزده": 19, "وبیست": 20, "وسی": 30, "وچهل": 40, "وپنجاه": 50, + "شصت": 60, "هفتاد": 70, "نود": 90, "سیصد": 300, "چهارصد": 400, + "وشصت": 60, "وهفتاد": 70, "ونود": 90, "وسیصد": 300, "وچهارصد": 400, + "پانصد": 500, "ششصد": 600, "هفتصد": 700, "هشتصد": 800, "نهصد": 900, + "وپانصد": 500, "وششصد": 600, "وهفتصد": 700, "وهشتصد": 800, "ونهصد": 900, + "هشتاد": 80, " ": 0, "میلیارد": 1000000000,"ملیارد": 1000000000, + "یکمیلیارد": 1000000000,"یکملیارد": 1000000000, + "وهشتاد": 80, " ": 0, "ومیلیارد": 1000000000,"وملیارد": 1000000000, + "ویکمیلیارد": 1000000000,"ویکملیارد": 1000000000, + "صدم": 100, "هزارم": 1000, "دویستم": 200, + "وصدم": 100, "وهزارم": 1000, "ودویستم": 200, + "دهم": 10, "نهم": 9, "هشتم": 8, "هفتم": 7, "ششم": 6, "پنجم": 5, + "ودهم": 10, "ونهم": 9, "وهشتم": 8, "وهفتم": 7, "وششم": 6, "وپنجم": 5, + "چهارم": 4, "سوم": 3, "دوم": 2, "یکم": 1, "اول": 1, "یازدهم": 11, "سیزدهم": 13, + "وچهارم": 4, "وسوم": 3, "ودوم": 2, "ویکم": 1, "واول": 1, "ویازدهم": 11, "وسیزدهم": 13, + "چهاردهم": 14, "دوازدهم": 12, "پانزدهم": 15, "شانزدهم": 16, "هفدهم": 17,"هیفدهم": 17, + "وچهاردهم": 14, "ودوازدهم": 12, "وپانزدهم": 15, "وشانزدهم": 16, "وهفدهم": 17,"وهیفدهم": 17, + "هجدهم": 18,"هیجدهم": 18, "نوزدهم": 19, "بیستم": 20, "چهلم": 40, "پنجاهم": 50, + "وهجدهم": 18,"وهیجدهم": 18, "ونوزدهم": 19, "وبیستم": 20, "وچهلم": 40, "وپنجاهم": 50, + "شصتم": 60, "هفتادم": 70, "نودم": 90, "سیصدم": 300, "چهارصدم": 400, + "وشصتم": 60, "وهفتادم": 70, "ونودم": 90, "وسیصدم": 300, "وچهارصدم": 400, + "پانصدم": 500, "ششصدم": 600, "هفتصدم": 700, "هشتصدم": 800, "نهصدم": 900, + "وپانصدم": 500, "وششصدم": 600, "وهفتصدم": 700, "وهشتصدم": 800, "ونهصدم": 900, + "هشتادم": 80,"وهشتادم": 80} + + def find_date_part(self, token_list): + import re + for index, element in enumerate(token_list): + # بررسی الگوی تاریخ 1398/12/18 + pattern_date = r'^(\d{4}\s*/\s*([1-9]|0[1-9]|1[0-2])\s*/\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$' + date_pattern = re.compile(pattern_date) + match_date = date_pattern.match(element) + if match_date: + date_parts = match_date.string.split('/') + year = int(re.search(r'\d+', date_parts[0]).group()) + month = int(re.search(r'\d+', date_parts[1]).group()) + day = int(re.search(r'\d+', date_parts[2]).group()) + # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد + if day > 999: + day = int(re.search(r'\d+', date_parts[0]).group()) + year = int(re.search(r'\d+', date_parts[2]).group()) + formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) + return formal_date, index, index, index + + # 24 /12 /1401 بررسی الگوی تاریخ + pattern_date = r'\s*\d{2}\s*/\s*\d{2}\s*/\s*\d{4}\s*' + date_pattern = re.compile(pattern_date) + match_date = date_pattern.match(element) + if match_date: + date_parts = match_date.string.split('/') + year = int(re.search(r'\d+', date_parts[0]).group()) + month = int(re.search(r'\d+', date_parts[1]).group()) + day = int(re.search(r'\d+', date_parts[2]).group()) + # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد + if day > 999: + day = int(re.search(r'\d+', date_parts[0]).group()) + year = int(re.search(r'\d+', date_parts[2]).group()) + formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) + return formal_date, index, index, index + + # بررسی الگوی تاریخ 1402،10،06 + patterndate2 = r'^(\d{4}\s*،\s*([1-9]|0[1-9]|1[0-2])\s*،\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$' + date_pattern = re.compile(patterndate2) + match_date2 = date_pattern.match(element) + if match_date2: + date_parts = match_date2.string.split('،') 
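+                # the matched date uses the Arabic comma as separator (e.g. 1402،10،06): year، month، day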
+ year = int(re.search(r'\d+', date_parts[0]).group()) + month = int(re.search(r'\d+', date_parts[1]).group()) + day = int(re.search(r'\d+', date_parts[2]).group()) + # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد + if day > 999: + day = int(re.search(r'\d+', date_parts[0]).group()) + year = int(re.search(r'\d+', date_parts[2]).group()) + formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) + return formal_date, index, index, index + + # بررسی الگوی تاریخ 13ر04ر1345 + patterndate2 = r'^(\d{4}\s*ر\s*([1-9]|0[1-9]|1[0-2])\s*ر\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$' + date_pattern = re.compile(patterndate2) + match_date2 = date_pattern.match(element) + if match_date2: + date_parts = match_date2.string.split('ر') + year = int(re.search(r'\d+', date_parts[0]).group()) + month = int(re.search(r'\d+', date_parts[1]).group()) + day = int(re.search(r'\d+', date_parts[2]).group()) + # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد + if day > 999: + day = int(re.search(r'\d+', date_parts[0]).group()) + year = int(re.search(r'\d+', date_parts[2]).group()) + formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) + return formal_date, index, index, index + + if element == "/": + if index-1 >= 0 and index+1 < len(token_list) \ + and token_list[index -1].isdigit() and token_list[index+1].isdigit(): + if index+3 < len(token_list) and token_list[index+2] == "/" \ + and token_list[index + 3].isdigit(): + if int(token_list[index-1]) < 1450 and int(token_list[index+1]) < 13 and \ + int(token_list[index+3]) < 32: + formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])] + formal_date = "y" + str(formal_date[0]) + "m" + str(formal_date[1]) + "d" + str(formal_date[2]) + #return formal_date, index-1, index+3, index + return formal_date, index, index, index + elif int(token_list[index-1]) < 32 and int(token_list[index+1]) < 13 and \ + int(token_list[index+3]) < 1450: + formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) + #return formal_date, index-1, index+3, index + return formal_date, index, index, index + else: + formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])] + formal_date = "num(" + str(formal_date[0]) + "/" + str(formal_date[1]) + "/" + str(formal_date[2]) + ")" + #return formal_date, index-1, index+3, index + return formal_date, index, index, index + elif (int(token_list[index-1]) < 1450 and int(token_list[index-1]) > 1250) and int(token_list[index+1]) < 13: + formal_date = [int(token_list[index-1]), int(token_list[index+ 1]), 0] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) + #return formal_date, index-1 , index+1, index + return formal_date, index , index, index + else: + formal_date = [int(token_list[index-1]), int(token_list[index+ 1])] + formal_date = "num(" + str(formal_date[0]) + "/" + str(formal_date[1]) + ")" + #return formal_date, index-1 , index+1, index + return formal_date, index , index, index + + if element in self.month_dict or element == "سال": + if index + 1 < len(token_list) and index - 1 > -2: + try: + start_date_index = end_date_index = 0 + if(token_list[index + 1] == 'ماه'): + if(token_list[index + 2] == 'سال' and str(token_list[index + 3]).isdigit()): + + if(int(token_list[index + 3])==1000): + # در این حالت، امکان دارد روز مربوط به 
این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد + # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم + start_date_index = index - 1 + day = int(token_list[index - 1]) + if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): + day += int(token_list[index - 3])# رقم دهگان روز + start_date_index = index - 3 + year = 1000 + if(len(token_list)>= index + 5): + if(token_list[index + 4]=='و' and token_list[index + 5].isdigit()): + year += int(token_list[index + 5])# رقم صدگان سال + end_date_index = index + 5 + if(len(token_list)>= index + 7): + if(token_list[index + 6]=='و' and token_list[index + 7].isdigit()): + year += int(token_list[index + 7])# رقم دهگان سال + end_date_index = index + 7 + if(len(token_list)>= index + 9): + if(token_list[index + 8]=='و' and token_list[index + 9].isdigit()): + year += int(token_list[index + 9])# رقم یکان سال + end_date_index = index + 9 + + formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 8 تیر ماه سال 1000 و 300 و 20 و 5 + + else: + start_date_index = index - 1 + end_date_index = index + 3 + day = int(token_list[index - 1]) + if(token_list[index - 2]=='و' and int(token_list[index - 1])<10 and int(token_list[index - 3])<31 and token_list[index - 3].isdigit()): + start_date_index = index - 3 + day += int(token_list[index - 3])# رقم دهگان روز + + formal_date = [day, int(self.month_dict[token_list[index]]), int(token_list[index + 3])]# مثلا 20 و 7 تیر ماه سال 1368 + + + formal_date = [day, int(self.month_dict[token_list[index]]), int(token_list[index + 3])]# مثلا 27 تیر ماه سال 1368 + + elif(str(token_list[index + 2]).isdigit()): + if(int(token_list[index + 2])==1000): + + if token_list[index-1].strip() == 'ام': # مثلا 30 ام تیر ماه 1000 و 300 و 80 و 2 + start_date_index = index - 2 + day = int(token_list[index - 2]) + else: + # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد + # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم + start_date_index = index - 1 + day = int(token_list[index - 1]) + + if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): + day += int(token_list[index - 3])# رقم دهگان روز + start_date_index = index - 3 + year = 1000 + if(len(token_list)>= index + 4): + if(token_list[index + 3]=='و' and token_list[index + 4].isdigit()): + year += int(token_list[index + 4])# رقم صدگان سال + end_date_index = index + 4 + if(len(token_list)>= index + 6): + if(token_list[index + 5]=='و' and token_list[index + 6].isdigit()): + year += int(token_list[index + 6])# رقم دهگان سال + end_date_index = index + 6 + if(len(token_list)>= index + 8): + if(token_list[index + 7]=='و' and token_list[index + 8].isdigit()): + year += int(token_list[index + 8])# رقم یکان سال + end_date_index = index + 8 + formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 7 تیر ماه 1000 و 300 و 20 و 5 + + else: + formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])]# مثلا 27 تیر ماه سال 1368 + start_date_index = index - 1 + end_date_index = index + 2 + #formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])] # مثلا 27 تیر ماه 1368 + + elif(token_list[index + 1] == 'سال' and (not(token_list[index + 2]).isdigit())): + + formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])] # مثلا 27 تیر سال 1368 + start_date_index = index - 1 + end_date_index = index + 2 + else: + if(token_list[index + 
1] == 'سال' and str(token_list[index + 2]).isdigit()): + + if(int(token_list[index + 2])==1000): + + # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد + # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم + start_date_index = index - 1 + day = int(token_list[index - 1]) + if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): + day += int(token_list[index - 3])# رقم دهگان روز + start_date_index = index - 3 + year = 1000 + if(len(token_list)>= index + 4): + if(token_list[index + 3]=='و' and token_list[index + 4].isdigit()): + year += int(token_list[index + 4])# رقم صدگان سال + end_date_index = index + 4 + if(len(token_list)>= index + 6): + if(token_list[index + 5]=='و' and token_list[index + 6].isdigit()): + year += int(token_list[index + 6])# # رقم دهگان سال !!! برای تاریخ بین 1399 تا 1410 هم همین روال درست بر می گرداند، هرچند منطق غلطی دارد چون مثلا سال 1402 رقم صدگان ندارد + end_date_index = index + 6 + if(len(token_list)>= index + 8): + if((len(token_list)>= index + 7 and token_list[index + 7]=='و') and (len(token_list)>= index + 8 and token_list[index + 8].isdigit())): + year += int(token_list[index + 8])# رقم یکان سال + end_date_index = index + 8 + + formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 8 تیر ماه سال 1000 و 300 و 20 و 5 + + else: + formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])]# مثلا 27 تیر سال 1368 + start_date_index = index - 1 + end_date_index = index + 2 + + elif(str(token_list[index + 1]).isdigit()): + if(int(token_list[index + 1])==1000): + # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد + # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم + start_date_index = index - 1 + day = int(token_list[index - 1]) + if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): + day += int(token_list[index - 3])# رقم دهگان روز + start_date_index = index - 3 + year = 1000 + if(len(token_list)>= index + 3): + if(token_list[index + 2]=='و' and token_list[index + 3].isdigit()): + year += int(token_list[index + 3])# رقم صدگان سال + end_date_index = index + 3 + if(len(token_list)>= index + 5): + if(token_list[index + 4]=='و' and token_list[index + 5].isdigit()): + year += int(token_list[index + 5])# رقم دهگان سال !!! 
برای تاریخ بین 1399 تا 1410 هم همین روال درست بر می گرداند، هرچند منطق غلطی دارد چون مثلا سال 1402 رقم صدگان ندارد + end_date_index = index + 5 + if(len(token_list)>= index + 7): + if(token_list[index + 6]=='و' and token_list[index + 7].isdigit()): + year += int(token_list[index + 7])# رقم یکان سال + end_date_index = index + 7 + formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 7 تیر ماه 1000 و 300 و 20 و 5 + else: + formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 1])]# مثلا 27 تیر 1320‌ + start_date_index = index - 1 + end_date_index = index + 1 + + else: + formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 1])]# مثلا 27 تیر 1365 + start_date_index = index - 1 + end_date_index = index + 1 + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) + if token_list[index - 1] and token_list[index + 1]: + return formal_date, start_date_index, end_date_index, start_date_index + + except Exception as e: + error = e.args[0] + try: + formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), 0] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0])#مثلا y1358m12d0 + return formal_date, index-1, index, index-1 + except: + try: + # مثلا سال 1400 + if token_list[index] == "سال": + formal_date = [int(token_list[index + 1]),0, 0] + formal_date = "y" + str(formal_date[0]) + "m" + str(formal_date[1]) + "d" + str(formal_date[2])# y1358m0d0 + return formal_date, index, index+1, index + elif token_list[index+1] == "ماه" and index + 1 < len(token_list): + if index + 2 < len(token_list) and token_list[index + 2].isdigit() and \ + int(token_list[index-2]) < 1450: + formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+2])] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) # y0m12d5 + return formal_date, index, index+2, index + else: + formal_date = [0,int(self.month_dict[token_list[index]]),0] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) #y0m12d0 + return formal_date, index, index+1, index + elif index + 1 < len(token_list) and token_list[index + 1].isdigit() and int(token_list[index-2]) < 1450: + formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+1])] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) # y0m12d5 + return formal_date, index, index+1, index + elif index-2 >= 0 and index+2 < len(token_list) and \ + token_list[index - 1] == "/" and token_list[index+1] == "/" and \ + token_list[index - 2].isdigit() and int(token_list[index-2]) < 32 and \ + token_list[index + 2].isdigit() and int(token_list[index+2]) < 1450: + formal_date = [int(token_list[index-2]),int(self.month_dict[token_list[index]]),int(token_list[index+2])] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) + return formal_date, index-2, index+2, index-2 + elif index + 2 < len(token_list) and token_list[index + 1] == 'سال' and \ + token_list[index + 2].isdigit() and int(token_list[index-2]) < 1450 : + formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+2])] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) + return formal_date, index, index+1, index + #else: + # print("Name : %s -> 
after1: %s -> after2: %s -> after3: %s" % (token_list[index], token_list[index+1], token_list[index+2], token_list[index+3])) + except: + pass + + def general_normalize_date(self,normalized_string, additional_token_index_array): + # 24 /12 /1401 جهت بررسی الگوی تاریخ + normalized_string, recognized_dates = DateNormalizer.separated_date_format_finder(normalized_string) + normalized_string_list = normalized_string.strip().split() + # جهت یافتن فرمت های مختلف تاریخ غیر از فرمت بررسی شده در بالا + normalized_string, dates, recognized_dates_2 = \ + self.normalize_dates(normalized_string_list, additional_token_index_array, + token_list_len = len(normalized_string_list), previous_slice_index_array=[0,]) + # تجمیع همه تاریخ ها + for item in recognized_dates_2: + recognized_dates.append(item) + return normalized_string, dates, recognized_dates + + # 24 /12 /1401 متد بررسی الگوی تاریخ + def separated_date_format_finder(normalized_string): + import re + date_pattern = r'\d{1,2} /\d{1,2} /\d{2,4}' + regex = re.compile(date_pattern) + match_dates = regex.finditer(normalized_string) + recognized_dates = [] + for date_item in match_dates: + position = date_item.span() + founded_item = date_item.group() + start_index = date_item.start() + 1 + end_index = date_item.end() - 1 + current_date = founded_item.replace(' ', '') + date_parts = current_date.split('/') + formal_date = "y" + str(date_parts[2]) + "m" + str(date_parts[1]) + "d" + str(date_parts[0]) + pattern_tokens_state = gf.token_state_finder(normalized_string, start_index, end_index) + recognized_dates.append( + { + "original_date" : founded_item, + "date" : formal_date, + "start_index" : start_index, + "end_index" : end_index, + "date_token_index" : pattern_tokens_state["start_token_state"], + "start_date_token_index": pattern_tokens_state["start_token_state"], + "end_date_token_index" : pattern_tokens_state["end_token_state"], + }) + normalized_string_list = normalized_string.strip().split() + for date_item in recognized_dates: + normalized_string_list[int(date_item['date_token_index'])] = date_item['date'] + normalized_string_list[int(date_item['start_date_token_index'])+1] = 'tttt' + normalized_string_list[int(date_item['end_date_token_index'])] = 'tttt' + normalized_string_temp = '' + for token in normalized_string_list: + normalized_string_temp = normalized_string_temp + ' ' + token + return normalized_string_temp.strip(), recognized_dates + + def normalize_dates(self, token_list, additional_token_index_array, token_list_len, previous_slice_index_array= []): + try: + finded = self.find_date_part(token_list) + except Exception as e: + finded = None + + filename = 'law_section_errors.txt' + exc_type, exc_value, exc_traceback = sys.exc_info() + frame = exc_traceback.tb_frame + function_array = traceback.extract_tb(exc_traceback) + current_function = function_array[len(function_array)-1] + error = f''' + filename : {current_function.filename} + function : {current_function.name} + err line no : {current_function.lineno} + err line : {current_function.line} + err message : {e} + ''' + gf.save_error(error, filename) + + recognized_dates = [] + if finded != None: + date_part = finded[0] + start_date_token_index = finded[1] + end_date_token_index = finded[2] + date_token_index = finded[3] + befor_date_part = " ".join(x for x in token_list[:start_date_token_index]) + after_date_part = [x for x in token_list[end_date_token_index + 1:]] + previous_slice_index = token_list_len - len(after_date_part) + 
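+            # remember the absolute offset of this tail slice so that token indices
+            # returned by the recursive call below can be mapped back into the full token list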
previous_slice_index_array.append(previous_slice_index) + after_normalized, after_dates, recognized_dates = self.normalize_dates(after_date_part, additional_token_index_array, token_list_len, previous_slice_index_array) + if after_dates == '': + after_dates = [] + if recognized_dates == '': + recognized_dates = [] + after_dates.insert(0, date_part) + previous_slice_index = previous_slice_index_array.pop(len(previous_slice_index_array)-2) + recognized_dates.append( + { + "date" : date_part, + "date_token_index" : date_token_index + previous_slice_index, + "start_date_token_index": start_date_token_index + previous_slice_index, + "end_date_token_index" : end_date_token_index + previous_slice_index + }) + + i = 0 + while((date_token_index - start_date_token_index) > i ): + befor_date_part = ''.join([befor_date_part," tttt"]) # به عنوان توکنی که از جنس تاریخ بوده t + i += 1 + i = 0 + while((end_date_token_index - date_token_index) > i ): + date_part = ''.join([date_part," tttt"]) # به عنوان توکنی که از جنس تاریخ بوده t + i += 1 + + return befor_date_part + " " + date_part + " " + after_normalized, after_dates, recognized_dates + else: + return " ".join(x for x in token_list), '','' + + def list2num(self, numerical_section_list): + value = 1 + l = len(numerical_section_list)>3 + if l : + value = 0 + for index, el in enumerate(numerical_section_list): + if self.is_number(el): + value = self.num_dict[el] + elif el == '000': + #value *= 1000 + pass + elif l == True: + value += float(el) + else: + value *= float(el) + return value + + def convert2num(self, numerical_section_list): + value = 0 + tmp_section_list = [] + for index, el in enumerate(numerical_section_list): + if self.is_number(el) or (el.replace('.', '', 1).isdigit()): + tmp_section_list.append(el) + elif el == "و": + value += self.list2num(tmp_section_list) + tmp_section_list[:] = [] + if len(tmp_section_list) > 0: + value += self.list2num(tmp_section_list) + tmp_section_list[:] = [] + try: + if (value-int(value) == 0): + return int(value) + else: + return value + except: + return 0 + + + def is_number(self, word): + return word in self.num_dict + + def find_number_location(self, token_list, addWithVaa = False): + start_index = 0 + number_section =[] + for i , el in enumerate(token_list): + if self.is_number(el) or (el.replace('.', '', 1).isdigit()): + start_index = i + number_section.append(start_index) + break + + i = start_index+1 + while(i < len(token_list)): + if token_list[i] == "و" and (i+1)= 0: + # تشخیص اینکه یک سلسله از اعداد که با واو از هم جدا شده اند تبصره های مربوط به یک ماده نباشند + is_tabsare_numbers = (token_list[token_index-2] + ' ' + token_list[token_index-1]=='تبصره های') + if token_index - 1 >= 0: + # تشخیص اینکه یک سلسله از اعداد که با واو از هم جدا شده اند مواد یک قانون نباشند + is_mavad_numbers = (token_list[token_index-1]=='مواد') + if token_index - 1 >= 0: + # تشخیص اینکه یک سلسله از اعداد که با واو از هم جدا شده اند ماده های یک قانون نباشند + is_madde_numbers = (token_list[token_index-1]=='ماده') + if is_tabsare_numbers or is_mavad_numbers or is_madde_numbers: + flag = False + start_token_index = end_token_index = token_index + number_parts_array.clear() + current_token_index = token_index + number_parts_array.append(token) + if current_token_index + 1 < len(token_list): + while ((flag) and (token_list[current_token_index+1] == 'و' or token_list[current_token_index+1].isdigit())): + number_parts_array.append(token_list[current_token_index+1]) + current_token_index += 1 + end_token_index = current_token_index 
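+                    # keep widening [start_token_index, end_token_index] while the tokens continue the «number و number» chain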
+ if not (current_token_index + 1 < len(token_list)): + break + + final_number = number_completter(number_parts_array) + token_list[token_index] = str(final_number) + recognized_numbers.append({ + 'number_value' : final_number, + 'number_token_list': number_parts_array, + 'start_token_index': start_token_index, + 'end_token_index' : end_token_index + }) + # جایگذاری مقدار رشته بی ارزش به جای توکن های مربوط به عدد جاری + # به منظور اینکه متن اصلی از نظر تعداد توکن تغییر نکند + for i in range(token_index + 1 , len(number_parts_array) + token_index): + token_list[i] = 'nnnn' + converted_string = converted_string + ' ' + token_list[token_index] + if token_index + 1 < len(token_list): + if flag == False and (token_list[token_index+1]=='و'): + flag = False + else: + flag = True + else: + converted_string = converted_string + ' ' + token + + return converted_string, recognized_numbers + + + + + def remove_additional_tokens(self,token_list,additional_token_index_array): + converted_string = '' + for index in additional_token_index_array: + del token_list[index] + + for token in token_list: + converted_string = converted_string + ' ' + token + return converted_string + +class PinglishNormalizer(): + def __init__(self): + self.data_helper = DataHelper() + self.file_dir = os.path.dirname(os.path.realpath(__file__)) + "/" + + self.en_dict_filename = self.file_dir + "resource/tokenizer/enDict" + self.en_dict = self.data_helper.load_var(self.en_dict_filename) + + self.fa_dict_filename = self.file_dir + "resource/tokenizer/faDict" + self.fa_dict = self.data_helper.load_var(self.fa_dict_filename) + + + def pingilish2persian(self, pinglish_words_list): + + for i, word in enumerate(pinglish_words_list): + if word in self.en_dict: + pinglish_words_list[i] = self.en_dict[word]#.decode("utf-8") + #inp = inp.replace(word, enDict[word], 1) + else: + ch = self.characterize(word) + pr = self.map_char(ch) + amir = self.make_word(pr) + for wd in amir: + am = self.escalation(wd) + asd = ''.join(am) + if asd in self.fa_dict: + pinglish_words_list[i] = asd#.decode("utf-8") + #inp = inp.replace(word, asd, 1) + inp = " ".join(x for x in pinglish_words_list) + return inp + + def characterize(self, word): + list_of_char = [] + i = 0 + while i < len(word): + char = word[i] + sw_out = self.switcher(char) + if (sw_out == None): + esp_out = None + if(i < len(word) - 1): + esp_out = self.esp_check(word[i], word[i + 1]) + if(esp_out == None): + list_of_char.append(word[i]) + else: + list_of_char.append(esp_out) + i += 1 + else: + list_of_char.append(sw_out) + i += 1 + return list_of_char + + def switcher(self, ch): + switcher = { + "c": None, + "k": None, + "z": None, + "s": None, + "g": None, + "a": None, + "u": None, + "e": None, + "o": None + } + return switcher.get(ch, ch) + + def esp_check(self, char1, char2): + st = char1 + char2 + if (st == "ch"): + return "ch" + elif (st == "kh"): + return "kh" + elif (st == "zh"): + return "zh" + elif (st == "sh"): + return "sh" + elif (st == "gh"): + return "gh" + elif (st == "aa"): + return "aa" + elif (st == "ee"): + return "ee" + elif (st == "oo"): + return "oo" + elif (st == "ou"): + return "ou" + else: + return None + + def map_char(self, word): + listm = [] + sw_out = self.map_switcher(word[0]) + i = 0 + if (sw_out == None): + listm.append(["ا"]) + i += 1 + if (word[0] == "oo"): + listm.append(["او"]) + i += 1 + while i < len(word): + listm.append(self.char_switcher(word[i])) + i += 1 + if word[len(word) - 1] == "e": + listm.append(["ه"]) + elif word[len(word) - 1] == "a": + 
listm.append(["ا"]) + elif word[len(word) - 1] == "o": + listm.append(["و"]) + elif word[len(word) - 1] == "u": + listm.append(["و"]) + + return listm + + def map_switcher(self, ch): + switcher = { + "a": None, + "e": None, + "o": None, + "u": None, + "ee": None, + + "ou": None + } + return switcher.get(ch, ch) + + def make_word(self, chp): + word_list = [[]] + for char in chp: + word_list_temp = [] + for tmp_word_list in word_list: + for chch in char: + tmp = copy.deepcopy(tmp_word_list) + tmp.append(chch) + word_list_temp.append(tmp) + word_list = word_list_temp + return word_list + + def escalation(self, word): + tmp = [] + i = 0 + t = len(word) + while i < t - 1: + tmp.append(word[i]) + if word[i] == word[i + 1]: + i += 1 + i += 1 + if i != t: + tmp.append(word[i]) + return tmp + + def char_switcher(self, ch): + switcher = { + 'a': ["", "ا"], + 'c': ["ث", "ص", "ص"], + 'h': ["ه", "ح"], + 'b': ["ب"], + 'p': ["پ"], + 't': ["ت", "ط"], + 's': ["س", "ص", "ث"], + 'j': ["ج"], + 'ch': ["چ"], + 'kh': ["خ"], + 'q': ["ق", "غ"], + 'd': ["د"], + 'z': ["ز", "ذ", "ض", "ظ"], + 'r': ["ر"], + 'zh': ["ژ"], + 'sh': ["ش"], + 'gh': [",ق", "غ"], + 'f': ["ف"], + 'k': ["ک"], + 'g': ["گ"], + 'l': ["ل"], + 'm': ["م"], + 'n': ["ن"], + 'v': ["و"], + 'aa': ["ا"], + 'ee': ["ی"], + 'oo': ["و"], + 'ou': ["و"], + 'i': ["ی"], + 'y': ["ی"], + ' ': [""], + 'w': ["و"], + 'e': ["", "ه"], + 'o': ["", "و"] + } + return switcher.get(ch, "") + +# این روال جهت جمع و ضرب کردن بخش های مختلف از صدگان و هزارگان و بالاتر از یک عدد که با حروف در متن وجود دارد کار می کند +def number_completter2(number_parts_array): + zarb_value = 0 + temp_number_array = [] + sum_number_parts = 0 + final_number = 0 + for index,item in enumerate(number_parts_array): + if item.isdigit(): + temp_number_array.append(int(item)) + if index + 1 >= len(number_parts_array): + for item3 in temp_number_array: + final_number += int(item3) + return final_number + current_value = number_parts_array[index+1] + if (current_value.isdigit() and (int(current_value) == 1000 or int(current_value) == 1000000 or int(current_value) == 1000000000)):# or int(current_value) == 1000000 + zarb_value = int(number_parts_array[index+1]) + for num_item in temp_number_array: + sum_number_parts += num_item + final_number = sum_number_parts * zarb_value + temp_array2 = [] + x = index + 2 + while x < len(number_parts_array): + # for x in range(index+2,len(number_parts_array)): + temp_array2.append(number_parts_array[x]) + if x + 1 < len(number_parts_array): + if number_parts_array[x+1].isdigit() and (int(number_parts_array[x+1]) == 1000 or int(number_parts_array[x+1]) == 1000000 or int(number_parts_array[x+1]) == 1000000000): + if int(number_parts_array[x+1]) > zarb_value: # 1000000>1000 + zarb_value = int(number_parts_array[x+1]) + # تابع بازگشتی برای محاسبه عدد نهایی + final_number += number_completter2(temp_array2) + final_number *= zarb_value + temp_array2.clear() + zarb_value = 0 + x += 2 # به این دلیل که مقدار ضرب را در آرایه تمپ ذخیره نکند + else: # 1000!>1000000 + zarb_value = int(number_parts_array[x+1]) + temp_num = 0 + if (zarb_value == 1000 or zarb_value == 1000000) and len(number_parts_array) > x + 1: + num_array = number_parts_array[x+2:len(number_parts_array)] + temp_num = number_completter(num_array) + if temp_num == None: + temp_num = 0 + + '''for i in range(x+2,len(number_parts_array)): + if(number_parts_array[i].isdigit()): + temp_num += int(number_parts_array[i]) + i+=1''' + temp_num2 = 1 + for num_item2 in temp_array2: + if(num_item2.isdigit()): + temp_num2 += 
int(num_item2) + + temp_num2 *= zarb_value + final_number += temp_num + temp_num2 + break + else: + x += 1 # + else: # اگر از رنج آرایه خارج شدیم + temp_num = 0 + # مقادیر موجود در آرایه تمپ را جمع کن + for num_item3 in temp_array2: + if(num_item3.isdigit()): + temp_num += int(num_item3) + # حاصل جمع اعداد موجود در آرایه ذخیره را در عدد نهایی که قبلا داشته ایم ضرب کن و از حلقه خارج شو + final_number *= temp_num + #final_number += temp_num + break + return final_number + +# این روال جهت جمع و ضرب کردن بخش های مختلف از صدگان و هزارگان و بالاتر از یک عدد که با حروف در متن وجود دارد کار می کند +def number_completter(number_parts_array): + zarb_value = 0 + previous_zarb_value = 0 + previous_number_parts_sum = 0 + temp_number_array = [] + number_parts_sum = 0 + final_number = 0 + current_number_part = 0 + for index,item in enumerate(number_parts_array): + if item.isdigit(): + if (not(int(item) == 1000 or int(item) == 1000000 or int(item) == 1000000000)): + temp_number_array.append(item) + continue + elif((int(item) == 1000 or int(item) == 1000000 or int(item) == 1000000000)): + zarb_value = int(item) + for num_item in temp_number_array: + number_parts_sum += int(num_item) + if number_parts_sum == 0 and previous_number_parts_sum == 0: + number_parts_sum = 1 + temp_number_array.clear() + + else:# for example 952 + zarb_value = 1 + for num_item in temp_number_array: + number_parts_sum += int(num_item) + current_number_part = number_parts_sum + previous_number_parts_sum + continue + if previous_zarb_value < zarb_value:# for example 1000 < 1000000000 + current_number_part = previous_number_parts_sum + number_parts_sum + current_number_part = zarb_value * current_number_part + else:# previous_zarb_value > zarb_value + if number_parts_sum == 0: + number_parts_sum = 1 + current_number_part = zarb_value * number_parts_sum + current_number_part += previous_number_parts_sum + + previous_number_parts_sum = current_number_part + current_number_part = 0 + previous_zarb_value = zarb_value + number_parts_sum = 0 + if len(temp_number_array) != 0: + remained_parts_sum = 0 + for num_item in temp_number_array: + remained_parts_sum += int(num_item) + final_number = previous_number_parts_sum + remained_parts_sum + return final_number + + final_number = previous_number_parts_sum + return final_number + + + + + + \ No newline at end of file diff --git a/import_data/peyma_dataset.py b/import_data/peyma_dataset.py new file mode 100644 index 0000000..f22534f --- /dev/null +++ b/import_data/peyma_dataset.py @@ -0,0 +1,177 @@ +import json +import requests +from decimal import Decimal +import os + +TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg0NTE3MTUsImp0aSI6InNWaDljNkdlRWdEK0IzM1N2UHNzbXkybkxUK0tWWnBjIiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcxOTc1MTcxNCwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.frkLaR1HYyfZ8sFhLtiuEZaBPwDPWKSxKDQpql6aOZc' +ACCEPT = "application/json" +HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT} + +url = "https://api.tavasi.ir/repo/dataset/multi/add/peyma/ner" +headers = HEADERS + +address = os.getcwd() +if 'impoert_data' in address: + address += '/data/peyma_train.txt' +else: + address += '/impoert_data/data/peyma_train.txt' +# باز کردن فایل متنی +with open(address, 'r', encoding='utf-8') as file: + input_text = file.read() + +# تبدیل متن به لیستی از 
خطوط +lines = input_text.strip().split('\n') + +key = '' +begin = -1 +end = -1 +tokenNumber = -1 +content = '' +result_token = [] + +class JSONEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, Decimal): + return float(obj) + return json.JSONEncoder.default(self, obj) + + + +# def createIndex(content, result_token): +# output = { +# "content": content, +# "domain": "پیکره پیما", +# "ref_id": "", +# "ref_url": "", +# "result_token": result_token, +# } +# # print(output) +# # print(json.dumps(output, indent=4, ensure_ascii=False)) +# return output + +def createIndex(content, result_token): + result_objects = [{ + "task":"ner", + "key":"peyma_ner", + "label":"خروجی تیم پیما", + "values":result_token + } ] + output ={ + "content": content, + "domain": "پیکره پیما", + "ref_id": "", + "ref_url": "", + "result_objects": result_objects, + } + # print(output) + # print(json.dumps(output, indent=4, ensure_ascii=False)) + return output + +def appendResultToken(text, key, begin, tokenNumber, result_token): + end = -1 + # if key == 'HALFREFERENCE' : + # key = 'H_REF' + # elif key == 'REFERENCE' : + # key = 'REF' + if key: + if key == 'ORG' : + key = 'ORG' + elif key == 'MON' : + key = 'MON' + elif key == 'LOC' : + key = 'LOC' + elif key == 'DAT' : + key = 'DAT' + elif key == 'TIM' : + key = 'TIM' + elif key == 'PER' : + key = 'PER' + elif key == 'PCT' : + key = 'PCT' + + end = tokenNumber -1 + result_token.append({ + "begin": begin, + "end": end, + "result_key": key, + "text" : text + }) + begin = -1 + end = -1 + key = '' + return key, begin, end, result_token + +bulk_data = [] +bulk_count = 1 +count = 0 +text = '' + +for i, line in enumerate(lines): + print('line: ' + str(i)) + count += 1 + tokenNumber += 1 + + if line.strip() == '' : + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + + data = createIndex(content, result_token) + tokenNumber = -1 + content = '' + result_token = [] + + bulk_data.append(data) + bulk_count +=1 + if bulk_data.__len__() > 100: + print('=' * 30 ) + print('count ' + str(count)) + payload = json.dumps(bulk_data, cls=JSONEncoder) #Works! + response = requests.request("POST", url, headers=headers, data=payload) + print(response) + bulk_data = [] + bulk_count = 1 + + continue + + parts = line.split('|') + if len(parts) != 2: + continue + + result_key = parts[1] + token_part = (parts[0].strip()).split() + if len(token_part) > 1: + new_token_part = token_part[0].strip() + '\u200c' + token_part[1].strip() + content += ' ' + parts[0] + else: + content += ' ' + parts[0] + + if result_key.startswith('I_'): + text += ' ' + parts[0] + continue + + if result_key == 'O' : + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + + if result_key.startswith('B_'): + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + + text = parts[0] + begin = tokenNumber + end = -1 + key = result_key.replace('B_', '') + + +if content != '' : + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + data = createIndex(content, result_token) + bulk_data.append(data) + bulk_count +=1 + + +if bulk_data.__len__() > 0: + print(bulk_count) + payload = json.dumps(bulk_data, cls=JSONEncoder) #Works! 
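+    # flush whatever is left in the final partial batch; full batches were already sent inside the loop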
+ response = requests.request("POST", url, headers=headers, data=payload) + print(response.text) + +# نمایش دیکشنری خروجی به صورت JSON +print("***************** end ") \ No newline at end of file diff --git a/import_data/qavanin_dataset_test.py b/import_data/qavanin_dataset_test.py new file mode 100644 index 0000000..4d46fad --- /dev/null +++ b/import_data/qavanin_dataset_test.py @@ -0,0 +1,172 @@ +from decimal import Decimal +import requests +import json +import os + +""" !!! place of token:: Application > Local Storage > id_token !!! """ + +# توکن جدید را از آدرس زیر دریافت و جایگزین می کنیم: +# api.tavasi.ir token and url +TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MjQ5Mzk3NTksImp0aSI6IktwbjlBdEV1ZGh2WngxZUFXOFRRRUFrcmpcL01xa2JmWiIsImlzcyI6bnVsbCwiZXhwIjoxNzI2MjM5NzU4LCJhdWQiOm51bGwsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.r_A8IMQdN50ZM8oPmIq31Pz-phPDJyfQYLmbsLjv4Ao' +url = "https://api.tavasi.ir/repo/dataset/multi/add/qasection_test/ner" + +# Mortaza Server token and url +# توکن جدید را از آدرس زیر دریافت و جایگزین می کنیم: +# 192.168.23.160 token +#TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MjUxMDk1NDAsImp0aSI6IjdWWUhCTFhiMVJFb3VpY2dORzE1dHRkcnBObU1cL0JobCIsImlzcyI6bnVsbCwiZXhwIjoxNzI2NDA5NTM5LCJhdWQiOm51bGwsImRhdGEiOnsiaWQiOjEsImZpcnN0X25hbWUiOiJcdTA2MjhcdTA2MzFcdTA2NDZcdTA2MjdcdTA2NDVcdTA2NDcgXHUwNjQ2XHUwNjQ4XHUwNmNjXHUwNjMzIiwibGFzdF9uYW1lIjoiXHUwNjQxXHUwNjQ2XHUwNmNjIiwiZW1haWwiOiJkZXZAZ21haWwuY29tIiwidXNlcm5hbWUiOiJkZXYiLCJ1c2VyX2xldmVsIjoyfX0.WayvUXRnBukSyc6o_Qv4dClTv9Hn75uOTcCfzGQ9El8' +#url = "http://192.168.23.160:8080/repo/dataset/multi/add/qasection_test/ner" + +ACCEPT = "application/json" +HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT} +headers = HEADERS + +address = './impoert_data/data/sample_dataset.txt' +# address = os.getcwd() +# if 'impoert_data' in address: +# address += '/data/sample_dataset.txt' ### +# else: +# address += '/impoert_data/data/sample_dataset.txt' +# باز کردن فایل متنی +with open(address, 'r', encoding='utf-8') as file: + input_text = file.read() + +# تبدیل متن به لیستی از خطوط +lines = input_text.strip().split('\n') + +key = '' +begin = -1 +end = -1 +tokenNumber = -1 +content = '' +result_token = [] + +class JSONEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, Decimal): + return float(obj) + return json.JSONEncoder.default(self, obj) + + +def createIndex(content, result_token, section_order): + result_objects = [{ + "task":"ner", + "key":"qavanin_ner", + "label":"خروجی تیم همتا", + "values":result_token + } ] + output ={ + "content": content, + "domain": "پیکره قوانین آزمایشی", + "ref_id": "", + "ref_url": "", + "result_objects": result_objects, + "order": str(section_order), + } + # print(output) + # print(json.dumps(output, indent=4, ensure_ascii=False)) + return output + +def appendResultToken(text, key, begin, tokenNumber, result_token): + end = -1 + # if key == 'HALFREFERENCE' : + # key = 'H_REF' + # elif key == 'REFERENCE' : + # key = 'REF' + if key: + # if key == 'org' : + # key = 'ORG' + # elif key == 'loc' : + # key = 'LOC' + # elif key == 'Facility' : + # key = 'fac' + # elif key == 'event' : + # key = 'EVENT' + # elif key == 'pro' : + # key = 'PRO' + # elif key == 'pers' : + # key = 'PER' + + end = tokenNumber -1 + result_token.append({ + "begin": begin, + "end": end, + "result_key": key, + "text" : 
text
+        })
+    begin = -1
+    end = -1
+    key = ''
+    return key, begin, end, result_token
+
+bulk_data = []
+bulk_count = 1
+count = 0
+section_order = 0
+text = ''
+
+for i, line in enumerate(lines):
+    print('line: ' + str(i))
+    count += 1
+    tokenNumber = tokenNumber + 1
+
+    if line.strip() == '':
+        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
+        section_order += 1
+        data = createIndex(content, result_token, section_order)
+        tokenNumber = -1
+        content = ''
+        result_token = []
+
+        bulk_data.append(data)
+        bulk_count += 1
+        if len(bulk_data) > 100:
+            print('=' * 30)
+            print('count ' + str(count))
+            payload = json.dumps(bulk_data, cls=JSONEncoder)
+            response = requests.post(url, headers=headers, data=payload)
+            print(response)
+            bulk_data = []
+            bulk_count = 1
+
+        continue
+
+    # each non-empty line is "token tag" in BIO format
+    parts = line.split()
+    if len(parts) != 2:
+        continue
+
+    content += ' ' + parts[0]
+
+    result_key = parts[1]
+    if result_key.startswith('I-'):
+        # continuation of the current entity: extend its text
+        text += ' ' + parts[0]
+        continue
+
+    if result_key == 'O':
+        # outside any entity: close the entity collected so far, if any
+        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
+
+    if result_key.startswith('B-'):
+        # a new entity starts: close the previous one and open this one
+        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
+
+        text = parts[0]
+        begin = tokenNumber
+        end = -1
+        key = result_key.replace('B-', '')
+
+if content != '':
+    key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
+    section_order += 1
+    data = createIndex(content, result_token, section_order)
+    bulk_data.append(data)
+    bulk_count += 1
+
+
+if len(bulk_data) > 0:
+    print(bulk_count)
+    payload = json.dumps(bulk_data, cls=JSONEncoder)
+    response = requests.post(url, headers=headers, data=payload)
+    print(response)
+
+# end of the import run
+print("***************** end *****************")
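All three import scripts repeat the same read-parse-batch-POST cycle. A minimal sketch of that shared pattern follows; the function name, parameters, and endpoint are placeholders for illustration, not the real service API:

import json
import requests

def post_in_batches(records, url, token, batch_size=100):
    # send `records` to the dataset endpoint in chunks of `batch_size`
    headers = {"Authorization": token, "Accept": "application/json"}
    batch = []
    for record in records:
        batch.append(record)
        if len(batch) >= batch_size:
            # flush a full batch and start collecting the next one
            response = requests.post(url, headers=headers, data=json.dumps(batch))
            print(response.status_code)
            batch = []
    if batch:
        # flush the final partial batch
        response = requests.post(url, headers=headers, data=json.dumps(batch))
        print(response.status_code)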