From 18db5b6fbc577b47408434c2c946ff7d1ba63534 Mon Sep 17 00:00:00 2001
From: ajokar
Date: Tue, 17 Sep 2024 16:45:26 +0000
Subject: [PATCH] Upload files to "import_data"

---
 import_data/hamta_dataset.py        |  175 ++++
 import_data/label_tokens.py         |   67 ++
 import_data/normalizer.py           | 1367 +++++++++++++++++++++++++++
 import_data/peyma_dataset.py        |  177 ++++
 import_data/qavanin_dataset_test.py |  172 ++++
 5 files changed, 1958 insertions(+)
 create mode 100644 import_data/hamta_dataset.py
 create mode 100644 import_data/label_tokens.py
 create mode 100644 import_data/normalizer.py
 create mode 100644 import_data/peyma_dataset.py
 create mode 100644 import_data/qavanin_dataset_test.py

diff --git a/import_data/hamta_dataset.py b/import_data/hamta_dataset.py
new file mode 100644
index 0000000..74dfb44
--- /dev/null
+++ b/import_data/hamta_dataset.py
@@ -0,0 +1,175 @@
+import json
+import requests
+from decimal import Decimal
+import os
+
+TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg0NTE3MTUsImp0aSI6InNWaDljNkdlRWdEK0IzM1N2UHNzbXkybkxUK0tWWnBjIiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcxOTc1MTcxNCwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.frkLaR1HYyfZ8sFhLtiuEZaBPwDPWKSxKDQpql6aOZc'
+ACCEPT = "application/json"
+HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT}
+
+# NOTE: the target endpoint is left empty here; set it before running this script.
+url = ""
+headers = HEADERS
+
+address = os.getcwd()
+if 'import_data' in address:
+    address += '/data/arman_train.txt'
+else:
+    address += '/import_data/data/arman_train.txt'
+
+# open the input text file (Arman training data in BIO format)
+with open(address, 'r', encoding='utf-8') as file:
+    input_text = file.read()
+
+# split the text into a list of lines
+lines = input_text.strip().split('\n')
+
+key = ''
+begin = -1
+end = -1
+tokenNumber = -1
+content = ''
+result_token = []
+
+class JSONEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, Decimal):
+            return float(obj)
+        return json.JSONEncoder.default(self, obj)
+
+
+# def createIndex(content, result_token):
+#     output = {
+#         "content": content,
+#         "domain": "پیکره آرمان",
+#         "ref_id": "",
+#         "ref_url": "",
+#         "result_token": result_token,
+#     }
+#     # print(output)
+#     # print(json.dumps(output, indent=4, ensure_ascii=False))
+#     return output
+
+def createIndex(content, result_token):
+    result_objects = [{
+        "task": "ner",
+        "key": "qavanin_keyword",
+        "label": "خروجی تیم همتا",
+        "values": result_token
+    }]
+    output = {
+        "content": content,
+        "domain": "پیکره قوانین(کلیدواژه)",
+        "ref_id": "",
+        "ref_url": "",
+        "result_objects": result_objects,
+    }
+    # print(output)
+    # print(json.dumps(output, indent=4, ensure_ascii=False))
+    return output
+
+
+def appendResultToken(text, key, begin, tokenNumber, result_token):
+    end = -1
+    # if key == 'HALFREFERENCE':
+    #     key = 'H_REF'
+    # elif key == 'REFERENCE':
+    #     key = 'REF'
+    if key:
+        # map the corpus tag names onto the label set used by the dataset service
+        if key == 'org':
+            key = 'ORG'
+        elif key == 'loc':
+            key = 'LOC'
+        elif key == 'Facility':
+            key = 'fac'
+        elif key == 'event':
+            key = 'EVENT'
+        elif key == 'pro':
+            key = 'PRO'
+        elif key == 'pers':
+            key = 'PER'
+
+        end = tokenNumber - 1
+        result_token.append({
+            "begin": begin,
+            "end": end,
+            "result_key": key,
+            "text": text
+        })
+    begin = -1
+    end = -1
+    key = ''
+    return key, begin, end, result_token
+
+bulk_data = []
+bulk_count = 1
+count = 0
+text = ''
+
+for i, line in enumerate(lines):
+    print('line: ' + str(i))
+    count += 1
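+    # tokenNumber is the index of the current token within the current sentence
+    # block; it is reset to -1 whenever a blank line closes the block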
tokenNumber = tokenNumber + 1 + + if line.strip() == '' : + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + + data = createIndex(content, result_token) + tokenNumber = -1 + content = '' + result_token = [] + + bulk_data.append(data) + bulk_count +=1 + if bulk_data.__len__() > 100: + print('=' * 30 ) + print('count ' + str(count)) + payload = json.dumps(bulk_data, cls=JSONEncoder) #Works! + response = requests.request("POST", url, headers=headers, data=payload) + print(response) + bulk_data = [] + bulk_count = 1 + + continue + + + + parts = line.split() + if len(parts) != 2: + continue + + + content += ' ' + parts[0] + + result_key = parts[1] + if result_key.startswith('I-'): + text += ' ' + parts[0] + continue + + if result_key == 'O' : + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + + if result_key.startswith('B-'): + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + + text = parts[0] + begin = tokenNumber + end = -1 + key = result_key.replace('B-', '') + + +if content != '' : + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + data = createIndex(content, result_token) + bulk_data.append(data) + bulk_count +=1 + + +if bulk_data.__len__() > 0: + print(bulk_count) + payload = json.dumps(bulk_data, cls=JSONEncoder) #Works! + response = requests.request("POST", url, headers=headers, data=payload) + print(response.text) + +# نمایش دیکشنری خروجی به صورت JSON +print("***************** end ") \ No newline at end of file diff --git a/import_data/label_tokens.py b/import_data/label_tokens.py new file mode 100644 index 0000000..4839882 --- /dev/null +++ b/import_data/label_tokens.py @@ -0,0 +1,67 @@ +from docx import Document +import re + +def extract_key_phrases(file_path): + doc = Document(file_path) + key_phrases = [] + pattern = re.compile(r'!(.*?)!') + + for paragraph in doc.paragraphs: + matches = pattern.findall(paragraph.text) + key_phrases.extend(matches) + + return key_phrases + +def extract_text_from_docx(file_path): + doc = Document(file_path) + text = '' + + for paragraph in doc.paragraphs: + text += paragraph.text + ' ' + + return text.strip() + +def tokenize_and_tag(text, key_phrases): + + key_phrases_sorted = sorted(key_phrases, key=len, reverse=True) + for key_phrase in key_phrases_sorted: + + text = text.replace(f'!{key_phrase}!', f' {key_phrase} ') + + tokens = re.findall(r'\S+|[^\s\w]', text) + tagged_tokens = [] + i = 0 + + while i < len(tokens): + token = tokens[i] + matched = False + + + for key_phrase in key_phrases_sorted: + key_tokens = key_phrase.split() + if tokens[i:i + len(key_tokens)] == key_tokens: + tagged_tokens.append((key_tokens[0], 'B-key')) + for key_token in key_tokens[1:]: + tagged_tokens.append((key_token, 'I-key')) + i += len(key_tokens) + matched = True + break + + if not matched: + tagged_tokens.append((token, 'O')) + i += 1 + + return tagged_tokens + +def write_tokens_to_file(tagged_tokens, output_file): + with open(output_file, 'w', encoding='utf-8') as f: + for token, tag in tagged_tokens: + f.write(f"{token}\t{tag}\n") + +file_path = 'D:/project/output.docx' +output_file = 'output.txt' + +text = extract_text_from_docx(file_path) +key_phrases = extract_key_phrases(file_path) +tagged_tokens = tokenize_and_tag(text, key_phrases) +write_tokens_to_file(tagged_tokens, output_file) \ No newline at end of file diff --git a/import_data/normalizer.py 
b/import_data/normalizer.py new file mode 100644 index 0000000..863fe34 --- /dev/null +++ b/import_data/normalizer.py @@ -0,0 +1,1367 @@ +from re import sub +import copy +import os +from tokenizer import Tokenizer +from data_helper import DataHelper +import general_functions as gf +import traceback,sys + +class Normalizer(): + + def __init__(self, + half_space_char='\u200c', + date_normalizing_needed=False, + pinglish_conversion_needed=False, + train_file_path="resource/tokenizer/Bijan_khan_chunk.txt", + token_merger_path="resource/tokenizer/TokenMerger.pckl"): + self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/" + + self.dic1_path = self.dir_path + 'resource/normalizer/Dic1_new.txt' + self.dic2_path = self.dir_path + 'resource/normalizer/Dic2_new.txt' + self.dic3_path = self.dir_path + 'resource/normalizer/Dic3_new.txt' + self.dic1 = self.load_dictionary(self.dic1_path) + self.dic2 = self.load_dictionary(self.dic2_path) + self.dic3 = self.load_dictionary(self.dic3_path) + + self.date_normalizing_needed = date_normalizing_needed + self.pinglish_conversion_needed = pinglish_conversion_needed + self.data_helper = DataHelper() + + + if self.date_normalizing_needed or self.pinglish_conversion_needed: + self.tokenizer = Tokenizer() + self.date_normalizer = DateNormalizer() + self.pinglish_conversion = PinglishNormalizer() + + + + def load_dictionary(self, file_path): + dict = {} + with open(file_path, 'r', encoding='utf-8') as f: + g = f.readlines() + for Wrds in g: + wrd = Wrds.split(' ') + dict[wrd[0].strip()] = sub('\n', '', wrd[1].strip()) + return dict + + def sub_alphabets(self, doc_string): + # try: + # doc_string = doc_string.decode('utf-8') + # except UnicodeEncodeError: + # pass + a0 = "ء" + b0 = "ئ" + c0 = sub(a0, b0, doc_string) + a1 = r"ٲ|ٱ|إ|ﺍ|أ" + a11 = r"ﺁ|آ" + b1 = r"ا" + b11 = r"آ" + c11 = sub(a11, b11, c0) + c1 = sub(a1, b1, c11) + a2 = r"ﺐ|ﺏ|ﺑ" + b2 = r"ب" + c2 = sub(a2, b2, c1) + a3 = r"ﭖ|ﭗ|ﭙ|ﺒ|ﭘ" + b3 = r"پ" + c3 = sub(a3, b3, c2) + a4 = r"ﭡ|ٺ|ٹ|ﭞ|ٿ|ټ|ﺕ|ﺗ|ﺖ|ﺘ" + b4 = r"ت" + c4 = sub(a4, b4, c3) + a5 = r"ﺙ|ﺛ" + b5 = r"ث" + c5 = sub(a5, b5, c4) + a6 = r"ﺝ|ڃ|ﺠ|ﺟ" + b6 = r"ج" + c6 = sub(a6, b6, c5) + a7 = r"ڃ|ﭽ|ﭼ" + b7 = r"چ" + c7 = sub(a7, b7, c6) + a8 = r"ﺢ|ﺤ|څ|ځ|ﺣ" + b8 = r"ح" + c8 = sub(a8, b8, c7) + a9 = r"ﺥ|ﺦ|ﺨ|ﺧ" + b9 = r"خ" + c9 = sub(a9, b9, c8) + a10 = r"ڏ|ډ|ﺪ|ﺩ" + b10 = r"د" + c10 = sub(a10, b10, c9) + a11 = r"ﺫ|ﺬ|ﻧ" + b11 = r"ذ" + c11 = sub(a11, b11, c10) + a12 = r"ڙ|ڗ|ڒ|ڑ|ڕ|ﺭ|ﺮ" + b12 = r"ر" + c12 = sub(a12, b12, c11) + a13 = r"ﺰ|ﺯ" + b13 = r"ز" + c13 = sub(a13, b13, c12) + a14 = r"ﮊ" + b14 = r"ژ" + c14 = sub(a14, b14, c13) + a15 = r"ݭ|ݜ|ﺱ|ﺲ|ښ|ﺴ|ﺳ" + b15 = r"س" + c15 = sub(a15, b15, c14) + a16 = r"ﺵ|ﺶ|ﺸ|ﺷ" + b16 = r"ش" + c16 = sub(a16, b16, c15) + a17 = r"ﺺ|ﺼ|ﺻ" + b17 = r"ص" + c17 = sub(a17, b17, c16) + a18 = r"ﺽ|ﺾ|ﺿ|ﻀ" + b18 = r"ض" + c18 = sub(a18, b18, c17) + a19 = r"ﻁ|ﻂ|ﻃ|ﻄ" + b19 = r"ط" + c19 = sub(a19, b19, c18) + a20 = r"ﻆ|ﻇ|ﻈ" + b20 = r"ظ" + c20 = sub(a20, b20, c19) + a21 = r"ڠ|ﻉ|ﻊ|ﻋ" + b21 = r"ع" + c21 = sub(a21, b21, c20) + a22 = r"ﻎ|ۼ|ﻍ|ﻐ|ﻏ" + b22 = r"غ" + c22 = sub(a22, b22, c21) + a23 = r"ﻒ|ﻑ|ﻔ|ﻓ" + b23 = r"ف" + c23 = sub(a23, b23, c22) + a24 = r"ﻕ|ڤ|ﻖ|ﻗ" + b24 = r"ق" + c24 = sub(a24, b24, c23) + a25 = r"ڭ|ﻚ|ﮎ|ﻜ|ﮏ|ګ|ﻛ|ﮑ|ﮐ|ڪ|ك" + b25 = r"ک" + c25 = sub(a25, b25, c24) + a26 = r"ﮚ|ﮒ|ﮓ|ﮕ|ﮔ" + b26 = r"گ" + c26 = sub(a26, b26, c25) + a27 = r"ﻝ|ﻞ|ﻠ|ڵ" + b27 = r"ل" + c27 = sub(a27, b27, c26) + a28 = r"ﻡ|ﻤ|ﻢ|ﻣ" + b28 = r"م" + c28 = sub(a28, b28, c27) + a29 = r"ڼ|ﻦ|ﻥ|ﻨ" + b29 = r"ن" + c29 = sub(a29, b29, c28) + a30 = r"ވ|ﯙ|ۈ|ۋ|ﺆ|ۊ|ۇ|ۏ|ۅ|ۉ|ﻭ|ﻮ|ؤ" + 
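+        # fold every waw variant (Arabic, Urdu, Unicode presentation forms) onto the plain Persian letter و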
b30 = r"و" + c30 = sub(a30, b30, c29) + a31 = r"ﺔ|ﻬ|ھ|ﻩ|ﻫ|ﻪ|ۀ|ە|ة|ہ" + b31 = r"ه" + c31 = sub(a31, b31, c30) + a32 = r"ﭛ|ﻯ|ۍ|ﻰ|ﻱ|ﻲ|ں|ﻳ|ﻴ|ﯼ|ې|ﯽ|ﯾ|ﯿ|ێ|ے|ى|ي" + b32 = r"ی" + c32 = sub(a32, b32, c31) + a33 = r'¬' + b33 = r'‌' + c33 = sub(a33, b33, c32) + pa0 = r'•|·|●|·|・|∙|。|ⴰ' + pb0 = r'.' + pc0 = sub(pa0, pb0, c33) + pa1 = r',|٬|٫|‚|,' + pb1 = r'،' + pc1 = sub(pa1, pb1, pc0) + pa2 = r'ʕ' + pb2 = r'؟' + pc2 = sub(pa2, pb2, pc1) + na0 = r'۰|٠' + nb0 = r'0' + nc0 = sub(na0, nb0, pc2) + na1 = r'۱|١' + nb1 = r'1' + nc1 = sub(na1, nb1, nc0) + na2 = r'۲|٢' + nb2 = r'2' + nc2 = sub(na2, nb2, nc1) + na3 = r'۳|٣' + nb3 = r'3' + nc3 = sub(na3, nb3, nc2) + na4 = r'۴|٤' + nb4 = r'4' + nc4 = sub(na4, nb4, nc3) + na5 = r'۵' + nb5 = r'5' + nc5 = sub(na5, nb5, nc4) + na6 = r'۶|٦' + nb6 = r'6' + nc6 = sub(na6, nb6, nc5) + na7 = r'۷|٧' + nb7 = r'7' + nc7 = sub(na7, nb7, nc6) + na8 = r'۸|٨' + nb8 = r'8' + nc8 = sub(na8, nb8, nc7) + na9 = r'۹|٩' + nb9 = r'9' + nc9 = sub(na9, nb9, nc8) + np2 = r'²' + nm2 = r'2' + ng2 = sub(np2, nm2, nc9) + ea1 = r'|ِ|ُ|َ|ٍ|ٌ|ً|' + eb1 = r'' + ec1 = sub(ea1, eb1, ng2) + ea1 = r'ـ' + eb1 = r'_' + ec2 = sub(ea1, eb1, ec1) + Sa1 = r'( )+' + Sb1 = r' ' + Sc1 = sub(Sa1, Sb1, ec2) + Sa2 = r'(\n)+' + Sb2 = r'\n' + Sc2 = sub(Sa2, Sb2, Sc1) + return Sc2 + + def space_correction(self, doc_string): + a00 = r'^(بی|می|نمی)( )' + b00 = r'\1‌' + c00 = sub(a00, b00, doc_string) + a0 = r'( )(می|نمی|بی)( )' + b0 = r'\1\2‌' + c0 = sub(a0, b0, c00) + a1 = r'( )(هایی|ها|های|ایی|هایم|هایت|هایش|هایمان|هایتان|هایشان|ات|ان|ین' \ + r'|انی|بان|ام|ای|یم|ید|اید|اند|بودم|بودی|بود|بودیم|بودید|بودند|ست)( )' + b1 = r'‌\2\3' + c1 = sub(a1, b1, c0) + a2 = r'( )(شده|نشده)( )' + b2 = r'‌\2‌' + c2 = sub(a2, b2, c1) + a3 = r'( )(طلبان|طلب|گرایی|گرایان|شناس|شناسی|گذاری|گذار|گذاران|شناسان|گیری|پذیری|بندی|آوری|سازی|' \ + r'بندی|کننده|کنندگان|گیری|پرداز|پردازی|پردازان|آمیز|سنجی|ریزی|داری|دهنده|آمیز|پذیری' \ + r'|پذیر|پذیران|گر|ریز|ریزی|رسانی|یاب|یابی|گانه|گانه‌ای|انگاری|گا|بند|رسانی|دهندگان|دار)( )' + b3 = r'‌\2\3' + c3 = sub(a3, b3, c2) + return c3 + + def space_correction_plus1(self, doc_string): + out_sentences = '' + for wrd in doc_string.split(' '): + try: + out_sentences = out_sentences + ' ' + self.dic1[wrd] + except KeyError: + out_sentences = out_sentences + ' ' + wrd + return out_sentences + + def space_correction_plus2(self, doc_string): + out_sentences = '' + wrds = doc_string.split(' ') + L = wrds.__len__() + if L < 2: + return doc_string + cnt = 1 + for i in range(0, L - 1): + w = wrds[i] + wrds[i + 1] + try: + out_sentences = out_sentences + ' ' + self.dic2[w] + cnt = 0 + except KeyError: + if cnt == 1: + out_sentences = out_sentences + ' ' + wrds[i] + cnt = 1 + if cnt == 1: + out_sentences = out_sentences + ' ' + wrds[i + 1] + return out_sentences + + def space_correction_plus3(self, doc_string): + # Dict = {'گفتوگو': 'گفت‌وگو'} + out_sentences = '' + wrds = doc_string.split(' ') + L = wrds.__len__() + if L < 3: + return doc_string + cnt = 1 + cnt2 = 0 + for i in range(0, L - 2): + w = wrds[i] + wrds[i + 1] + wrds[i + 2] + try: + out_sentences = out_sentences + ' ' + self.dic3[w] + cnt = 0 + cnt2 = 2 + except KeyError: + if cnt == 1 and cnt2 == 0: + out_sentences = out_sentences + ' ' + wrds[i] + else: + cnt2 -= 1 + cnt = 1 + if cnt == 1 and cnt2 == 0: + out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] + elif cnt == 1 and cnt2 == 1: + out_sentences = out_sentences + ' ' + wrds[i + 2] + return out_sentences + + def normalize(self, doc_string, new_line_elimination=False, return_dates = 
False):
+        normalized_string = gf.normalYehKe(doc_string)
+        normalized_string = self.sub_alphabets(normalized_string)
+        #normalized_string = self.data_helper.clean_text(normalized_string, new_line_elimination).strip()
+
+        #normalized_string = self.space_correction(self.space_correction_plus1(self.space_correction_plus2(self.space_correction_plus3(normalized_string)))).strip()
+
+        # !!! merges two tokens into one, e.g. آئین نامه becomes آئین‌نامه
+        #normalized_string = (self.space_correction_plus1(self.space_correction_plus2(self.space_correction_plus3(normalized_string)))).strip()
+
+        #if self.pinglish_conversion_needed:
+            #normalized_string = self.pinglish_conversion.pingilish2persian(self.tokenizer.tokenize_words(normalized_string))
+
+        if self.date_normalizing_needed:
+            token_list = self.tokenizer.tokenize_words(normalized_string)
+            # normalize numbers written out in words, etc.
+            normalized_string, additional_token_index_array = self.date_normalizer.normalize_numbers2(token_list)
+            # normalize and recognize dates
+            normalized_string, dates, recognized_dates = self.date_normalizer.general_normalize_date(normalized_string, additional_token_index_array)
+            #normalized_string_list = normalized_string.strip().split()
+            #normalized_string, dates, recognized_dates = self.date_normalizer.normalize_dates(normalized_string_list, additional_token_index_array, token_list_len= len(normalized_string_list), previous_slice_index_array=[0,])
+            # handle runs of numbers that follow each other joined by the conjunction و
+            normalized_string, recognized_numbers = self.date_normalizer.handle_continuous_numbers(normalized_string)
+            # in cases like وسیصد, where the conjunction و is attached to the number سیصد,
+            # the two are first split apart to recover the real number; that split produces
+            # an extra token, and this method removes those extra tokens again
+            normalized_string = self.date_normalizer.remove_additional_tokens(normalized_string.split(), additional_token_index_array)
+            if return_dates:
+                # sort the recognized dates by the index of the first token of each date expression
+                recognized_dates.sort(key=lambda x: int(x['start_date_token_index']), reverse=False)
+
+                return normalized_string, dates, recognized_dates, recognized_numbers
+        return normalized_string
+
+
+class DateNormalizer():
+    def __init__(self):
+
+        self.month_dict = {"فروردین": 1, "فروردینماه": 1, 'فروردین\u200cماه': 1,
+                           "اردیبهشت": 2, "اردیبهشتماه": 2, 'اردیبهشت\u200cماه': 2,
+                           "خرداد": 3, "خردادماه": 3,
+                           "تیر": 4, "تیرماه": 4,
+                           "مرداد": 5, "مردادماه": 5,
+                           "شهریور": 6, "شهریورماه": 6,
+                           "مهر": 7, "مهرماه": 7,
+                           "آبان": 8, "آبانماه": 8, 'آبان\u200cماه': 8,
+                           "آذر": 9, "آذرماه": 9,
+                           "دی": 10, "دیماه": 10, 'دی\u200cماه': 10,
+                           "بهمن": 11, "بهمنماه": 11, 'بهمن\u200cماه': 11,
+                           "اسفند": 12, "اسفندماه": 12}
+        self.num_dict = {"صد": 100, "وصد": 100, "یکصد": 100, "ویکصد": 100, "هزار": 1000, "وهزار": 1000,
+                         "یکهزار": 1000, "ویکهزار": 1000,
+                         "میلیون": 1000000, "ملیون": 1000000, "ومیلیون": 1000000, "وملیون": 1000000,
+                         "یکمیلیون": 1000000, "یکملیون": 1000000,
+                         "ویکمیلیون": 1000000, "ویکملیون": 1000000, "دویست": 200, "ودویست": 200,
+                         "ده": 10, "وده": 10, "نه": 9, "ونه": 9, "هشت": 8, "وهشت": 8, "هفت": 7, "وهفت": 7,
+                         "شش": 6, "وشش": 6, "پنج": 5, "وپنج": 5, "چهار": 4, "وچهار": 4, "سه": 3, "وسه": 3,
+                         "دو": 2, "ودو": 2, "یک": 1, "ویک": 1, "یازده": 11, "ویازده": 11, "سیزده": 13, "وسیزده": 13,
+                         "چهارده": 14, "دوازده": 12, "پانزده": 15, "شانزده": 16, "هفده": 17, "هیفده": 17,
+                         "وچهارده": 14, "ودوازده": 12, "وپانزده": 15, "وشانزده": 16, "وهفده": 17, "وهیفده": 17,
+                         "هجده": 18, "هیجده": 18, "نوزده": 19, "بیست": 20, "سی": 30, "چهل": 40, "پنجاه":
50, + "وهجده": 18,"وهیجده": 18, "ونوزده": 19, "وبیست": 20, "وسی": 30, "وچهل": 40, "وپنجاه": 50, + "شصت": 60, "هفتاد": 70, "نود": 90, "سیصد": 300, "چهارصد": 400, + "وشصت": 60, "وهفتاد": 70, "ونود": 90, "وسیصد": 300, "وچهارصد": 400, + "پانصد": 500, "ششصد": 600, "هفتصد": 700, "هشتصد": 800, "نهصد": 900, + "وپانصد": 500, "وششصد": 600, "وهفتصد": 700, "وهشتصد": 800, "ونهصد": 900, + "هشتاد": 80, " ": 0, "میلیارد": 1000000000,"ملیارد": 1000000000, + "یکمیلیارد": 1000000000,"یکملیارد": 1000000000, + "وهشتاد": 80, " ": 0, "ومیلیارد": 1000000000,"وملیارد": 1000000000, + "ویکمیلیارد": 1000000000,"ویکملیارد": 1000000000, + "صدم": 100, "هزارم": 1000, "دویستم": 200, + "وصدم": 100, "وهزارم": 1000, "ودویستم": 200, + "دهم": 10, "نهم": 9, "هشتم": 8, "هفتم": 7, "ششم": 6, "پنجم": 5, + "ودهم": 10, "ونهم": 9, "وهشتم": 8, "وهفتم": 7, "وششم": 6, "وپنجم": 5, + "چهارم": 4, "سوم": 3, "دوم": 2, "یکم": 1, "اول": 1, "یازدهم": 11, "سیزدهم": 13, + "وچهارم": 4, "وسوم": 3, "ودوم": 2, "ویکم": 1, "واول": 1, "ویازدهم": 11, "وسیزدهم": 13, + "چهاردهم": 14, "دوازدهم": 12, "پانزدهم": 15, "شانزدهم": 16, "هفدهم": 17,"هیفدهم": 17, + "وچهاردهم": 14, "ودوازدهم": 12, "وپانزدهم": 15, "وشانزدهم": 16, "وهفدهم": 17,"وهیفدهم": 17, + "هجدهم": 18,"هیجدهم": 18, "نوزدهم": 19, "بیستم": 20, "چهلم": 40, "پنجاهم": 50, + "وهجدهم": 18,"وهیجدهم": 18, "ونوزدهم": 19, "وبیستم": 20, "وچهلم": 40, "وپنجاهم": 50, + "شصتم": 60, "هفتادم": 70, "نودم": 90, "سیصدم": 300, "چهارصدم": 400, + "وشصتم": 60, "وهفتادم": 70, "ونودم": 90, "وسیصدم": 300, "وچهارصدم": 400, + "پانصدم": 500, "ششصدم": 600, "هفتصدم": 700, "هشتصدم": 800, "نهصدم": 900, + "وپانصدم": 500, "وششصدم": 600, "وهفتصدم": 700, "وهشتصدم": 800, "ونهصدم": 900, + "هشتادم": 80,"وهشتادم": 80} + + def find_date_part(self, token_list): + import re + for index, element in enumerate(token_list): + # بررسی الگوی تاریخ 1398/12/18 + pattern_date = r'^(\d{4}\s*/\s*([1-9]|0[1-9]|1[0-2])\s*/\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$' + date_pattern = re.compile(pattern_date) + match_date = date_pattern.match(element) + if match_date: + date_parts = match_date.string.split('/') + year = int(re.search(r'\d+', date_parts[0]).group()) + month = int(re.search(r'\d+', date_parts[1]).group()) + day = int(re.search(r'\d+', date_parts[2]).group()) + # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد + if day > 999: + day = int(re.search(r'\d+', date_parts[0]).group()) + year = int(re.search(r'\d+', date_parts[2]).group()) + formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) + return formal_date, index, index, index + + # 24 /12 /1401 بررسی الگوی تاریخ + pattern_date = r'\s*\d{2}\s*/\s*\d{2}\s*/\s*\d{4}\s*' + date_pattern = re.compile(pattern_date) + match_date = date_pattern.match(element) + if match_date: + date_parts = match_date.string.split('/') + year = int(re.search(r'\d+', date_parts[0]).group()) + month = int(re.search(r'\d+', date_parts[1]).group()) + day = int(re.search(r'\d+', date_parts[2]).group()) + # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد + if day > 999: + day = int(re.search(r'\d+', date_parts[0]).group()) + year = int(re.search(r'\d+', date_parts[2]).group()) + formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) + return formal_date, index, index, index + + # بررسی الگوی تاریخ 1402،10،06 + patterndate2 = r'^(\d{4}\s*،\s*([1-9]|0[1-9]|1[0-2])\s*،\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$' + date_pattern = re.compile(patterndate2) + match_date2 = date_pattern.match(element) + if match_date2: + date_parts = match_date2.string.split('،') 
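+                # the matched date uses the Arabic comma as separator (e.g. 1402،10،06): year، month، day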
+ year = int(re.search(r'\d+', date_parts[0]).group()) + month = int(re.search(r'\d+', date_parts[1]).group()) + day = int(re.search(r'\d+', date_parts[2]).group()) + # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد + if day > 999: + day = int(re.search(r'\d+', date_parts[0]).group()) + year = int(re.search(r'\d+', date_parts[2]).group()) + formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) + return formal_date, index, index, index + + # بررسی الگوی تاریخ 13ر04ر1345 + patterndate2 = r'^(\d{4}\s*ر\s*([1-9]|0[1-9]|1[0-2])\s*ر\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$' + date_pattern = re.compile(patterndate2) + match_date2 = date_pattern.match(element) + if match_date2: + date_parts = match_date2.string.split('ر') + year = int(re.search(r'\d+', date_parts[0]).group()) + month = int(re.search(r'\d+', date_parts[1]).group()) + day = int(re.search(r'\d+', date_parts[2]).group()) + # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد + if day > 999: + day = int(re.search(r'\d+', date_parts[0]).group()) + year = int(re.search(r'\d+', date_parts[2]).group()) + formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) + return formal_date, index, index, index + + if element == "/": + if index-1 >= 0 and index+1 < len(token_list) \ + and token_list[index -1].isdigit() and token_list[index+1].isdigit(): + if index+3 < len(token_list) and token_list[index+2] == "/" \ + and token_list[index + 3].isdigit(): + if int(token_list[index-1]) < 1450 and int(token_list[index+1]) < 13 and \ + int(token_list[index+3]) < 32: + formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])] + formal_date = "y" + str(formal_date[0]) + "m" + str(formal_date[1]) + "d" + str(formal_date[2]) + #return formal_date, index-1, index+3, index + return formal_date, index, index, index + elif int(token_list[index-1]) < 32 and int(token_list[index+1]) < 13 and \ + int(token_list[index+3]) < 1450: + formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) + #return formal_date, index-1, index+3, index + return formal_date, index, index, index + else: + formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])] + formal_date = "num(" + str(formal_date[0]) + "/" + str(formal_date[1]) + "/" + str(formal_date[2]) + ")" + #return formal_date, index-1, index+3, index + return formal_date, index, index, index + elif (int(token_list[index-1]) < 1450 and int(token_list[index-1]) > 1250) and int(token_list[index+1]) < 13: + formal_date = [int(token_list[index-1]), int(token_list[index+ 1]), 0] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) + #return formal_date, index-1 , index+1, index + return formal_date, index , index, index + else: + formal_date = [int(token_list[index-1]), int(token_list[index+ 1])] + formal_date = "num(" + str(formal_date[0]) + "/" + str(formal_date[1]) + ")" + #return formal_date, index-1 , index+1, index + return formal_date, index , index, index + + if element in self.month_dict or element == "سال": + if index + 1 < len(token_list) and index - 1 > -2: + try: + start_date_index = end_date_index = 0 + if(token_list[index + 1] == 'ماه'): + if(token_list[index + 2] == 'سال' and str(token_list[index + 3]).isdigit()): + + if(int(token_list[index + 3])==1000): + # در این حالت، امکان دارد روز مربوط به 
این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد + # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم + start_date_index = index - 1 + day = int(token_list[index - 1]) + if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): + day += int(token_list[index - 3])# رقم دهگان روز + start_date_index = index - 3 + year = 1000 + if(len(token_list)>= index + 5): + if(token_list[index + 4]=='و' and token_list[index + 5].isdigit()): + year += int(token_list[index + 5])# رقم صدگان سال + end_date_index = index + 5 + if(len(token_list)>= index + 7): + if(token_list[index + 6]=='و' and token_list[index + 7].isdigit()): + year += int(token_list[index + 7])# رقم دهگان سال + end_date_index = index + 7 + if(len(token_list)>= index + 9): + if(token_list[index + 8]=='و' and token_list[index + 9].isdigit()): + year += int(token_list[index + 9])# رقم یکان سال + end_date_index = index + 9 + + formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 8 تیر ماه سال 1000 و 300 و 20 و 5 + + else: + start_date_index = index - 1 + end_date_index = index + 3 + day = int(token_list[index - 1]) + if(token_list[index - 2]=='و' and int(token_list[index - 1])<10 and int(token_list[index - 3])<31 and token_list[index - 3].isdigit()): + start_date_index = index - 3 + day += int(token_list[index - 3])# رقم دهگان روز + + formal_date = [day, int(self.month_dict[token_list[index]]), int(token_list[index + 3])]# مثلا 20 و 7 تیر ماه سال 1368 + + + formal_date = [day, int(self.month_dict[token_list[index]]), int(token_list[index + 3])]# مثلا 27 تیر ماه سال 1368 + + elif(str(token_list[index + 2]).isdigit()): + if(int(token_list[index + 2])==1000): + + if token_list[index-1].strip() == 'ام': # مثلا 30 ام تیر ماه 1000 و 300 و 80 و 2 + start_date_index = index - 2 + day = int(token_list[index - 2]) + else: + # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد + # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم + start_date_index = index - 1 + day = int(token_list[index - 1]) + + if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): + day += int(token_list[index - 3])# رقم دهگان روز + start_date_index = index - 3 + year = 1000 + if(len(token_list)>= index + 4): + if(token_list[index + 3]=='و' and token_list[index + 4].isdigit()): + year += int(token_list[index + 4])# رقم صدگان سال + end_date_index = index + 4 + if(len(token_list)>= index + 6): + if(token_list[index + 5]=='و' and token_list[index + 6].isdigit()): + year += int(token_list[index + 6])# رقم دهگان سال + end_date_index = index + 6 + if(len(token_list)>= index + 8): + if(token_list[index + 7]=='و' and token_list[index + 8].isdigit()): + year += int(token_list[index + 8])# رقم یکان سال + end_date_index = index + 8 + formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 7 تیر ماه 1000 و 300 و 20 و 5 + + else: + formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])]# مثلا 27 تیر ماه سال 1368 + start_date_index = index - 1 + end_date_index = index + 2 + #formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])] # مثلا 27 تیر ماه 1368 + + elif(token_list[index + 1] == 'سال' and (not(token_list[index + 2]).isdigit())): + + formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])] # مثلا 27 تیر سال 1368 + start_date_index = index - 1 + end_date_index = index + 2 + else: + if(token_list[index + 
1] == 'سال' and str(token_list[index + 2]).isdigit()): + + if(int(token_list[index + 2])==1000): + + # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد + # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم + start_date_index = index - 1 + day = int(token_list[index - 1]) + if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): + day += int(token_list[index - 3])# رقم دهگان روز + start_date_index = index - 3 + year = 1000 + if(len(token_list)>= index + 4): + if(token_list[index + 3]=='و' and token_list[index + 4].isdigit()): + year += int(token_list[index + 4])# رقم صدگان سال + end_date_index = index + 4 + if(len(token_list)>= index + 6): + if(token_list[index + 5]=='و' and token_list[index + 6].isdigit()): + year += int(token_list[index + 6])# # رقم دهگان سال !!! برای تاریخ بین 1399 تا 1410 هم همین روال درست بر می گرداند، هرچند منطق غلطی دارد چون مثلا سال 1402 رقم صدگان ندارد + end_date_index = index + 6 + if(len(token_list)>= index + 8): + if((len(token_list)>= index + 7 and token_list[index + 7]=='و') and (len(token_list)>= index + 8 and token_list[index + 8].isdigit())): + year += int(token_list[index + 8])# رقم یکان سال + end_date_index = index + 8 + + formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 8 تیر ماه سال 1000 و 300 و 20 و 5 + + else: + formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])]# مثلا 27 تیر سال 1368 + start_date_index = index - 1 + end_date_index = index + 2 + + elif(str(token_list[index + 1]).isdigit()): + if(int(token_list[index + 1])==1000): + # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد + # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم + start_date_index = index - 1 + day = int(token_list[index - 1]) + if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): + day += int(token_list[index - 3])# رقم دهگان روز + start_date_index = index - 3 + year = 1000 + if(len(token_list)>= index + 3): + if(token_list[index + 2]=='و' and token_list[index + 3].isdigit()): + year += int(token_list[index + 3])# رقم صدگان سال + end_date_index = index + 3 + if(len(token_list)>= index + 5): + if(token_list[index + 4]=='و' and token_list[index + 5].isdigit()): + year += int(token_list[index + 5])# رقم دهگان سال !!! 
برای تاریخ بین 1399 تا 1410 هم همین روال درست بر می گرداند، هرچند منطق غلطی دارد چون مثلا سال 1402 رقم صدگان ندارد + end_date_index = index + 5 + if(len(token_list)>= index + 7): + if(token_list[index + 6]=='و' and token_list[index + 7].isdigit()): + year += int(token_list[index + 7])# رقم یکان سال + end_date_index = index + 7 + formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 7 تیر ماه 1000 و 300 و 20 و 5 + else: + formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 1])]# مثلا 27 تیر 1320‌ + start_date_index = index - 1 + end_date_index = index + 1 + + else: + formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 1])]# مثلا 27 تیر 1365 + start_date_index = index - 1 + end_date_index = index + 1 + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) + if token_list[index - 1] and token_list[index + 1]: + return formal_date, start_date_index, end_date_index, start_date_index + + except Exception as e: + error = e.args[0] + try: + formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), 0] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0])#مثلا y1358m12d0 + return formal_date, index-1, index, index-1 + except: + try: + # مثلا سال 1400 + if token_list[index] == "سال": + formal_date = [int(token_list[index + 1]),0, 0] + formal_date = "y" + str(formal_date[0]) + "m" + str(formal_date[1]) + "d" + str(formal_date[2])# y1358m0d0 + return formal_date, index, index+1, index + elif token_list[index+1] == "ماه" and index + 1 < len(token_list): + if index + 2 < len(token_list) and token_list[index + 2].isdigit() and \ + int(token_list[index-2]) < 1450: + formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+2])] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) # y0m12d5 + return formal_date, index, index+2, index + else: + formal_date = [0,int(self.month_dict[token_list[index]]),0] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) #y0m12d0 + return formal_date, index, index+1, index + elif index + 1 < len(token_list) and token_list[index + 1].isdigit() and int(token_list[index-2]) < 1450: + formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+1])] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) # y0m12d5 + return formal_date, index, index+1, index + elif index-2 >= 0 and index+2 < len(token_list) and \ + token_list[index - 1] == "/" and token_list[index+1] == "/" and \ + token_list[index - 2].isdigit() and int(token_list[index-2]) < 32 and \ + token_list[index + 2].isdigit() and int(token_list[index+2]) < 1450: + formal_date = [int(token_list[index-2]),int(self.month_dict[token_list[index]]),int(token_list[index+2])] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) + return formal_date, index-2, index+2, index-2 + elif index + 2 < len(token_list) and token_list[index + 1] == 'سال' and \ + token_list[index + 2].isdigit() and int(token_list[index-2]) < 1450 : + formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+2])] + formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) + return formal_date, index, index+1, index + #else: + # print("Name : %s -> 
after1: %s -> after2: %s -> after3: %s" % (token_list[index], token_list[index+1], token_list[index+2], token_list[index+3])) + except: + pass + + def general_normalize_date(self,normalized_string, additional_token_index_array): + # 24 /12 /1401 جهت بررسی الگوی تاریخ + normalized_string, recognized_dates = DateNormalizer.separated_date_format_finder(normalized_string) + normalized_string_list = normalized_string.strip().split() + # جهت یافتن فرمت های مختلف تاریخ غیر از فرمت بررسی شده در بالا + normalized_string, dates, recognized_dates_2 = \ + self.normalize_dates(normalized_string_list, additional_token_index_array, + token_list_len = len(normalized_string_list), previous_slice_index_array=[0,]) + # تجمیع همه تاریخ ها + for item in recognized_dates_2: + recognized_dates.append(item) + return normalized_string, dates, recognized_dates + + # 24 /12 /1401 متد بررسی الگوی تاریخ + def separated_date_format_finder(normalized_string): + import re + date_pattern = r'\d{1,2} /\d{1,2} /\d{2,4}' + regex = re.compile(date_pattern) + match_dates = regex.finditer(normalized_string) + recognized_dates = [] + for date_item in match_dates: + position = date_item.span() + founded_item = date_item.group() + start_index = date_item.start() + 1 + end_index = date_item.end() - 1 + current_date = founded_item.replace(' ', '') + date_parts = current_date.split('/') + formal_date = "y" + str(date_parts[2]) + "m" + str(date_parts[1]) + "d" + str(date_parts[0]) + pattern_tokens_state = gf.token_state_finder(normalized_string, start_index, end_index) + recognized_dates.append( + { + "original_date" : founded_item, + "date" : formal_date, + "start_index" : start_index, + "end_index" : end_index, + "date_token_index" : pattern_tokens_state["start_token_state"], + "start_date_token_index": pattern_tokens_state["start_token_state"], + "end_date_token_index" : pattern_tokens_state["end_token_state"], + }) + normalized_string_list = normalized_string.strip().split() + for date_item in recognized_dates: + normalized_string_list[int(date_item['date_token_index'])] = date_item['date'] + normalized_string_list[int(date_item['start_date_token_index'])+1] = 'tttt' + normalized_string_list[int(date_item['end_date_token_index'])] = 'tttt' + normalized_string_temp = '' + for token in normalized_string_list: + normalized_string_temp = normalized_string_temp + ' ' + token + return normalized_string_temp.strip(), recognized_dates + + def normalize_dates(self, token_list, additional_token_index_array, token_list_len, previous_slice_index_array= []): + try: + finded = self.find_date_part(token_list) + except Exception as e: + finded = None + + filename = 'law_section_errors.txt' + exc_type, exc_value, exc_traceback = sys.exc_info() + frame = exc_traceback.tb_frame + function_array = traceback.extract_tb(exc_traceback) + current_function = function_array[len(function_array)-1] + error = f''' + filename : {current_function.filename} + function : {current_function.name} + err line no : {current_function.lineno} + err line : {current_function.line} + err message : {e} + ''' + gf.save_error(error, filename) + + recognized_dates = [] + if finded != None: + date_part = finded[0] + start_date_token_index = finded[1] + end_date_token_index = finded[2] + date_token_index = finded[3] + befor_date_part = " ".join(x for x in token_list[:start_date_token_index]) + after_date_part = [x for x in token_list[end_date_token_index + 1:]] + previous_slice_index = token_list_len - len(after_date_part) + 
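+            # remember the absolute offset of this tail slice so that token indices
+            # returned by the recursive call below can be mapped back into the full token list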
previous_slice_index_array.append(previous_slice_index) + after_normalized, after_dates, recognized_dates = self.normalize_dates(after_date_part, additional_token_index_array, token_list_len, previous_slice_index_array) + if after_dates == '': + after_dates = [] + if recognized_dates == '': + recognized_dates = [] + after_dates.insert(0, date_part) + previous_slice_index = previous_slice_index_array.pop(len(previous_slice_index_array)-2) + recognized_dates.append( + { + "date" : date_part, + "date_token_index" : date_token_index + previous_slice_index, + "start_date_token_index": start_date_token_index + previous_slice_index, + "end_date_token_index" : end_date_token_index + previous_slice_index + }) + + i = 0 + while((date_token_index - start_date_token_index) > i ): + befor_date_part = ''.join([befor_date_part," tttt"]) # به عنوان توکنی که از جنس تاریخ بوده t + i += 1 + i = 0 + while((end_date_token_index - date_token_index) > i ): + date_part = ''.join([date_part," tttt"]) # به عنوان توکنی که از جنس تاریخ بوده t + i += 1 + + return befor_date_part + " " + date_part + " " + after_normalized, after_dates, recognized_dates + else: + return " ".join(x for x in token_list), '','' + + def list2num(self, numerical_section_list): + value = 1 + l = len(numerical_section_list)>3 + if l : + value = 0 + for index, el in enumerate(numerical_section_list): + if self.is_number(el): + value = self.num_dict[el] + elif el == '000': + #value *= 1000 + pass + elif l == True: + value += float(el) + else: + value *= float(el) + return value + + def convert2num(self, numerical_section_list): + value = 0 + tmp_section_list = [] + for index, el in enumerate(numerical_section_list): + if self.is_number(el) or (el.replace('.', '', 1).isdigit()): + tmp_section_list.append(el) + elif el == "و": + value += self.list2num(tmp_section_list) + tmp_section_list[:] = [] + if len(tmp_section_list) > 0: + value += self.list2num(tmp_section_list) + tmp_section_list[:] = [] + try: + if (value-int(value) == 0): + return int(value) + else: + return value + except: + return 0 + + + def is_number(self, word): + return word in self.num_dict + + def find_number_location(self, token_list, addWithVaa = False): + start_index = 0 + number_section =[] + for i , el in enumerate(token_list): + if self.is_number(el) or (el.replace('.', '', 1).isdigit()): + start_index = i + number_section.append(start_index) + break + + i = start_index+1 + while(i < len(token_list)): + if token_list[i] == "و" and (i+1)= 0: + # تشخیص اینکه یک سلسله از اعداد که با واو از هم جدا شده اند تبصره های مربوط به یک ماده نباشند + is_tabsare_numbers = (token_list[token_index-2] + ' ' + token_list[token_index-1]=='تبصره های') + if token_index - 1 >= 0: + # تشخیص اینکه یک سلسله از اعداد که با واو از هم جدا شده اند مواد یک قانون نباشند + is_mavad_numbers = (token_list[token_index-1]=='مواد') + if token_index - 1 >= 0: + # تشخیص اینکه یک سلسله از اعداد که با واو از هم جدا شده اند ماده های یک قانون نباشند + is_madde_numbers = (token_list[token_index-1]=='ماده') + if is_tabsare_numbers or is_mavad_numbers or is_madde_numbers: + flag = False + start_token_index = end_token_index = token_index + number_parts_array.clear() + current_token_index = token_index + number_parts_array.append(token) + if current_token_index + 1 < len(token_list): + while ((flag) and (token_list[current_token_index+1] == 'و' or token_list[current_token_index+1].isdigit())): + number_parts_array.append(token_list[current_token_index+1]) + current_token_index += 1 + end_token_index = current_token_index 
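+                    # keep widening [start_token_index, end_token_index] while the tokens continue the «number و number» chain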
+ if not (current_token_index + 1 < len(token_list)): + break + + final_number = number_completter(number_parts_array) + token_list[token_index] = str(final_number) + recognized_numbers.append({ + 'number_value' : final_number, + 'number_token_list': number_parts_array, + 'start_token_index': start_token_index, + 'end_token_index' : end_token_index + }) + # جایگذاری مقدار رشته بی ارزش به جای توکن های مربوط به عدد جاری + # به منظور اینکه متن اصلی از نظر تعداد توکن تغییر نکند + for i in range(token_index + 1 , len(number_parts_array) + token_index): + token_list[i] = 'nnnn' + converted_string = converted_string + ' ' + token_list[token_index] + if token_index + 1 < len(token_list): + if flag == False and (token_list[token_index+1]=='و'): + flag = False + else: + flag = True + else: + converted_string = converted_string + ' ' + token + + return converted_string, recognized_numbers + + + + + def remove_additional_tokens(self,token_list,additional_token_index_array): + converted_string = '' + for index in additional_token_index_array: + del token_list[index] + + for token in token_list: + converted_string = converted_string + ' ' + token + return converted_string + +class PinglishNormalizer(): + def __init__(self): + self.data_helper = DataHelper() + self.file_dir = os.path.dirname(os.path.realpath(__file__)) + "/" + + self.en_dict_filename = self.file_dir + "resource/tokenizer/enDict" + self.en_dict = self.data_helper.load_var(self.en_dict_filename) + + self.fa_dict_filename = self.file_dir + "resource/tokenizer/faDict" + self.fa_dict = self.data_helper.load_var(self.fa_dict_filename) + + + def pingilish2persian(self, pinglish_words_list): + + for i, word in enumerate(pinglish_words_list): + if word in self.en_dict: + pinglish_words_list[i] = self.en_dict[word]#.decode("utf-8") + #inp = inp.replace(word, enDict[word], 1) + else: + ch = self.characterize(word) + pr = self.map_char(ch) + amir = self.make_word(pr) + for wd in amir: + am = self.escalation(wd) + asd = ''.join(am) + if asd in self.fa_dict: + pinglish_words_list[i] = asd#.decode("utf-8") + #inp = inp.replace(word, asd, 1) + inp = " ".join(x for x in pinglish_words_list) + return inp + + def characterize(self, word): + list_of_char = [] + i = 0 + while i < len(word): + char = word[i] + sw_out = self.switcher(char) + if (sw_out == None): + esp_out = None + if(i < len(word) - 1): + esp_out = self.esp_check(word[i], word[i + 1]) + if(esp_out == None): + list_of_char.append(word[i]) + else: + list_of_char.append(esp_out) + i += 1 + else: + list_of_char.append(sw_out) + i += 1 + return list_of_char + + def switcher(self, ch): + switcher = { + "c": None, + "k": None, + "z": None, + "s": None, + "g": None, + "a": None, + "u": None, + "e": None, + "o": None + } + return switcher.get(ch, ch) + + def esp_check(self, char1, char2): + st = char1 + char2 + if (st == "ch"): + return "ch" + elif (st == "kh"): + return "kh" + elif (st == "zh"): + return "zh" + elif (st == "sh"): + return "sh" + elif (st == "gh"): + return "gh" + elif (st == "aa"): + return "aa" + elif (st == "ee"): + return "ee" + elif (st == "oo"): + return "oo" + elif (st == "ou"): + return "ou" + else: + return None + + def map_char(self, word): + listm = [] + sw_out = self.map_switcher(word[0]) + i = 0 + if (sw_out == None): + listm.append(["ا"]) + i += 1 + if (word[0] == "oo"): + listm.append(["او"]) + i += 1 + while i < len(word): + listm.append(self.char_switcher(word[i])) + i += 1 + if word[len(word) - 1] == "e": + listm.append(["ه"]) + elif word[len(word) - 1] == "a": + 
listm.append(["ا"]) + elif word[len(word) - 1] == "o": + listm.append(["و"]) + elif word[len(word) - 1] == "u": + listm.append(["و"]) + + return listm + + def map_switcher(self, ch): + switcher = { + "a": None, + "e": None, + "o": None, + "u": None, + "ee": None, + + "ou": None + } + return switcher.get(ch, ch) + + def make_word(self, chp): + word_list = [[]] + for char in chp: + word_list_temp = [] + for tmp_word_list in word_list: + for chch in char: + tmp = copy.deepcopy(tmp_word_list) + tmp.append(chch) + word_list_temp.append(tmp) + word_list = word_list_temp + return word_list + + def escalation(self, word): + tmp = [] + i = 0 + t = len(word) + while i < t - 1: + tmp.append(word[i]) + if word[i] == word[i + 1]: + i += 1 + i += 1 + if i != t: + tmp.append(word[i]) + return tmp + + def char_switcher(self, ch): + switcher = { + 'a': ["", "ا"], + 'c': ["ث", "ص", "ص"], + 'h': ["ه", "ح"], + 'b': ["ب"], + 'p': ["پ"], + 't': ["ت", "ط"], + 's': ["س", "ص", "ث"], + 'j': ["ج"], + 'ch': ["چ"], + 'kh': ["خ"], + 'q': ["ق", "غ"], + 'd': ["د"], + 'z': ["ز", "ذ", "ض", "ظ"], + 'r': ["ر"], + 'zh': ["ژ"], + 'sh': ["ش"], + 'gh': [",ق", "غ"], + 'f': ["ف"], + 'k': ["ک"], + 'g': ["گ"], + 'l': ["ل"], + 'm': ["م"], + 'n': ["ن"], + 'v': ["و"], + 'aa': ["ا"], + 'ee': ["ی"], + 'oo': ["و"], + 'ou': ["و"], + 'i': ["ی"], + 'y': ["ی"], + ' ': [""], + 'w': ["و"], + 'e': ["", "ه"], + 'o': ["", "و"] + } + return switcher.get(ch, "") + +# این روال جهت جمع و ضرب کردن بخش های مختلف از صدگان و هزارگان و بالاتر از یک عدد که با حروف در متن وجود دارد کار می کند +def number_completter2(number_parts_array): + zarb_value = 0 + temp_number_array = [] + sum_number_parts = 0 + final_number = 0 + for index,item in enumerate(number_parts_array): + if item.isdigit(): + temp_number_array.append(int(item)) + if index + 1 >= len(number_parts_array): + for item3 in temp_number_array: + final_number += int(item3) + return final_number + current_value = number_parts_array[index+1] + if (current_value.isdigit() and (int(current_value) == 1000 or int(current_value) == 1000000 or int(current_value) == 1000000000)):# or int(current_value) == 1000000 + zarb_value = int(number_parts_array[index+1]) + for num_item in temp_number_array: + sum_number_parts += num_item + final_number = sum_number_parts * zarb_value + temp_array2 = [] + x = index + 2 + while x < len(number_parts_array): + # for x in range(index+2,len(number_parts_array)): + temp_array2.append(number_parts_array[x]) + if x + 1 < len(number_parts_array): + if number_parts_array[x+1].isdigit() and (int(number_parts_array[x+1]) == 1000 or int(number_parts_array[x+1]) == 1000000 or int(number_parts_array[x+1]) == 1000000000): + if int(number_parts_array[x+1]) > zarb_value: # 1000000>1000 + zarb_value = int(number_parts_array[x+1]) + # تابع بازگشتی برای محاسبه عدد نهایی + final_number += number_completter2(temp_array2) + final_number *= zarb_value + temp_array2.clear() + zarb_value = 0 + x += 2 # به این دلیل که مقدار ضرب را در آرایه تمپ ذخیره نکند + else: # 1000!>1000000 + zarb_value = int(number_parts_array[x+1]) + temp_num = 0 + if (zarb_value == 1000 or zarb_value == 1000000) and len(number_parts_array) > x + 1: + num_array = number_parts_array[x+2:len(number_parts_array)] + temp_num = number_completter(num_array) + if temp_num == None: + temp_num = 0 + + '''for i in range(x+2,len(number_parts_array)): + if(number_parts_array[i].isdigit()): + temp_num += int(number_parts_array[i]) + i+=1''' + temp_num2 = 1 + for num_item2 in temp_array2: + if(num_item2.isdigit()): + temp_num2 += 
int(num_item2) + + temp_num2 *= zarb_value + final_number += temp_num + temp_num2 + break + else: + x += 1 # + else: # اگر از رنج آرایه خارج شدیم + temp_num = 0 + # مقادیر موجود در آرایه تمپ را جمع کن + for num_item3 in temp_array2: + if(num_item3.isdigit()): + temp_num += int(num_item3) + # حاصل جمع اعداد موجود در آرایه ذخیره را در عدد نهایی که قبلا داشته ایم ضرب کن و از حلقه خارج شو + final_number *= temp_num + #final_number += temp_num + break + return final_number + +# این روال جهت جمع و ضرب کردن بخش های مختلف از صدگان و هزارگان و بالاتر از یک عدد که با حروف در متن وجود دارد کار می کند +def number_completter(number_parts_array): + zarb_value = 0 + previous_zarb_value = 0 + previous_number_parts_sum = 0 + temp_number_array = [] + number_parts_sum = 0 + final_number = 0 + current_number_part = 0 + for index,item in enumerate(number_parts_array): + if item.isdigit(): + if (not(int(item) == 1000 or int(item) == 1000000 or int(item) == 1000000000)): + temp_number_array.append(item) + continue + elif((int(item) == 1000 or int(item) == 1000000 or int(item) == 1000000000)): + zarb_value = int(item) + for num_item in temp_number_array: + number_parts_sum += int(num_item) + if number_parts_sum == 0 and previous_number_parts_sum == 0: + number_parts_sum = 1 + temp_number_array.clear() + + else:# for example 952 + zarb_value = 1 + for num_item in temp_number_array: + number_parts_sum += int(num_item) + current_number_part = number_parts_sum + previous_number_parts_sum + continue + if previous_zarb_value < zarb_value:# for example 1000 < 1000000000 + current_number_part = previous_number_parts_sum + number_parts_sum + current_number_part = zarb_value * current_number_part + else:# previous_zarb_value > zarb_value + if number_parts_sum == 0: + number_parts_sum = 1 + current_number_part = zarb_value * number_parts_sum + current_number_part += previous_number_parts_sum + + previous_number_parts_sum = current_number_part + current_number_part = 0 + previous_zarb_value = zarb_value + number_parts_sum = 0 + if len(temp_number_array) != 0: + remained_parts_sum = 0 + for num_item in temp_number_array: + remained_parts_sum += int(num_item) + final_number = previous_number_parts_sum + remained_parts_sum + return final_number + + final_number = previous_number_parts_sum + return final_number + + + + + + \ No newline at end of file diff --git a/import_data/peyma_dataset.py b/import_data/peyma_dataset.py new file mode 100644 index 0000000..f22534f --- /dev/null +++ b/import_data/peyma_dataset.py @@ -0,0 +1,177 @@ +import json +import requests +from decimal import Decimal +import os + +TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MTg0NTE3MTUsImp0aSI6InNWaDljNkdlRWdEK0IzM1N2UHNzbXkybkxUK0tWWnBjIiwiaXNzIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImV4cCI6MTcxOTc1MTcxNCwiYXVkIjoiaHR0cHM6XC9cL2NwLnRhdmFzaS5pciIsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.frkLaR1HYyfZ8sFhLtiuEZaBPwDPWKSxKDQpql6aOZc' +ACCEPT = "application/json" +HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT} + +url = "https://api.tavasi.ir/repo/dataset/multi/add/peyma/ner" +headers = HEADERS + +address = os.getcwd() +if 'impoert_data' in address: + address += '/data/peyma_train.txt' +else: + address += '/impoert_data/data/peyma_train.txt' +# باز کردن فایل متنی +with open(address, 'r', encoding='utf-8') as file: + input_text = file.read() + +# تبدیل متن به لیستی از 
خطوط +lines = input_text.strip().split('\n') + +key = '' +begin = -1 +end = -1 +tokenNumber = -1 +content = '' +result_token = [] + +class JSONEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, Decimal): + return float(obj) + return json.JSONEncoder.default(self, obj) + + + +# def createIndex(content, result_token): +# output = { +# "content": content, +# "domain": "پیکره پیما", +# "ref_id": "", +# "ref_url": "", +# "result_token": result_token, +# } +# # print(output) +# # print(json.dumps(output, indent=4, ensure_ascii=False)) +# return output + +def createIndex(content, result_token): + result_objects = [{ + "task":"ner", + "key":"peyma_ner", + "label":"خروجی تیم پیما", + "values":result_token + } ] + output ={ + "content": content, + "domain": "پیکره پیما", + "ref_id": "", + "ref_url": "", + "result_objects": result_objects, + } + # print(output) + # print(json.dumps(output, indent=4, ensure_ascii=False)) + return output + +def appendResultToken(text, key, begin, tokenNumber, result_token): + end = -1 + # if key == 'HALFREFERENCE' : + # key = 'H_REF' + # elif key == 'REFERENCE' : + # key = 'REF' + if key: + if key == 'ORG' : + key = 'ORG' + elif key == 'MON' : + key = 'MON' + elif key == 'LOC' : + key = 'LOC' + elif key == 'DAT' : + key = 'DAT' + elif key == 'TIM' : + key = 'TIM' + elif key == 'PER' : + key = 'PER' + elif key == 'PCT' : + key = 'PCT' + + end = tokenNumber -1 + result_token.append({ + "begin": begin, + "end": end, + "result_key": key, + "text" : text + }) + begin = -1 + end = -1 + key = '' + return key, begin, end, result_token + +bulk_data = [] +bulk_count = 1 +count = 0 +text = '' + +for i, line in enumerate(lines): + print('line: ' + str(i)) + count += 1 + tokenNumber += 1 + + if line.strip() == '' : + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + + data = createIndex(content, result_token) + tokenNumber = -1 + content = '' + result_token = [] + + bulk_data.append(data) + bulk_count +=1 + if bulk_data.__len__() > 100: + print('=' * 30 ) + print('count ' + str(count)) + payload = json.dumps(bulk_data, cls=JSONEncoder) #Works! + response = requests.request("POST", url, headers=headers, data=payload) + print(response) + bulk_data = [] + bulk_count = 1 + + continue + + parts = line.split('|') + if len(parts) != 2: + continue + + result_key = parts[1] + token_part = (parts[0].strip()).split() + if len(token_part) > 1: + new_token_part = token_part[0].strip() + '\u200c' + token_part[1].strip() + content += ' ' + parts[0] + else: + content += ' ' + parts[0] + + if result_key.startswith('I_'): + text += ' ' + parts[0] + continue + + if result_key == 'O' : + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + + if result_key.startswith('B_'): + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + + text = parts[0] + begin = tokenNumber + end = -1 + key = result_key.replace('B_', '') + + +if content != '' : + key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token) + data = createIndex(content, result_token) + bulk_data.append(data) + bulk_count +=1 + + +if bulk_data.__len__() > 0: + print(bulk_count) + payload = json.dumps(bulk_data, cls=JSONEncoder) #Works! 
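+    # flush whatever is left in the final partial batch; full batches were already sent inside the loop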
+ response = requests.request("POST", url, headers=headers, data=payload) + print(response.text) + +# نمایش دیکشنری خروجی به صورت JSON +print("***************** end ") \ No newline at end of file diff --git a/import_data/qavanin_dataset_test.py b/import_data/qavanin_dataset_test.py new file mode 100644 index 0000000..4d46fad --- /dev/null +++ b/import_data/qavanin_dataset_test.py @@ -0,0 +1,172 @@ +from decimal import Decimal +import requests +import json +import os + +""" !!! place of token:: Application > Local Storage > id_token !!! """ + +# توکن جدید را از آدرس زیر دریافت و جایگزین می کنیم: +# api.tavasi.ir token and url +TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MjQ5Mzk3NTksImp0aSI6IktwbjlBdEV1ZGh2WngxZUFXOFRRRUFrcmpcL01xa2JmWiIsImlzcyI6bnVsbCwiZXhwIjoxNzI2MjM5NzU4LCJhdWQiOm51bGwsImRhdGEiOnsiaWQiOjQwLCJmaXJzdF9uYW1lIjoiXHUwNjM5XHUwNjQ1XHUwNjI3XHUwNjMxIiwibGFzdF9uYW1lIjoiXHUwNjJjXHUwNjQ4XHUwNmE5XHUwNjI3XHUwNjMxIiwiZW1haWwiOiJham9rYXI5MUB5YWhvby5jb20iLCJ1c2VybmFtZSI6ImFqb2thciIsInVzZXJfbGV2ZWwiOjF9fQ.r_A8IMQdN50ZM8oPmIq31Pz-phPDJyfQYLmbsLjv4Ao' +url = "https://api.tavasi.ir/repo/dataset/multi/add/qasection_test/ner" + +# Mortaza Server token and url +# توکن جدید را از آدرس زیر دریافت و جایگزین می کنیم: +# 192.168.23.160 token +#TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MjUxMDk1NDAsImp0aSI6IjdWWUhCTFhiMVJFb3VpY2dORzE1dHRkcnBObU1cL0JobCIsImlzcyI6bnVsbCwiZXhwIjoxNzI2NDA5NTM5LCJhdWQiOm51bGwsImRhdGEiOnsiaWQiOjEsImZpcnN0X25hbWUiOiJcdTA2MjhcdTA2MzFcdTA2NDZcdTA2MjdcdTA2NDVcdTA2NDcgXHUwNjQ2XHUwNjQ4XHUwNmNjXHUwNjMzIiwibGFzdF9uYW1lIjoiXHUwNjQxXHUwNjQ2XHUwNmNjIiwiZW1haWwiOiJkZXZAZ21haWwuY29tIiwidXNlcm5hbWUiOiJkZXYiLCJ1c2VyX2xldmVsIjoyfX0.WayvUXRnBukSyc6o_Qv4dClTv9Hn75uOTcCfzGQ9El8' +#url = "http://192.168.23.160:8080/repo/dataset/multi/add/qasection_test/ner" + +ACCEPT = "application/json" +HEADERS = {"Authorization": TOKEN, "Accept": ACCEPT} +headers = HEADERS + +address = './impoert_data/data/sample_dataset.txt' +# address = os.getcwd() +# if 'impoert_data' in address: +# address += '/data/sample_dataset.txt' ### +# else: +# address += '/impoert_data/data/sample_dataset.txt' +# باز کردن فایل متنی +with open(address, 'r', encoding='utf-8') as file: + input_text = file.read() + +# تبدیل متن به لیستی از خطوط +lines = input_text.strip().split('\n') + +key = '' +begin = -1 +end = -1 +tokenNumber = -1 +content = '' +result_token = [] + +class JSONEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, Decimal): + return float(obj) + return json.JSONEncoder.default(self, obj) + + +def createIndex(content, result_token, section_order): + result_objects = [{ + "task":"ner", + "key":"qavanin_ner", + "label":"خروجی تیم همتا", + "values":result_token + } ] + output ={ + "content": content, + "domain": "پیکره قوانین آزمایشی", + "ref_id": "", + "ref_url": "", + "result_objects": result_objects, + "order": str(section_order), + } + # print(output) + # print(json.dumps(output, indent=4, ensure_ascii=False)) + return output + +def appendResultToken(text, key, begin, tokenNumber, result_token): + end = -1 + # if key == 'HALFREFERENCE' : + # key = 'H_REF' + # elif key == 'REFERENCE' : + # key = 'REF' + if key: + # if key == 'org' : + # key = 'ORG' + # elif key == 'loc' : + # key = 'LOC' + # elif key == 'Facility' : + # key = 'fac' + # elif key == 'event' : + # key = 'EVENT' + # elif key == 'pro' : + # key = 'PRO' + # elif key == 'pers' : + # key = 'PER' + + end = tokenNumber -1 + result_token.append({ + "begin": begin, + "end": end, + "result_key": key, + "text" : 
text
+        })
+    begin = -1
+    end = -1
+    key = ''
+    return key, begin, end, result_token
+
+bulk_data = []
+bulk_count = 1
+count = 0
+section_order = 0
+text = ''
+
+for i, line in enumerate(lines):
+    print('line: ' + str(i))
+    count += 1
+    tokenNumber = tokenNumber + 1
+
+    if line.strip() == '':
+        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
+        section_order += 1
+        data = createIndex(content, result_token, section_order)
+        tokenNumber = -1
+        content = ''
+        result_token = []
+
+        bulk_data.append(data)
+        bulk_count += 1
+        if len(bulk_data) > 100:
+            print('=' * 30)
+            print('count ' + str(count))
+            payload = json.dumps(bulk_data, cls=JSONEncoder)
+            response = requests.post(url, headers=headers, data=payload)
+            print(response)
+            bulk_data = []
+            bulk_count = 1
+
+        continue
+
+    # each non-empty line is "token tag" in BIO format
+    parts = line.split()
+    if len(parts) != 2:
+        continue
+
+    content += ' ' + parts[0]
+
+    result_key = parts[1]
+    if result_key.startswith('I-'):
+        # continuation of the current entity: extend its text
+        text += ' ' + parts[0]
+        continue
+
+    if result_key == 'O':
+        # outside any entity: close the entity collected so far, if any
+        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
+
+    if result_key.startswith('B-'):
+        # a new entity starts: close the previous one and open this one
+        key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
+
+        text = parts[0]
+        begin = tokenNumber
+        end = -1
+        key = result_key.replace('B-', '')
+
+if content != '':
+    key, begin, end, result_token = appendResultToken(text, key, begin, tokenNumber, result_token)
+    section_order += 1
+    data = createIndex(content, result_token, section_order)
+    bulk_data.append(data)
+    bulk_count += 1
+
+
+if len(bulk_data) > 0:
+    print(bulk_count)
+    payload = json.dumps(bulk_data, cls=JSONEncoder)
+    response = requests.post(url, headers=headers, data=payload)
+    print(response)
+
+# end of the import run
+print("***************** end *****************")
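All three import scripts repeat the same read-parse-batch-POST cycle. A minimal sketch of that shared pattern follows; the function name, parameters, and endpoint are placeholders for illustration, not the real service API:

import json
import requests

def post_in_batches(records, url, token, batch_size=100):
    # send `records` to the dataset endpoint in chunks of `batch_size`
    headers = {"Authorization": token, "Accept": "application/json"}
    batch = []
    for record in records:
        batch.append(record)
        if len(batch) >= batch_size:
            # flush a full batch and start collecting the next one
            response = requests.post(url, headers=headers, data=json.dumps(batch))
            print(response.status_code)
            batch = []
    if batch:
        # flush the final partial batch
        response = requests.post(url, headers=headers, data=json.dumps(batch))
        print(response.status_code)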