# Flair_NER/normalizer.py

from cleantext import clean
# from scipy.linalg import triu
# import hazm
import re
from re import sub
import copy
import os
from tokenizer import Tokenizer
# from data_helper import DataHelper
#import general_functions as gf
import traceback,sys

class Normalizer():

    def __init__(self,
                 half_space_char='\u200c',
                 date_normalizing_needed=False,
                 pinglish_conversion_needed=False,
                 train_file_path="resource/tokenizer/Bijan_khan_chunk.txt",
                 token_merger_path="resource/tokenizer/TokenMerger.pckl"):
        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"

        self.dic1_path = self.dir_path + 'resource/normalizer/Dic1_new.txt'
        self.dic2_path = self.dir_path + 'resource/normalizer/Dic2_new.txt'
        self.dic3_path = self.dir_path + 'resource/normalizer/Dic3_new.txt'
        self.dic1 = self.load_dictionary(self.dic1_path)
        self.dic2 = self.load_dictionary(self.dic2_path)
        self.dic3 = self.load_dictionary(self.dic3_path)

        self.date_normalizing_needed = date_normalizing_needed
        self.pinglish_conversion_needed = pinglish_conversion_needed
        # self.data_helper = DataHelper()


        if self.date_normalizing_needed or self.pinglish_conversion_needed:
            self.tokenizer = Tokenizer()
            self.date_normalizer = DateNormalizer()
            self.pinglish_conversion = PinglishNormalizer()


    def load_dictionary(self, file_path):
        dict = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            g = f.readlines()
            for Wrds in g:
                wrd = Wrds.split(' ')
                dict[wrd[0].strip()] = sub('\n', '', wrd[1].strip())
        return dict

    def sub_alphabets(self, doc_string):
        # try:
        #     doc_string = doc_string.decode('utf-8')
        # except UnicodeEncodeError:
        #     pass
        # a0 = "ء"
        # b0 = "ئ"
        #c0 = sub(a0, b0, doc_string)
        a11 = r"ﺁ|آ"
        b11 = r"آ"
        a1 = r"ٲ|ٱ|إ|ﺍ|أ"
        b1 = r"ا"
        c11 = sub(a11, b11, doc_string)
        c1 = sub(a1, b1, c11)
        a2 = r"ﺐ|ﺏ|ﺑ"
        b2 = r"ب"
        c2 = sub(a2, b2, c1)
        a3 = r"ﭖ|ﭗ|ﭙ|ﺒ|ﭘ"
        b3 = r"پ"
        c3 = sub(a3, b3, c2)
        a4 = r"ﭡ|ٺ|ٹ|ﭞ|ٿ|ټ|ﺕ|ﺗ|ﺖ|ﺘ"
        b4 = r"ت"
        c4 = sub(a4, b4, c3)
        a5 = r"ﺙ|ﺛ"
        b5 = r"ث"
        c5 = sub(a5, b5, c4)
        a6 = r"ﺝ|ڃ|ﺠ|ﺟ"
        b6 = r"ج"
        c6 = sub(a6, b6, c5)
        a7 = r"ڃ|ﭽ|ﭼ"
        b7 = r"چ"
        c7 = sub(a7, b7, c6)
        a8 = r"ﺢ|ﺤ|څ|ځ|ﺣ"
        b8 = r"ح"
        c8 = sub(a8, b8, c7)
        a9 = r"ﺥ|ﺦ|ﺨ|ﺧ"
        b9 = r"خ"
        c9 = sub(a9, b9, c8)
        a10 = r"ڏ|ډ|ﺪ|ﺩ"
        b10 = r"د"
        c10 = sub(a10, b10, c9)
        a11 = r"ﺫ|ﺬ|ﻧ"
        b11 = r"ذ"
        c11 = sub(a11, b11, c10)
        a12 = r"ڙ|ڗ|ڒ|ڑ|ڕ|ﺭ|ﺮ"
        b12 = r"ر"
        c12 = sub(a12, b12, c11)
        a13 = r"ﺰ|ﺯ"
        b13 = r"ز"
        c13 = sub(a13, b13, c12)
        a14 = r"ﮊ"
        b14 = r"ژ"
        c14 = sub(a14, b14, c13)
        a15 = r"ݭ|ݜ|ﺱ|ﺲ|ښ|ﺴ|ﺳ"
        b15 = r"س"
        c15 = sub(a15, b15, c14)
        a16 = r"ﺵ|ﺶ|ﺸ|ﺷ"
        b16 = r"ش"
        c16 = sub(a16, b16, c15)
        a17 = r"ﺺ|ﺼ|ﺻ"
        b17 = r"ص"
        c17 = sub(a17, b17, c16)
        a18 = r"ﺽ|ﺾ|ﺿ|ﻀ"
        b18 = r"ض"
        c18 = sub(a18, b18, c17)
        a19 = r"ﻁ|ﻂ|ﻃ|ﻄ"
        b19 = r"ط"
        c19 = sub(a19, b19, c18)
        a20 = r"ﻆ|ﻇ|ﻈ"
        b20 = r"ظ"
        c20 = sub(a20, b20, c19)
        a21 = r"ڠ|ﻉ|ﻊ|ﻋ"
        b21 = r"ع"
        c21 = sub(a21, b21, c20)
        a22 = r"ﻎ|ۼ|ﻍ|ﻐ|ﻏ"
        b22 = r"غ"
        c22 = sub(a22, b22, c21)
        a23 = r"ﻒ|ﻑ|ﻔ|ﻓ"
        b23 = r"ف"
        c23 = sub(a23, b23, c22)
        a24 = r"ﻕ|ڤ|ﻖ|ﻗ"
        b24 = r"ق"
        c24 = sub(a24, b24, c23)
        a25 = r"ڭ|ﻚ|ﮎ|ﻜ|ﮏ|ګ|ﻛ|ﮑ|ﮐ|ڪ|ك"
        b25 = r"ک"
        c25 = sub(a25, b25, c24)
        a26 = r"ﮚ|ﮒ|ﮓ|ﮕ|ﮔ"
        b26 = r"گ"
        c26 = sub(a26, b26, c25)
        a27 = r"ﻝ|ﻞ|ﻠ|ڵ"
        b27 = r"ل"
        c27 = sub(a27, b27, c26)
        a28 = r"ﻡ|ﻤ|ﻢ|ﻣ"
        b28 = r"م"
        c28 = sub(a28, b28, c27)
        a29 = r"ڼ|ﻦ|ﻥ|ﻨ"
        b29 = r"ن"
        c29 = sub(a29, b29, c28)
        a30 = r"ވ|ﯙ|ۈ|ۋ|ﺆ|ۊ|ۇ|ۏ|ۅ|ۉ|ﻭ|ﻮ|ؤ"
        b30 = r"و"
        c30 = sub(a30, b30, c29)
        a31 = r"ﺔ|ﻬ|ھ|ﻩ|ﻫ|ﻪ|ۀ|ە|ة|ہ"
        b31 = r"ه"
        c31 = sub(a31, b31, c30)
        a32 = r"ﭛ|ﻯ|ۍ|ﻰ|ﻱ|ﻲ|ں|ﻳ|ﻴ|ﯼ|ې|ﯽ|ﯾ|ﯿ|ێ|ے|ى|ي"
        b32 = r"ی"
        c32 = sub(a32, b32, c31)
        a33 = r'¬'
        b33 = r'‌'
        c33 = sub(a33, b33, c32)
        pa0 = r'•|·|●|·|・|∙|｡|ⴰ'
        pb0 = r'.'
        pc0 = sub(pa0, pb0, c33)
        pa1 = r',|٬|٫|‚|，'
        pb1 = r'،'
        pc1 = sub(pa1, pb1, pc0)
        pa2 = r'ʕ'
        pb2 = r'؟'
        pc2 = sub(pa2, pb2, pc1)
        na0 = r'۰|٠'
        nb0 = r'0'
        nc0 = sub(na0, nb0, pc2)
        na1 = r'۱|١'
        nb1 = r'1'
        nc1 = sub(na1, nb1, nc0)
        na2 = r'۲|٢'
        nb2 = r'2'
        nc2 = sub(na2, nb2, nc1)
        na3 = r'۳|٣'
        nb3 = r'3'
        nc3 = sub(na3, nb3, nc2)
        na4 = r'۴|٤'
        nb4 = r'4'
        nc4 = sub(na4, nb4, nc3)
        na5 = r'۵'
        nb5 = r'5'
        nc5 = sub(na5, nb5, nc4)
        na6 = r'۶|٦'
        nb6 = r'6'
        nc6 = sub(na6, nb6, nc5)
        na7 = r'۷|٧'
        nb7 = r'7'
        nc7 = sub(na7, nb7, nc6)
        na8 = r'۸|٨'
        nb8 = r'8'
        nc8 = sub(na8, nb8, nc7)
        na9 = r'۹|٩'
        nb9 = r'9'
        nc9 = sub(na9, nb9, nc8)
        np2 = r'²'
        nm2 = r'2'
        ng2 = sub(np2, nm2, nc9)
        ea1 = r'|ِ|ُ|َ|ٍ|ٌ|ً|'
        eb1 = r''
        ec1 = sub(ea1, eb1, ng2)
        ea1 = r'ـ'
        eb1 = r'_'
        ec2 = sub(ea1, eb1, ec1)
        Sa1 = r'( )+'
        Sb1 = r' '
        Sc1 = sub(Sa1, Sb1, ec2)
        Sa2 = r'(\n)+'
        Sb2 = r'\n'
        Sc2 = sub(Sa2, Sb2, Sc1)
        return Sc2

    def space_correction(self, doc_string):
        a00 = r'^(بی|می|نمی)( )'
        b00 = r'\1‌'
        c00 = sub(a00, b00, doc_string)
        a0 = r'( )(می|نمی|بی)( )'
        b0 = r'\1\2‌'
        c0 = sub(a0, b0, c00)
        a1 = r'( )(هایی|ها|های|ایی|هایم|هایت|هایش|هایمان|هایتان|هایشان|ات|ان|ین' \
             r'|انی|بان|ام|ای|یم|ید|اید|اند|بودم|بودی|بود|بودیم|بودید|بودند|ست)( )'
        b1 = r'‌\2\3'
        c1 = sub(a1, b1, c0)
        a2 = r'( )(شده|نشده)( )'
        b2 = r'‌\2‌'
        c2 = sub(a2, b2, c1)
        a3 = r'( )(طلبان|طلب|گرایی|گرایان|شناس|شناسی|گذاری|گذار|گذاران|شناسان|گیری|پذیری|بندی|آوری|سازی|' \
             r'بندی|کننده|کنندگان|گیری|پرداز|پردازی|پردازان|آمیز|سنجی|ریزی|داری|دهنده|آمیز|پذیری' \
             r'|پذیر|پذیران|گر|ریز|ریزی|رسانی|یاب|یابی|گانه|گانه‌ای|انگاری|گا|بند|رسانی|دهندگان|دار)( )'
        b3 = r'‌\2\3'
        c3 = sub(a3, b3, c2)
        return c3

    def space_correction_plus1(self, doc_string):
        out_sentences = ''
        for wrd in doc_string.split(' '):
            try:
                out_sentences = out_sentences + ' ' + self.dic1[wrd]
            except KeyError:
                out_sentences = out_sentences + ' ' + wrd
        return out_sentences

    def space_correction_plus2(self, doc_string):
        out_sentences = ''
        wrds = doc_string.split(' ')
        L = wrds.__len__()
        if L < 2:
            return doc_string
        cnt = 1
        for i in range(0, L - 1):
            w = wrds[i] + wrds[i + 1]
            try:
                out_sentences = out_sentences + ' ' + self.dic2[w]
                cnt = 0
            except KeyError:
                if cnt == 1:
                    out_sentences = out_sentences + ' ' + wrds[i]
                cnt = 1
        if cnt == 1:
            out_sentences = out_sentences + ' ' + wrds[i + 1]
        return out_sentences

    def space_correction_plus3(self, doc_string):
        # Dict = {'گفتوگو': 'گفت‌وگو'}
        out_sentences = ''
        wrds = doc_string.split(' ')
        L = wrds.__len__()
        if L < 3:
            return doc_string
        cnt = 1
        cnt2 = 0
        for i in range(0, L - 2):
            w = wrds[i] + wrds[i + 1] + wrds[i + 2]
            try:
                out_sentences = out_sentences + ' ' + self.dic3[w]
                cnt = 0
                cnt2 = 2
            except KeyError:
                if cnt == 1 and cnt2 == 0:
                    out_sentences = out_sentences + ' ' + wrds[i]
                else:
                    cnt2 -= 1
                cnt = 1
        if cnt == 1 and cnt2 == 0:
            out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2]
        elif cnt == 1 and cnt2 == 1:
            out_sentences = out_sentences + ' ' + wrds[i + 2]
        return out_sentences

    def normalize(self, doc_string, new_line_elimination=False, return_dates = False):
        normalized_string = gf.normalize_content(doc_string)
        normalized_string = self.sub_alphabets(doc_string)
        #normalized_string = self.data_helper.clean_text(normalized_string, new_line_elimination).strip()

        #normalized_string = self.space_correction(self.space_correction_plus1(self.space_correction_plus2(self.space_correction_plus3(normalized_string)))).strip()

        # !!!آئین نامه را به آئین‌نامه تبدیل می کند و دو توکن را به یکی تبدیل می کند
        #normalized_string = (self.space_correction_plus1(self.space_correction_plus2(self.space_correction_plus3(normalized_string)))).strip()

        #if self.pinglish_conversion_needed:
            #normalized_string = self.pinglish_conversion.pingilish2persian(self.tokenizer.tokenize_words(normalized_string))

        if self.date_normalizing_needed:
            token_list = self.tokenizer.tokenize_words(normalized_string)
            # نرمالایز کردن اعداد حروفی و ...
            normalized_string, additional_token_index_array = self.date_normalizer.normalize_numbers2(token_list)
            # نرمالایز و تشخیص تاریخ ها
            normalized_string, dates, recognized_dates = self.date_normalizer.general_normalize_date(normalized_string, additional_token_index_array)
            #normalized_string_list = normalized_string.strip().split()
            #normalized_string, dates, recognized_dates = self.date_normalizer.normalize_dates(normalized_string_list, additional_token_index_array, token_list_len= len(normalized_string_list), previous_slice_index_array=[0,])
            # کنترل اعدادی که پشت سر هم آمده اند و با واو از هم جدا شده اند
            normalized_string, recognized_numbers = self.date_normalizer.handle_continuous_numbers(normalized_string)
        # در مواردی مانند وسیصد که واو به عدد سیصد چسبیده، ابتدا این دو را از هم جدا می کنیم
        # تا عدد اصلی را به دست بیاوریم. در این صورت یک توکن اضافه تولید می شود که با این متد، توکن های اضافی را حذف می کنیم
        normalized_string =self.date_normalizer.remove_additional_tokens(normalized_string.split(), additional_token_index_array)
        if return_dates:
            # مرتب کردن آرایه تاریخ های پیدا شده بر اساس توکن شروع عبارت حاوی تاریخ
            recognized_dates.sort(key=lambda x: int(x['start_date_token_index']), reverse=False)

            return normalized_string, dates, recognized_dates, recognized_numbers
        else:
            return normalized_string


class DateNormalizer():
    def __init__(self):

        self.month_dict = { "فروردین": 1,"فروردینماه": 1,'فروردین\u200cماه':1,
                            "اردیبهشت": 2,"اردیبهشتماه": 2,'اردیبهشت\u200cماه':2,
                            "خرداد": 3,"خردادماه": 3,
                            "تیر": 4,"تیرماه": 4,
                            "مرداد": 5,"مردادماه": 5,
                            "شهریور": 6,"شهریورماه": 6,
                            "مهر": 7,"مهرماه": 7,
                            "آبان": 8,"آبانماه": 8,'آبان\u200cماه':8,
                            "آذر": 9,"آذرماه": 9,
                            "دی": 10,"دیماه": 10,'دی\u200cماه':10,
                            "بهمن": 11,"بهمنماه": 11,'بهمن\u200cماه':11,
                            "اسفند": 12,"اسفندماه": 12}
        self.num_dict = {"صد": 100,"وصد": 100, "یکصد": 100, "ویکصد": 100, "هزار": 1000,"وهزار": 1000,
                         "یکهزار": 1000,"ویکهزار": 1000,
                         "میلیون": 1000000,"ملیون": 1000000, "ومیلیون": 1000000,"وملیون": 1000000,
                         "یکمیلیون": 1000000,"یکملیون": 1000000,
                         "ویکمیلیون": 1000000,"ویکملیون": 1000000, "دویست": 200,"ودویست": 200,
                         "ده": 10,"وده": 10, "نه": 9,"ونه": 9, "هشت": 8,"وهشت": 8, "هفت": 7,"وهفت": 7,
                         "شش": 6,"وشش": 6, "پنج": 5,"وپنج": 5,"چهار": 4,"وچهار": 4, "سه": 3,"وسه": 3,
                         "دو": 2,"ودو": 2, "یک": 1,"ویک": 1, "یازده": 11,"ویازده": 11, "سیزده": 13, "وسیزده": 13,
                         "چهارده": 14, "دوازده": 12, "پانزده": 15, "شانزده": 16, "هفده": 17,"هیفده": 17,
                         "وچهارده": 14, "ودوازده": 12, "وپانزده": 15, "وشانزده": 16, "وهفده": 17,"وهیفده": 17,
                         "هجده": 18,"هیجده": 18, "نوزده": 19, "بیست": 20, "سی": 30, "چهل": 40, "پنجاه": 50,
                         "وهجده": 18,"وهیجده": 18, "ونوزده": 19, "وبیست": 20, "وسی": 30, "وچهل": 40, "وپنجاه": 50,
                         "شصت": 60, "هفتاد": 70, "نود": 90, "سیصد": 300, "چهارصد": 400,
                         "وشصت": 60, "وهفتاد": 70, "ونود": 90, "وسیصد": 300, "وچهارصد": 400,
                         "پانصد": 500, "ششصد": 600, "هفتصد": 700, "هشتصد": 800, "نهصد": 900,
                         "وپانصد": 500, "وششصد": 600, "وهفتصد": 700, "وهشتصد": 800, "ونهصد": 900,
                         "هشتاد": 80, " ": 0, "میلیارد": 1000000000,"ملیارد": 1000000000,
                         "یکمیلیارد": 1000000000,"یکملیارد": 1000000000,
                         "وهشتاد": 80, " ": 0, "ومیلیارد": 1000000000,"وملیارد": 1000000000,
                         "ویکمیلیارد": 1000000000,"ویکملیارد": 1000000000,
                         "صدم": 100, "هزارم": 1000, "دویستم": 200,
                         "وصدم": 100, "وهزارم": 1000, "ودویستم": 200,
                         "دهم": 10, "نهم": 9, "هشتم": 8, "هفتم": 7, "ششم": 6, "پنجم": 5,
                         "ودهم": 10, "ونهم": 9, "وهشتم": 8, "وهفتم": 7, "وششم": 6, "وپنجم": 5,
                         "چهارم": 4, "سوم": 3, "دوم": 2, "یکم": 1, "اول": 1, "یازدهم": 11, "سیزدهم": 13,
                         "وچهارم": 4, "وسوم": 3, "ودوم": 2, "ویکم": 1, "واول": 1, "ویازدهم": 11, "وسیزدهم": 13,
                         "چهاردهم": 14, "دوازدهم": 12, "پانزدهم": 15, "شانزدهم": 16, "هفدهم": 17,"هیفدهم": 17,
                         "وچهاردهم": 14, "ودوازدهم": 12, "وپانزدهم": 15, "وشانزدهم": 16, "وهفدهم": 17,"وهیفدهم": 17,
                         "هجدهم": 18,"هیجدهم": 18, "نوزدهم": 19, "بیستم": 20, "چهلم": 40, "پنجاهم": 50,
                         "وهجدهم": 18,"وهیجدهم": 18, "ونوزدهم": 19, "وبیستم": 20, "وچهلم": 40, "وپنجاهم": 50,
                         "شصتم": 60, "هفتادم": 70, "نودم": 90, "سیصدم": 300, "چهارصدم": 400,
                         "وشصتم": 60, "وهفتادم": 70, "ونودم": 90, "وسیصدم": 300, "وچهارصدم": 400,
                         "پانصدم": 500, "ششصدم": 600, "هفتصدم": 700, "هشتصدم": 800, "نهصدم": 900,
                         "وپانصدم": 500, "وششصدم": 600, "وهفتصدم": 700, "وهشتصدم": 800, "ونهصدم": 900,
                         "هشتادم": 80,"وهشتادم": 80}

    def find_date_part(self, token_list):
        import re
        for index, element in enumerate(token_list):
            # بررسی الگوی تاریخ 1398/12/18
            pattern_date = r'^(\d{4}\s*/\s*([1-9]|0[1-9]|1[0-2])\s*/\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$'
            date_pattern = re.compile(pattern_date)
            match_date   = date_pattern.match(element)
            if match_date:
                date_parts = match_date.string.split('/')
                year  = int(re.search(r'\d+', date_parts[0]).group())
                month = int(re.search(r'\d+', date_parts[1]).group())
                day   = int(re.search(r'\d+', date_parts[2]).group())
                # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد
                if day > 999:
                    day  = int(re.search(r'\d+', date_parts[0]).group())
                    year = int(re.search(r'\d+', date_parts[2]).group())
                formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day)
                return formal_date, index, index, index

            # 24 /12 /1401 بررسی الگوی تاریخ
            pattern_date = r'\s*\d{2}\s*/\s*\d{2}\s*/\s*\d{4}\s*'
            date_pattern = re.compile(pattern_date)
            match_date   = date_pattern.match(element)
            if match_date:
                date_parts = match_date.string.split('/')
                year       = int(re.search(r'\d+', date_parts[0]).group())
                month      = int(re.search(r'\d+', date_parts[1]).group())
                day        = int(re.search(r'\d+', date_parts[2]).group())
                # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد
                if day > 999:
                    day  = int(re.search(r'\d+', date_parts[0]).group())
                    year = int(re.search(r'\d+', date_parts[2]).group())
                formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day)
                return formal_date, index, index, index

            # بررسی الگوی تاریخ 1402،10،06
            patterndate2 = r'^(\d{4}\s*،\s*([1-9]|0[1-9]|1[0-2])\s*،\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$'
            date_pattern = re.compile(patterndate2)
            match_date2  = date_pattern.match(element)
            if match_date2:
                date_parts = match_date2.string.split('،')
                year  = int(re.search(r'\d+', date_parts[0]).group())
                month = int(re.search(r'\d+', date_parts[1]).group())
                day   = int(re.search(r'\d+', date_parts[2]).group())
                # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد
                if day > 999:
                    day  = int(re.search(r'\d+', date_parts[0]).group())
                    year = int(re.search(r'\d+', date_parts[2]).group())
                formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day)
                return formal_date, index, index, index

            # بررسی الگوی تاریخ 13ر04ر1345
            patterndate2 = r'^(\d{4}\s*ر\s*([1-9]|0[1-9]|1[0-2])\s*ر\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$'
            date_pattern = re.compile(patterndate2)
            match_date2  = date_pattern.match(element)
            if match_date2:
                date_parts = match_date2.string.split('ر')
                year  = int(re.search(r'\d+', date_parts[0]).group())
                month = int(re.search(r'\d+', date_parts[1]).group())
                day   = int(re.search(r'\d+', date_parts[2]).group())
                # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد
                if day > 999:
                    day  = int(re.search(r'\d+', date_parts[0]).group())
                    year = int(re.search(r'\d+', date_parts[2]).group())
                formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day)
                return formal_date, index, index, index

            if element == "/":
                if index-1 >= 0 and index+1 < len(token_list) \
                        and token_list[index -1].isdigit() and token_list[index+1].isdigit():
                    if index+3 < len(token_list) and token_list[index+2] == "/" \
                            and token_list[index + 3].isdigit():
                        if int(token_list[index-1]) < 1450 and int(token_list[index+1]) < 13 and \
                            int(token_list[index+3]) < 32:
                            formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])]
                            formal_date = "y" + str(formal_date[0]) + "m" + str(formal_date[1]) + "d" + str(formal_date[2])
                            #return formal_date, index-1, index+3, index
                            return formal_date, index, index, index
                        elif int(token_list[index-1]) < 32 and int(token_list[index+1]) < 13 and \
                            int(token_list[index+3]) < 1450:
                            formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])]
                            formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0])
                            #return formal_date, index-1, index+3, index
                            return formal_date, index, index, index
                        else:
                            formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])]
                            formal_date = "num(" + str(formal_date[0]) + "/" + str(formal_date[1]) + "/" + str(formal_date[2]) + ")"
                            #return formal_date, index-1, index+3, index
                            return formal_date, index, index, index
                    elif (int(token_list[index-1]) < 1450 and int(token_list[index-1]) > 1250)  and int(token_list[index+1]) < 13:
                        formal_date = [int(token_list[index-1]), int(token_list[index+ 1]), 0]
                        formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0])
                        #return formal_date, index-1 , index+1, index
                        return formal_date, index , index, index
                    else:
                        formal_date = [int(token_list[index-1]), int(token_list[index+ 1])]
                        formal_date = "num(" + str(formal_date[0]) + "/" + str(formal_date[1]) + ")"
                        #return formal_date, index-1 , index+1, index
                        return formal_date, index , index, index

            if element in self.month_dict or element == "سال":
                if index + 1 < len(token_list) and index - 1 > -2:
                    try:
                        start_date_index = end_date_index = 0
                        if(token_list[index + 1] == 'ماه'):
                            if(token_list[index + 2] == 'سال' and str(token_list[index + 3]).isdigit()):

                                if(int(token_list[index + 3])==1000):
                                    # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد
                                    # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم
                                    start_date_index = index - 1
                                    day = int(token_list[index - 1])
                                    if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()):
                                        day += int(token_list[index - 3])# رقم دهگان روز
                                        start_date_index = index - 3
                                    year = 1000
                                    if(len(token_list)>= index + 5):
                                        if(token_list[index + 4]=='و' and token_list[index + 5].isdigit()):
                                            year += int(token_list[index + 5])# رقم صدگان سال
                                            end_date_index = index + 5
                                    if(len(token_list)>= index + 7):
                                        if(token_list[index + 6]=='و' and token_list[index + 7].isdigit()):
                                            year += int(token_list[index + 7])# رقم دهگان سال
                                            end_date_index = index + 7
                                    if(len(token_list)>= index + 9):
                                        if(token_list[index + 8]=='و' and token_list[index + 9].isdigit()):
                                            year += int(token_list[index + 9])# رقم یکان سال
                                            end_date_index = index + 9

                                    formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 8 تیر ماه سال 1000 و 300 و 20 و 5

                                else:
                                    start_date_index = index - 1
                                    end_date_index = index + 3
                                    day = int(token_list[index - 1])
                                    if(token_list[index - 2]=='و' and int(token_list[index - 1])<10 and int(token_list[index - 3])<31 and token_list[index - 3].isdigit()):
                                        start_date_index = index - 3
                                        day += int(token_list[index - 3])# رقم دهگان روز

                                        formal_date = [day, int(self.month_dict[token_list[index]]), int(token_list[index + 3])]# مثلا 20 و 7 تیر ماه سال 1368


                                    formal_date = [day, int(self.month_dict[token_list[index]]), int(token_list[index + 3])]# مثلا 27 تیر ماه سال 1368

                            elif(str(token_list[index + 2]).isdigit()):
                                if(int(token_list[index + 2])==1000):

                                    if token_list[index-1].strip() == 'ام': # مثلا 30 ام تیر ماه 1000 و 300 و 80 و 2
                                        start_date_index = index - 2
                                        day = int(token_list[index - 2])
                                    else:
                                        # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد
                                        # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم
                                        start_date_index = index - 1
                                        day = int(token_list[index - 1])

                                        if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()):
                                            day += int(token_list[index - 3])# رقم دهگان روز
                                            start_date_index = index - 3
                                    year = 1000
                                    if(len(token_list)>= index + 4):
                                        if(token_list[index + 3]=='و' and token_list[index + 4].isdigit()):
                                            year += int(token_list[index + 4])# رقم صدگان سال
                                            end_date_index = index + 4
                                    if(len(token_list)>= index + 6):
                                        if(token_list[index + 5]=='و' and token_list[index + 6].isdigit()):
                                            year += int(token_list[index + 6])# رقم دهگان سال
                                            end_date_index = index + 6
                                    if(len(token_list)>= index + 8):
                                        if(token_list[index + 7]=='و' and token_list[index + 8].isdigit()):
                                            year += int(token_list[index + 8])# رقم یکان سال
                                            end_date_index = index + 8
                                    formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 7 تیر ماه 1000 و 300 و 20 و 5

                                else:
                                    formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])]# مثلا 27 تیر ماه سال 1368
                                    start_date_index = index - 1
                                    end_date_index = index + 2
                                #formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])] # مثلا 27 تیر ماه 1368

                        elif(token_list[index + 1] == 'سال' and (not(token_list[index + 2]).isdigit())):

                            formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])] # مثلا 27 تیر سال 1368
                            start_date_index = index - 1
                            end_date_index = index + 2
                        else:
                            if(token_list[index + 1] == 'سال' and str(token_list[index + 2]).isdigit()):

                                if(int(token_list[index + 2])==1000):

                                    # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد
                                    # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم
                                    start_date_index = index - 1
                                    day = int(token_list[index - 1])
                                    if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()):
                                        day += int(token_list[index - 3])# رقم دهگان روز
                                        start_date_index = index - 3
                                    year = 1000
                                    if(len(token_list)>= index + 4):
                                        if(token_list[index + 3]=='و' and token_list[index + 4].isdigit()):
                                            year += int(token_list[index + 4])# رقم صدگان سال
                                            end_date_index = index + 4
                                    if(len(token_list)>= index + 6):
                                        if(token_list[index + 5]=='و' and token_list[index + 6].isdigit()):
                                            year += int(token_list[index + 6])#  # رقم دهگان سال !!! برای تاریخ بین 1399 تا 1410 هم همین روال درست بر می گرداند، هرچند منطق غلطی دارد چون مثلا سال 1402 رقم صدگان ندارد
                                            end_date_index = index + 6
                                    if(len(token_list)>= index + 8):
                                        if((len(token_list)>= index + 7 and token_list[index + 7]=='و') and (len(token_list)>= index + 8 and token_list[index + 8].isdigit())):
                                            year += int(token_list[index + 8])# رقم یکان سال
                                            end_date_index = index + 8

                                    formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 8 تیر ماه سال 1000 و 300 و 20 و 5

                                else:
                                    formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])]# مثلا 27 تیر سال 1368
                                    start_date_index = index - 1
                                    end_date_index = index + 2

                            elif(str(token_list[index + 1]).isdigit()):
                                if(int(token_list[index + 1])==1000):
                                    # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد
                                    # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم
                                    start_date_index = index - 1
                                    day = int(token_list[index - 1])
                                    if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()):
                                        day += int(token_list[index - 3])# رقم دهگان روز
                                        start_date_index = index - 3
                                    year = 1000
                                    if(len(token_list)>= index + 3):
                                        if(token_list[index + 2]=='و' and token_list[index + 3].isdigit()):
                                            year += int(token_list[index + 3])# رقم صدگان سال
                                            end_date_index = index + 3
                                    if(len(token_list)>= index + 5):
                                        if(token_list[index + 4]=='و' and token_list[index + 5].isdigit()):
                                            year += int(token_list[index + 5])# رقم دهگان سال !!! برای تاریخ بین 1399 تا 1410 هم همین روال درست بر می گرداند، هرچند منطق غلطی دارد چون مثلا سال 1402 رقم صدگان ندارد
                                            end_date_index = index + 5
                                    if(len(token_list)>= index + 7):
                                        if(token_list[index + 6]=='و' and token_list[index + 7].isdigit()):
                                            year += int(token_list[index + 7])# رقم یکان سال
                                            end_date_index = index + 7
                                    formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 7 تیر ماه 1000 و 300 و 20 و 5
                                else:
                                    formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 1])]# مثلا 27 تیر 1320‌
                                    start_date_index = index - 1
                                    end_date_index = index + 1

                            else:
                                formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 1])]# مثلا 27 تیر 1365
                                start_date_index = index - 1
                                end_date_index = index + 1
                        formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0])
                        if token_list[index - 1] and token_list[index + 1]:
                            return formal_date, start_date_index, end_date_index, start_date_index

                    except Exception as e:
                        error = e.args[0]
                        try:
                            formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), 0]
                            formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0])#مثلا  y1358m12d0
                            return formal_date, index-1, index, index-1
                        except:
                            try:
                                # مثلا سال 1400
                                if token_list[index] == "سال":
                                    formal_date = [int(token_list[index + 1]),0, 0]
                                    formal_date = "y" + str(formal_date[0]) + "m" + str(formal_date[1]) + "d" + str(formal_date[2])# y1358m0d0
                                    return formal_date, index, index+1, index
                                elif token_list[index+1] == "ماه" and index + 1 < len(token_list):
                                    if index + 2 < len(token_list) and token_list[index + 2].isdigit() and \
                                        int(token_list[index-2]) < 1450:
                                        formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+2])]
                                        formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) # y0m12d5
                                        return formal_date, index, index+2, index
                                    else:
                                        formal_date = [0,int(self.month_dict[token_list[index]]),0]
                                        formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) #y0m12d0
                                        return formal_date, index, index+1, index
                                elif index + 1 < len(token_list) and token_list[index + 1].isdigit() and int(token_list[index-2]) < 1450:
                                    formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+1])]
                                    formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) # y0m12d5
                                    return formal_date, index, index+1, index
                                elif index-2 >= 0 and index+2 < len(token_list) and \
                                    token_list[index - 1] == "/" and token_list[index+1] == "/" and \
                                    token_list[index - 2].isdigit() and int(token_list[index-2]) < 32 and \
                                    token_list[index + 2].isdigit() and int(token_list[index+2]) < 1450:
                                    formal_date = [int(token_list[index-2]),int(self.month_dict[token_list[index]]),int(token_list[index+2])]
                                    formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0])
                                    return formal_date, index-2, index+2, index-2
                                elif index + 2 < len(token_list) and token_list[index + 1] == 'سال' and \
                                    token_list[index + 2].isdigit() and int(token_list[index-2]) < 1450 :
                                    formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+2])]
                                    formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0])
                                    return formal_date, index, index+1, index
                                #else:
                                #    print("Name : %s -> after1: %s -> after2: %s -> after3: %s" % (token_list[index], token_list[index+1], token_list[index+2], token_list[index+3]))
                            except:
                                pass

    def general_normalize_date(self,normalized_string, additional_token_index_array):
        # 24 /12 /1401 جهت بررسی الگوی تاریخ
        normalized_string, recognized_dates = DateNormalizer.separated_date_format_finder(normalized_string)
        normalized_string_list = normalized_string.strip().split()
        # جهت یافتن فرمت های مختلف تاریخ غیر از فرمت بررسی شده در بالا
        normalized_string, dates, recognized_dates_2 = \
            self.normalize_dates(normalized_string_list, additional_token_index_array,
                token_list_len = len(normalized_string_list), previous_slice_index_array=[0,])
        # تجمیع همه تاریخ ها
        for item in recognized_dates_2:
            recognized_dates.append(item)
        return normalized_string, dates, recognized_dates

    # 24 /12 /1401 متد بررسی الگوی تاریخ
    def separated_date_format_finder(normalized_string):
        """Find dates written like ``24 /12 /1401`` and rewrite them in the text.

        Defined without ``self``; the visible caller invokes it on the class.

        For every match a record is built holding the matched text, the
        formal form ``y<year>m<month>d<day>``, the character offsets one
        position inside each end of the match, and the token positions
        reported by ``gf.token_state_finder``.  Afterwards, in the
        whitespace-split text, the token at the start position becomes the
        formal date and the tokens at ``start + 1`` and at the end position
        become the placeholder ``'tttt'``.

        NOTE(review): relies on a module-level ``gf``; the
        ``general_functions`` import visible at the top of the file is
        commented out -- confirm it is in scope before a date can match.

        Returns:
            ``(text, recognized_dates)`` where ``text`` is the rewritten
            tokens joined by single spaces.
        """
        recognized_dates = []
        for match in re.finditer(r'\d{1,2} /\d{1,2} /\d{2,4}', normalized_string):
            matched_text = match.group()
            start_index = match.start() + 1
            end_index = match.end() - 1
            # "24 /12 /1401" -> "24/12/1401" -> day, month, year
            day, month, year = matched_text.replace(' ', '').split('/')
            formal_date = "y" + year + "m" + month + "d" + day
            token_state = gf.token_state_finder(normalized_string, start_index, end_index)
            recognized_dates.append(
                {
                    "original_date"         : matched_text,
                    "date"                  : formal_date,
                    "start_index"           : start_index,
                    "end_index"             : end_index,
                    "date_token_index"      : token_state["start_token_state"],
                    "start_date_token_index": token_state["start_token_state"],
                    "end_date_token_index"  : token_state["end_token_state"],
                })

        tokens = normalized_string.strip().split()
        for record in recognized_dates:
            # The formal date takes the first token's place; the other two
            # tokens of the pattern are kept as placeholders.
            tokens[int(record['date_token_index'])] = record['date']
            tokens[int(record['start_date_token_index']) + 1] = 'tttt'
            tokens[int(record['end_date_token_index'])] = 'tttt'
        return " ".join(tokens), recognized_dates

    def normalize_dates(self, token_list, additional_token_index_array, token_list_len, previous_slice_index_array=None):
        """Recursively replace every date found by ``find_date_part``.

        The first date in ``token_list`` is located, the tokens after it are
        processed by a recursive call, and the pieces are stitched back
        together.  Tokens that belonged to the date but are not the date
        token itself are replaced by the placeholder ``tttt``.

        Args:
            token_list: Tokens of the (remaining) text.
            additional_token_index_array: Only forwarded to the recursive call.
            token_list_len: Length of the complete, top-level token list; used
                to turn positions inside a slice into top-level positions.
            previous_slice_index_array: Stack of slice start offsets shared by
                all recursion levels.  Defaults to a fresh ``[0]``.  The
                previous shared ``[]`` default made the outermost date pop its
                own slice end instead of the base offset 0.

        Returns:
            ``(text, dates, recognized_dates)``.  When no date is found the
            tokens joined by spaces are returned together with two empty
            strings.  Otherwise ``dates`` lists the formal dates in text order
            and ``recognized_dates`` holds one record per date, innermost
            (last) date first, with token indexes relative to the top-level
            list.

        NOTE(review): the error path calls ``gf.save_error``; the
        ``general_functions`` import visible at the top of the file is
        commented out -- confirm ``gf`` is in scope.
        """
        if previous_slice_index_array is None:
            previous_slice_index_array = [0]

        try:
            finded = self.find_date_part(token_list)
        except Exception as e:
            finded = None
            gf.save_error(0, e)

        if finded is None:
            return " ".join(token_list), '', ''

        date_part = finded[0]
        start_date_token_index = finded[1]
        end_date_token_index = finded[2]
        date_token_index = finded[3]

        befor_date_part = " ".join(token_list[:start_date_token_index])
        after_date_part = list(token_list[end_date_token_index + 1:])

        # Offset of the remaining tokens inside the top-level list.
        previous_slice_index_array.append(token_list_len - len(after_date_part))
        after_normalized, after_dates, recognized_dates = self.normalize_dates(
            after_date_part, additional_token_index_array, token_list_len, previous_slice_index_array)
        if after_dates == '':
            after_dates = []
        if recognized_dates == '':
            recognized_dates = []
        after_dates.insert(0, date_part)

        # The entry below the top of the stack is the offset of *this* slice.
        slice_offset = previous_slice_index_array.pop(len(previous_slice_index_array) - 2)
        recognized_dates.append(
            {
             "date"                  : date_part,
             "date_token_index"      : date_token_index + slice_offset,
             "start_date_token_index": start_date_token_index + slice_offset,
             "end_date_token_index"  : end_date_token_index + slice_offset
            })

        # One "tttt" placeholder per date token before / after the date token,
        # so that the number of tokens in the text does not change.
        befor_date_part += " tttt" * (date_token_index - start_date_token_index)
        date_part += " tttt" * (end_date_token_index - date_token_index)

        return befor_date_part + " " + date_part + " " + after_normalized, after_dates, recognized_dates

    def list2num(self, numerical_section_list):
        """Fold one run of number tokens into a single value.

        Runs of more than three tokens are added up starting from 0; shorter
        runs are multiplied starting from 1.  A token known to ``is_number``
        replaces the running value with its ``num_dict`` entry, and the token
        ``'000'`` is skipped.  Every other token is read with ``float``.

        Returns:
            The folded value (``1`` for an empty list).
        """
        additive = len(numerical_section_list) > 3
        total = 0 if additive else 1
        for part in numerical_section_list:
            if self.is_number(part):
                total = self.num_dict[part]
                continue
            if part == '000':
                continue
            if additive:
                total += float(part)
            else:
                total *= float(part)
        return total

    def convert2num(self, numerical_section_list):
        """Convert a numerical section (tokens and "و" separators) to a number.

        Number tokens are collected into groups; each "و" closes the current
        group.  Every group is folded with ``list2num`` and the results are
        added up.  Tokens that are neither numbers nor "و" are ignored.

        Returns:
            An ``int`` when the total has no fractional part, otherwise the
            total itself.  ``0`` is returned when the total cannot be
            converted (e.g. it overflowed to infinity).
        """
        total = 0
        pending = []
        for part in numerical_section_list:
            if self.is_number(part) or part.replace('.', '', 1).isdigit():
                pending.append(part)
            elif part == "و":
                total += self.list2num(pending)
                pending = []
        if pending:
            total += self.list2num(pending)

        try:
            whole = int(total)
            if total - whole == 0:
                return whole
            return total
        except (ArithmeticError, ValueError, TypeError):
            # Best effort for unusable totals (inf, nan, non-numeric).  A bare
            # ``except`` here used to swallow KeyboardInterrupt/SystemExit too.
            return 0


    def is_number(self, word):
        """Return ``True`` when ``word`` is a key of ``self.num_dict``."""
        known_numbers = self.num_dict
        return word in known_numbers

    def find_number_location(self, token_list, addWithVaa = False):
        """Return the token indexes of the first numerical section.

        The section starts at the first token that is a number word
        (``is_number``) or a digit string with at most one dot.  It then
        grows over directly following number tokens, over a ``"/" "000"``
        pair and -- only when ``addWithVaa`` is true -- over a "و" that is
        followed by another number token.

        Returns:
            The list of indexes in ascending order; empty when ``token_list``
            contains no number token.
        """
        def numeric(candidate):
            return self.is_number(candidate) or candidate.replace('.', '', 1).isdigit()

        size = len(token_list)
        first = next((pos for pos, tok in enumerate(token_list) if numeric(tok)), None)
        if first is None:
            return []

        section = [first]
        pos = first + 1
        while pos < size:
            current = token_list[pos]
            has_next = pos + 1 < size
            if current == "و" and has_next and addWithVaa:
                if not numeric(token_list[pos + 1]):
                    break
                section += [pos, pos + 1]
                pos += 2
            elif numeric(current):
                section.append(pos)
                pos += 1
            elif current == "/" and has_next and token_list[pos + 1] == "000":
                section += [pos, pos + 1]
                pos += 2
            else:
                break
        return section

    def normalize_numbers(self, token_list, converted=""):
        """Replace every numerical section of ``token_list`` by its value.

        Each round strips the suffix "ین" from tokens whose remainder is a
        number word, locates the next numerical section with
        ``find_number_location``, converts it with ``convert2num`` and carries
        on with the tokens after that section.

        Implemented as a loop: the earlier version recursed once per number
        found and raised ``RecursionError`` on texts with many numbers.  The
        produced string is unchanged.

        Args:
            token_list: Tokens to process.  The suffix stripping of the first
                round modifies this list in place.
            converted: Text that is put in front of the result.

        Returns:
            ``converted`` followed by the processed tokens; every emitted
            chunk is preceded by a single space.
        """
        pieces = [converted]
        remaining = token_list
        while True:
            for position, token in enumerate(remaining):
                if token.endswith("ین") and self.is_number(token[:-2]):
                    remaining[position] = token[:-2]

            finded = self.find_number_location(remaining)
            if len(finded) == 0:
                pieces.append(" " + " ".join(remaining))
                return "".join(pieces)

            number = self.convert2num([remaining[x] for x in finded])
            pieces.append(" " + " ".join(remaining[:finded[0]]) + " " + str(number))
            # Continue after the section just converted (a new list, so only
            # the caller's original list is ever modified in place).
            remaining = remaining[finded[-1] + 1:]

    # این متد برای تبدیل اعدادی حروفی به معادل رقمی ایجاد شده است
    def normalize_numbers2(self, token_list, converted=""):
        """Replace number words by their digits, token by token.

        A token found in ``self.num_dict`` is replaced by the integer form of
        its entry.  When such a token starts with "و" and the rest of the
        token is itself a number word (the conjunction was written attached to
        the number), the output gets a separate "و" token followed by the
        number; the index of that token is recorded because the output then
        holds one token more than the input.

        A ``num_dict`` token that starts with "و" but whose remainder is not a
        number word used to be dropped from the output; it is now emitted as
        a plain number like any other number word.

        Args:
            token_list: Tokens to convert.
            converted: Text that is put in front of the result.

        Returns:
            ``(text, additional_token_index_array)`` where every emitted token
            in ``text`` is preceded by a space and the array lists the input
            indexes at which an extra "و" token was inserted.
        """
        additional_token_index_array = []  # input indexes that produced an extra token
        pieces = [converted]
        for index, token in enumerate(token_list):
            if token not in self.num_dict:
                pieces.append(" " + token)
                continue

            digits = str(int(self.num_dict[token]))
            if token.startswith('و') and token[1:] in self.num_dict:
                # Split the attached conjunction off the number.
                pieces.append(" " + "و" + " " + digits)
                additional_token_index_array.append(index)
            else:
                pieces.append(" " + digits)
        return "".join(pieces), additional_token_index_array

    # این متد برای ضرب اعداد متوالی با جدا کننده اسپیس ایجاد شده
    def multiply_continuous_numbers(self, normalized_string):
        """Multiply runs of space-separated digit tokens into one number.

        For example ``2 1000 1000000000`` becomes ``2000000000000``.  The
        product replaces the first token of the run; every other token of the
        run is replaced by the placeholder ``'nnnn'`` so that the number of
        tokens stays the same.

        The scan for the end of a run is bounded by the token count; it used
        to raise ``IndexError`` whenever a run reached the end of the text.

        Returns:
            The processed tokens, each preceded by a single space (an empty
            string for empty input).
        """
        token_list = normalized_string.strip().split()
        token_count = len(token_list)
        pieces = []
        for index, token in enumerate(token_list):
            if not token.isdigit():
                pieces.append(token)
                continue

            product = int(token)
            cursor = index + 1
            while cursor < token_count and token_list[cursor].isdigit():
                product *= int(token_list[cursor])
                # Consumed by the product; keep a placeholder in its position.
                token_list[cursor] = 'nnnn'
                cursor += 1
            pieces.append(str(product))
        return "".join(" " + piece for piece in pieces)

    # این متد برای جمع اعداد متوالی با جدا کننده واو ایجاد شده
    def sum_continuous_numbers(self, normalized_string):
        """Combine digit tokens joined by "و" into one number.

        A chain starts at a digit token that is a multiple of ten and is
        followed by "و" (e.g. ``300 و 20 و 5`` becomes ``325``).  The result
        replaces the first token; the consumed "و" and digit tokens are
        replaced by the placeholder ``'nnnn'``.  When a chained number is
        itself followed by ``'nnnn'`` the parts are multiplied instead of
        added.  When the token after a "و" is ``'tttt'`` or starts with ``y``
        the chain belongs to a date and nothing is combined.

        Fixes over the previous version:
          * every lookahead is bounds-checked (``IndexError`` at end of text);
          * a "و" followed by a non-number ends the chain (was an endless loop);
          * digit tokens that do not start a chain are kept (were dropped);
          * the add/multiply mode is decided per chain (a multiply used to
            leak into all later chains and zero them).

        Returns:
            The processed tokens, each preceded by a single space.
        """
        token_list = normalized_string.strip().split()
        token_count = len(token_list)
        pieces = []
        for index, token in enumerate(token_list):
            if not token.isdigit():
                pieces.append(token)
                continue

            followed_by_vaa = index + 1 < token_count and token_list[index + 1] == 'و'
            # Only tens, hundreds, thousands ... can open a chain.
            if not followed_by_vaa or int(token) % 10 != 0:
                pieces.append(token)
                continue

            operand = '+'
            parts = [int(token)]
            consumed = []      # indexes of the "و"/digit tokens merged into the result
            is_date = False
            cursor = index + 1
            while cursor + 1 < token_count and token_list[cursor] == 'و':
                following = token_list[cursor + 1]
                if following == 'tttt' or following.startswith('y'):
                    # Numbers joined by "و" that lead into a date stay untouched.
                    is_date = True
                    break
                if not following.isdigit():
                    break
                parts.append(int(following))
                consumed.append(cursor)
                consumed.append(cursor + 1)
                if cursor + 2 < token_count and token_list[cursor + 2] == 'nnnn':
                    operand = '*'
                    cursor += 3
                else:
                    cursor += 2

            if is_date:
                pieces.append(str(int(token)))
                continue

            for position in consumed:
                token_list[position] = 'nnnn'

            if operand == '*':
                value = 1
                for number in parts:
                    value *= number
            else:
                value = sum(parts)
            pieces.append(str(value))
        return "".join(" " + piece for piece in pieces)

    # این متد اعداد پشت سر هم که با اسپیس یا واو از هم جدا شده اند و باید تبدیل به یک عدد به صورت یک کاسه شوند را هندل می کند
    def handle_continuous_numbers(self, normalized_string):
        """Merge consecutive number tokens (joined by spaces or "و") into one.

        For every digit token the following "و"/digit tokens are collected and
        handed to the module-level ``number_completter``; its result replaces
        the first token and the remaining tokens of the run are replaced by
        the placeholder ``'nnnn'`` so the token count of the text is kept.

        Numbers that directly follow 'تبصره های', 'مواد' or 'ماده' are treated
        as an enumeration (note / article numbers) rather than as parts of one
        number: collecting is switched off for them and stays off while the
        enumeration continues with "و".

        Each record now owns its ``number_token_list``; previously all records
        shared a single list that was cleared for every new number.

        Returns:
            ``(text, recognized_numbers)`` where ``text`` is the processed
            tokens, each preceded by a space, and every record has the keys
            ``number_value``, ``number_token_list``, ``start_token_index`` and
            ``end_token_index``.
        """
        token_list = normalized_string.strip().split()
        token_count = len(token_list)
        output_tokens = []
        recognized_numbers = []
        flag = True  # True while following tokens may be merged into the number
        is_tabsare_numbers = False
        is_mavad_numbers = False
        is_madde_numbers = False
        for token_index, token in enumerate(token_list):
            if not token.isdigit():
                output_tokens.append(token)
                continue

            if token_index - 2 >= 0:
                # A list of note numbers belonging to an article.
                is_tabsare_numbers = (token_list[token_index-2] + ' ' + token_list[token_index-1] == 'تبصره های')
            if token_index - 1 >= 0:
                # A list of article numbers of a law (plural / singular form).
                is_mavad_numbers = (token_list[token_index-1] == 'مواد')
                is_madde_numbers = (token_list[token_index-1] == 'ماده')
            if is_tabsare_numbers or is_mavad_numbers or is_madde_numbers:
                flag = False

            start_token_index = end_token_index = token_index
            number_parts_array = [token]  # fresh list per number (see docstring)
            while (flag and end_token_index + 1 < token_count
                   and (token_list[end_token_index + 1] == 'و'
                        or token_list[end_token_index + 1].isdigit())):
                end_token_index += 1
                number_parts_array.append(token_list[end_token_index])

            final_number = number_completter(number_parts_array)
            token_list[token_index] = str(final_number)
            recognized_numbers.append({
                'number_value'     : final_number,
                'number_token_list': number_parts_array,
                'start_token_index': start_token_index,
                'end_token_index'  : end_token_index
                })
            # Replace the other tokens of this number by a neutral placeholder
            # so that the text keeps its number of tokens.
            for i in range(token_index + 1, len(number_parts_array) + token_index):
                token_list[i] = 'nnnn'
            output_tokens.append(token_list[token_index])

            if token_index + 1 < token_count:
                # Merging stays off only while an enumeration continues with "و".
                enumeration_continues = (not flag) and token_list[token_index + 1] == 'و'
                flag = not enumeration_continues

        return "".join(' ' + piece for piece in output_tokens), recognized_numbers


    def remove_additional_tokens(self,token_list,additional_token_index_array):
        """Delete the listed positions from ``token_list`` and join the rest.

        The positions are deleted one after another in the given order, each
        against the list as already shortened by the earlier deletions.
        ``token_list`` is modified in place.

        Returns:
            The remaining tokens, each preceded by a single space.
        """
        for position in additional_token_index_array:
            del token_list[position]
        return "".join(' ' + kept for kept in token_list)

class PinglishNormalizer():
    """Convert Pinglish words (Persian typed with Latin letters) to Persian.

    A word is first looked up in ``en_dict``.  Otherwise it is split into
    Latin units (single letters and two-letter combinations such as ``kh``),
    every unit is mapped to its candidate Persian spellings, all combinations
    are generated and a combination found in ``fa_dict`` replaces the word.

    NOTE(review): ``__init__`` only stores the dictionary file names; the
    loading code is commented out, so ``en_dict`` and ``fa_dict`` are never
    assigned here and ``pingilish2persian`` raises ``AttributeError`` unless
    they are set from outside -- confirm how they are meant to be loaded.
    """

    # Letters that may open a two-letter unit and therefore need a look-ahead.
    _LOOKAHEAD_LETTERS = frozenset(("c", "k", "z", "s", "g", "a", "u", "e", "o"))
    # Two-letter units that are treated as one sound.
    _DIGRAPHS = frozenset(("ch", "kh", "zh", "sh", "gh", "aa", "ee", "oo", "ou"))
    # Leading units that are written with an initial alef.
    _LEADING_VOWELS = frozenset(("a", "e", "o", "u", "ee", "ou"))
    # Extra letter appended when a word ends in one of these vowels.
    _TRAILING_VOWEL_LETTER = {"e": "ه", "a": "ا", "o": "و", "u": "و"}
    # Candidate Persian spellings per Latin unit.
    # NOTE(review): 'c' lists "ص" twice; possibly one entry was meant to be
    # a different letter -- left unchanged.
    _CANDIDATES = {
        'a': ("", "ا"),
        'c': ("ث", "ص", "ص"),
        'h': ("ه", "ح"),
        'b': ("ب",),
        'p': ("پ",),
        't': ("ت", "ط"),
        's': ("س", "ص", "ث"),
        'j': ("ج",),
        'ch': ("چ",),
        'kh': ("خ",),
        'q': ("ق", "غ"),
        'd': ("د",),
        'z': ("ز", "ذ", "ض", "ظ"),
        'r': ("ر",),
        'zh': ("ژ",),
        'sh': ("ش",),
        # Was ",ق": the stray comma made every "gh" -> "ق" candidate unusable.
        'gh': ("ق", "غ"),
        'f': ("ف",),
        'k': ("ک",),
        'g': ("گ",),
        'l': ("ل",),
        'm': ("م",),
        'n': ("ن",),
        'v': ("و",),
        'aa': ("ا",),
        'ee': ("ی",),
        'oo': ("و",),
        'ou': ("و",),
        'i': ("ی",),
        'y': ("ی",),
        ' ': ("",),
        'w': ("و",),
        'e': ("", "ه"),
        'o': ("", "و"),
    }

    def __init__(self):
        # self.data_helper = DataHelper()
        self.file_dir = os.path.dirname(os.path.realpath(__file__)) + "/"

        self.en_dict_filename = self.file_dir + "resource/tokenizer/enDict"
        # self.en_dict = self.data_helper.load_var(self.en_dict_filename)

        self.fa_dict_filename = self.file_dir + "resource/tokenizer/faDict"
        # self.fa_dict = self.data_helper.load_var(self.fa_dict_filename)

    def pingilish2persian(self, pinglish_words_list):
        """Convert the words of ``pinglish_words_list`` in place.

        A word found in ``en_dict`` is replaced by its entry.  Any other word
        is replaced by the last generated spelling that is found in
        ``fa_dict``; it is left untouched when no spelling is found.

        Returns:
            The (converted) words joined by single spaces.
        """
        for position, word in enumerate(pinglish_words_list):
            if word in self.en_dict:
                pinglish_words_list[position] = self.en_dict[word]
                continue
            spellings = self.make_word(self.map_char(self.characterize(word)))
            for spelling in spellings:
                persian = ''.join(self.escalation(spelling))
                if persian in self.fa_dict:
                    pinglish_words_list[position] = persian
        return " ".join(pinglish_words_list)

    def characterize(self, word):
        """Split ``word`` into Latin units (letters and two-letter units)."""
        units = []
        position = 0
        length = len(word)
        while position < length:
            current = word[position]
            mapped = self.switcher(current)
            if mapped is None:
                # The letter may open a digraph: look one letter ahead.
                digraph = None
                if position < length - 1:
                    digraph = self.esp_check(current, word[position + 1])
                if digraph is None:
                    units.append(current)
                else:
                    units.append(digraph)
                    position += 1
            else:
                units.append(mapped)
            position += 1
        return units

    def switcher(self, ch):
        """Return ``None`` for letters that need a look-ahead, else ``ch``."""
        return None if ch in self._LOOKAHEAD_LETTERS else ch

    def esp_check(self, char1, char2):
        """Return ``char1 + char2`` when it is a known digraph, else ``None``."""
        pair = char1 + char2
        return pair if pair in self._DIGRAPHS else None

    def map_char(self, word):
        """Map a list of Latin units to a list of candidate-spelling lists.

        A leading vowel becomes an initial "ا" and a leading ``oo`` becomes
        "او".  A final ``e``/``a``/``o``/``u`` adds one more position with the
        matching Persian letter.  An empty ``word`` yields an empty list
        (it used to raise ``IndexError``).
        """
        if not word:
            return []

        options = []
        start = 0
        if self.map_switcher(word[0]) is None:
            options.append(["ا"])
            start += 1
        if word[0] == "oo":
            options.append(["او"])
            start += 1
        for unit in word[start:]:
            options.append(self.char_switcher(unit))

        trailing = self._TRAILING_VOWEL_LETTER.get(word[-1])
        if trailing is not None:
            options.append([trailing])
        return options

    def map_switcher(self, ch):
        """Return ``None`` for a leading-vowel unit, else ``ch``."""
        return None if ch in self._LEADING_VOWELS else ch

    def make_word(self, chp):
        """Return every combination taking one candidate per position.

        ``chp`` is a list of candidate lists; the result is a list of lists.
        A position without candidates makes the result empty.
        """
        words = [[]]
        for candidates in chp:
            words = [prefix + [choice] for prefix in words for choice in candidates]
        return words

    def escalation(self, word):
        """Copy ``word`` while skipping the element after an equal neighbour."""
        collapsed = []
        position = 0
        length = len(word)
        while position < length - 1:
            collapsed.append(word[position])
            if word[position] == word[position + 1]:
                position += 1
            position += 1
        if position != length:
            collapsed.append(word[position])
        return collapsed

    def char_switcher(self, ch):
        """Return the candidate Persian spellings of the Latin unit ``ch``.

        Returns a new list per call, or ``""`` for an unknown unit.
        """
        candidates = self._CANDIDATES.get(ch)
        return "" if candidates is None else list(candidates)

# این روال جهت جمع و ضرب کردن بخش های مختلف از صدگان و هزارگان و بالاتر از یک عدد که با حروف در متن وجود دارد کار می کند
def number_completter2(number_parts_array):
        """Combine the parts of a spelled-out number into a single integer.

        Args:
            number_parts_array: sequence of strings.  Items for which
                ``str.isdigit()`` is true are treated as numeric parts;
                the values 1000, 1000000 and 1000000000 act as multipliers.
                Other items are skipped by the outer loop.

        Returns:
            * the plain sum of the digit parts collected so far when the
              input ends on a digit part before any multiplier is met;
            * otherwise the value computed in the multiplier branch, which
              returns as soon as the first "digit part followed by a
              multiplier" has been processed;
            * ``None`` implicitly when the loop finishes without reaching
              either ``return`` (e.g. the input ends on a non-digit item).

        NOTE(review): this function calls both itself and
        ``number_completter``; the recursive call's result is added without
        a ``None`` check.
        """
        zarb_value = 0  # current multiplier ("zarb" = multiplication)
        temp_number_array = []  # digit parts seen before the first multiplier
        sum_number_parts = 0
        final_number = 0
        for index,item in enumerate(number_parts_array):
                    if item.isdigit():
                        temp_number_array.append(int(item))
                        # Last item reached: the answer is the sum of what was collected.
                        if index + 1 >= len(number_parts_array):
                            for item3 in temp_number_array:
                                final_number += int(item3)
                            return final_number
                        current_value = number_parts_array[index+1]
                        # The *next* item is a multiplier: scale the collected sum by it.
                        if (current_value.isdigit() and (int(current_value) == 1000 or int(current_value) == 1000000 or int(current_value) == 1000000000)):# or int(current_value) == 1000000
                            zarb_value = int(number_parts_array[index+1])
                            for num_item in temp_number_array:
                                sum_number_parts += num_item
                            final_number = sum_number_parts * zarb_value
                            # Parts that follow the multiplier are gathered here.
                            temp_array2 = []
                            x = index + 2
                            while x < len(number_parts_array):
                            # for x in range(index+2,len(number_parts_array)):
                                temp_array2.append(number_parts_array[x])
                                if x + 1 < len(number_parts_array):
                                    if number_parts_array[x+1].isdigit() and (int(number_parts_array[x+1]) == 1000 or int(number_parts_array[x+1]) == 1000000 or int(number_parts_array[x+1]) == 1000000000):
                                        if int(number_parts_array[x+1]) > zarb_value: # 1000000>1000
                                            zarb_value = int(number_parts_array[x+1])
                                            # Recursive call to compute the value of the gathered parts.
                                            final_number += number_completter2(temp_array2)
                                            final_number *= zarb_value
                                            temp_array2.clear()
                                            zarb_value = 0
                                            x += 2 # skip the multiplier so it is not stored in the temp array
                                        else: # 1000!>1000000
                                            zarb_value = int(number_parts_array[x+1])
                                            temp_num = 0
                                            if (zarb_value == 1000 or zarb_value == 1000000) and len(number_parts_array) > x + 1:
                                                # Everything after this multiplier is delegated to number_completter.
                                                num_array = number_parts_array[x+2:len(number_parts_array)]
                                                temp_num = number_completter(num_array)
                                                if temp_num == None:
                                                    temp_num = 0

                                                '''for i in range(x+2,len(number_parts_array)):
                                                    if(number_parts_array[i].isdigit()):
                                                        temp_num += int(number_parts_array[i])
                                                        i+=1'''
                                            # NOTE(review): the accumulator starts at 1, not 0, so the
                                            # sum of the gathered parts is off by one -- confirm intent.
                                            temp_num2 = 1
                                            for num_item2 in temp_array2:
                                                if(num_item2.isdigit()):
                                                    temp_num2 += int(num_item2)

                                            temp_num2 *= zarb_value
                                            final_number += temp_num + temp_num2
                                            break
                                    else:
                                        x += 1 # next item is not a multiplier: move on to it
                                else: # we have run past the end of the array
                                    temp_num = 0
                                    # Sum the values held in the temp array.
                                    for num_item3 in temp_array2:
                                        if(num_item3.isdigit()):
                                            temp_num += int(num_item3)
                                    # Multiply the running result by that sum and leave the loop.
                                    # NOTE(review): this multiplies rather than adds (the additive
                                    # form is commented out below) -- confirm which is intended.
                                    final_number *= temp_num
                                    #final_number += temp_num
                                    break
                            return final_number

# This routine adds and multiplies the separate parts (hundreds, thousands and above) of a number that is written out in words in the text
def number_completter(number_parts_array):
    """Combine the parts of a spelled-out number into a single integer.

    Digit parts are collected until a multiplier (1000, 1000000 or
    1000000000) is met; the collected sum is then scaled by that multiplier
    and merged into the running total.  Items for which ``str.isdigit()``
    is false are ignored.

    Args:
        number_parts_array: sequence of strings such as
            ``["5", "1000", "200"]``.

    Returns:
        The combined integer value (``0`` for an empty input).
    """
    multipliers = (1000, 1000000, 1000000000)
    pending = []          # plain parts seen since the last multiplier
    total = 0             # value accumulated from completed groups
    last_multiplier = 0

    for part in number_parts_array:
        if not part.isdigit():
            continue
        value = int(part)
        if value not in multipliers:
            pending.append(value)
            continue

        group = sum(pending)
        pending.clear()
        # A leading bare multiplier ("thousand") counts as one of it.
        if group == 0 and total == 0:
            group = 1

        if last_multiplier < value:
            # Growing multiplier (e.g. 1000 then 1000000): everything
            # gathered so far is scaled by the new multiplier.
            total = (total + group) * value
        else:
            # Equal or smaller multiplier: a new group is added on top of
            # the running total; a bare multiplier again counts as one.
            if group == 0:
                group = 1
            total = value * group + total
        last_multiplier = value

    # Whatever follows the last multiplier is simply added.
    return total + sum(pending)


def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed.

    The match is non-greedy and ``.`` does not cross newlines, so an
    opening ``<`` without a closing ``>`` on the same line is left as is.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)

# normalizer = hazm.Normalizer()
# Character class of the code points that cleaning() strips from text; the
# trailing "+" makes one match cover a whole run of such characters.
# NOTE(review): several entries overlap -- the two wide ranges below already
# contain most of the narrower ones listed next to them.
wierd_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"  # very wide: everything from U+24C2 up to U+1F251 (review: also covers non-symbol scripts such as CJK)
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'  # every code point outside the Basic Multilingual Plane
        u"\u200d"  # zero width joiner
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"  # variation selector-16
        u"\u2069"  # directional isolate controls (U+2066..U+2069)
        u"\u2066"
        u"\u200c"  # zero width non-joiner (the Persian half-space character)
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)

def cleaning(text):
    """Run the full clean-up pipeline on ``text`` and return the result.

    Steps, in order: strip surrounding whitespace, apply ``clean`` (with
    ``extra_spaces`` and ``lowercase`` enabled), drop ``<...>`` spans via
    ``cleanhtml``, normalize, remove everything matched by
    ``wierd_pattern``, delete ``#`` characters and collapse each whitespace
    run into one space.
    """
    text = text.strip()
    text = clean(text,
                 extra_spaces=True,
                 lowercase=True
                 )

    # Remove HTML-like tags.
    text = cleanhtml(text)

    # NOTE(review): `normalizer` is not defined in the visible part of this
    # module (its only nearby assignment is commented out); unless it is
    # bound elsewhere this line raises NameError -- confirm.
    text = normalizer.normalize(text)

    # Remove emoji / control characters.
    text = wierd_pattern.sub(r'', text)

    # Drop hashtag markers, then squeeze whitespace.  The pattern is a raw
    # string so that "\s" is not treated as an (invalid) string escape.
    text = text.replace("#", "")
    text = re.sub(r"\s+", " ", text)

    return text