From b7f15bd84608c056cb8a4f64e70a1b5e2f400c29 Mon Sep 17 00:00:00 2001 From: ajokar Date: Sat, 16 Aug 2025 14:24:11 +0330 Subject: [PATCH] add some readme files --- do_nlp_processes.py | 46 +- normalizer copy.py | 1370 ------------------------------- p1_classifier.py | 19 +- p2_ner_recognizer.py | 4 +- p3_words_embedder.py | 11 +- p4_keyword_extractor.py | 15 +- p5_simplifier.py | 20 +- readme/readme-classifier.md | 71 ++ readme/readme-words-embedder.md | 70 ++ 9 files changed, 218 insertions(+), 1408 deletions(-) delete mode 100644 normalizer copy.py create mode 100644 readme/readme-classifier.md create mode 100644 readme/readme-words-embedder.md diff --git a/do_nlp_processes.py b/do_nlp_processes.py index c030814..4a41a78 100644 --- a/do_nlp_processes.py +++ b/do_nlp_processes.py @@ -2,19 +2,30 @@ سورس اجرای پردازش های مختلف روی اجزای قانونی شامل: کلاسیفیکیشن، تشخیص موجودیت های نامدار، استخراج بردار کلمات، استخراج کلیدواژه ها و ساده‌سازی(بازنمایی) متن """ + from p1_classifier import do_classify from p2_ner_recognizer import do_ner_recognize from p3_words_embedder import do_word_embedder -# from p4_keyword_extractor import do_keyword_extract -# from p5_simplifier import do_simplify +from p4_keyword_extractor import do_keyword_extract +from p5_simplifier import do_representation from elastic_helper import ElasticHelper +import json def get_sections(): - sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip" - eh_obj = ElasticHelper() - sections = eh_obj.iterateJsonFile(sections_path, True) - sections = convert_to_dict(sections) + + # region خواندن کل سکشن ها از فایل جیسون + # sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip" + # eh_obj = ElasticHelper() + # sections = eh_obj.iterateJsonFile(sections_path, True) + # sections = convert_to_dict(sections) + # endregion + + # region خواندن تعداد محدودی از سکشن ها از طریق فایل جیسون + with open('./data/recent_sections.json', 'r', encoding='utf-8') as file: + sections = json.load(file) + # endregion + return sections def convert_to_dict(sections): @@ -31,6 +42,15 @@ def main(): # get sections to do nlp processes sections = get_sections() + + # temp_sections = {} + # for i, item in enumerate(sections): + # if i>3: + # break + # temp_sections[item] = sections[item] + + # sections = temp_sections + # dictsections = {} # for item in sections: # if not item['id'] == 'qs2180272': @@ -46,14 +66,22 @@ def main(): sections = do_ner_recognize(sections) # 3. word embedder - #sections = do_word_embedder(sections) + sections = do_word_embedder(sections) # 4. keyword extract - # result_kw = do_keyword_extract(sections) + # keyword_extract_result, sections = do_keyword_extract(sections) + # if keyword_extract_result: + # print(f'keyword extraction finished successfully!') # 5. 
simpify - # result_simp = do_simplify(sections) + # representation_result, sections = do_representation(sections) + # if representation_result: + # print(f'representation finished successfully!') + with open(f'./data/sections_full_metadata.json', 'w', encoding='utf-8') as output_file: + data = json.dumps(sections, ensure_ascii=False, indent=2) + output_file.write(data) + print('all nlp processes finished successfully!') diff --git a/normalizer copy.py b/normalizer copy.py deleted file mode 100644 index 6c92ca2..0000000 --- a/normalizer copy.py +++ /dev/null @@ -1,1370 +0,0 @@ -from re import sub -import copy -import os -from tokenizer import Tokenizer -from data_helper import DataHelper -import general_functions as gf -import traceback,sys - -class Normalizer(): - - def __init__(self, - half_space_char='\u200c', - date_normalizing_needed=False, - pinglish_conversion_needed=False, - train_file_path="resource/tokenizer/Bijan_khan_chunk.txt", - token_merger_path="resource/tokenizer/TokenMerger.pckl"): - self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/" - - self.dic1_path = self.dir_path + 'resource/normalizer/Dic1_new.txt' - self.dic2_path = self.dir_path + 'resource/normalizer/Dic2_new.txt' - self.dic3_path = self.dir_path + 'resource/normalizer/Dic3_new.txt' - self.dic1 = self.load_dictionary(self.dic1_path) - self.dic2 = self.load_dictionary(self.dic2_path) - self.dic3 = self.load_dictionary(self.dic3_path) - - self.date_normalizing_needed = date_normalizing_needed - self.pinglish_conversion_needed = pinglish_conversion_needed - self.data_helper = DataHelper() - - - if self.date_normalizing_needed or self.pinglish_conversion_needed: - self.tokenizer = Tokenizer() - self.date_normalizer = DateNormalizer() - self.pinglish_conversion = PinglishNormalizer() - - - - def load_dictionary(self, file_path): - dict = {} - with open(file_path, 'r', encoding='utf-8') as f: - g = f.readlines() - for Wrds in g: - wrd = Wrds.split(' ') - dict[wrd[0].strip()] = sub('\n', '', wrd[1].strip()) - return dict - - def sub_alphabets(self, doc_string): - # try: - # doc_string = doc_string.decode('utf-8') - # except UnicodeEncodeError: - # pass - # a0 = "ء" - # b0 = "ئ" - # c0 = sub(a0, b0, doc_string) - a11 = r"ﺁ|آ" - b11 = r"آ" - c11 = sub(a11, b11, doc_string) - a1 = r"ٲ|ٱ|إ|ﺍ|أ" - b1 = r"ا" - c1 = sub(a1, b1, c11) - a2 = r"ﺐ|ﺏ|ﺑ" - b2 = r"ب" - c2 = sub(a2, b2, c1) - a3 = r"ﭖ|ﭗ|ﭙ|ﺒ|ﭘ" - b3 = r"پ" - c3 = sub(a3, b3, c2) - a4 = r"ﭡ|ٺ|ٹ|ﭞ|ٿ|ټ|ﺕ|ﺗ|ﺖ|ﺘ" - b4 = r"ت" - c4 = sub(a4, b4, c3) - a5 = r"ﺙ|ﺛ" - b5 = r"ث" - c5 = sub(a5, b5, c4) - a6 = r"ﺝ|ڃ|ﺠ|ﺟ" - b6 = r"ج" - c6 = sub(a6, b6, c5) - a7 = r"ڃ|ﭽ|ﭼ" - b7 = r"چ" - c7 = sub(a7, b7, c6) - a8 = r"ﺢ|ﺤ|څ|ځ|ﺣ" - b8 = r"ح" - c8 = sub(a8, b8, c7) - a9 = r"ﺥ|ﺦ|ﺨ|ﺧ" - b9 = r"خ" - c9 = sub(a9, b9, c8) - a10 = r"ڏ|ډ|ﺪ|ﺩ" - b10 = r"د" - c10 = sub(a10, b10, c9) - a11 = r"ﺫ|ﺬ|ﻧ" - b11 = r"ذ" - c11 = sub(a11, b11, c10) - a12 = r"ڙ|ڗ|ڒ|ڑ|ڕ|ﺭ|ﺮ" - b12 = r"ر" - c12 = sub(a12, b12, c11) - a13 = r"ﺰ|ﺯ" - b13 = r"ز" - c13 = sub(a13, b13, c12) - a14 = r"ﮊ" - b14 = r"ژ" - c14 = sub(a14, b14, c13) - a15 = r"ݭ|ݜ|ﺱ|ﺲ|ښ|ﺴ|ﺳ" - b15 = r"س" - c15 = sub(a15, b15, c14) - a16 = r"ﺵ|ﺶ|ﺸ|ﺷ" - b16 = r"ش" - c16 = sub(a16, b16, c15) - a17 = r"ﺺ|ﺼ|ﺻ" - b17 = r"ص" - c17 = sub(a17, b17, c16) - a18 = r"ﺽ|ﺾ|ﺿ|ﻀ" - b18 = r"ض" - c18 = sub(a18, b18, c17) - a19 = r"ﻁ|ﻂ|ﻃ|ﻄ" - b19 = r"ط" - c19 = sub(a19, b19, c18) - a20 = r"ﻆ|ﻇ|ﻈ" - b20 = r"ظ" - c20 = sub(a20, b20, c19) - a21 = r"ڠ|ﻉ|ﻊ|ﻋ" - b21 = r"ع" - c21 = sub(a21, b21, c20) - a22 = r"ﻎ|ۼ|ﻍ|ﻐ|ﻏ" - b22 = r"غ" - c22 = sub(a22, b22, c21) - a23 = 
r"ﻒ|ﻑ|ﻔ|ﻓ" - b23 = r"ف" - c23 = sub(a23, b23, c22) - a24 = r"ﻕ|ڤ|ﻖ|ﻗ" - b24 = r"ق" - c24 = sub(a24, b24, c23) - a25 = r"ڭ|ﻚ|ﮎ|ﻜ|ﮏ|ګ|ﻛ|ﮑ|ﮐ|ڪ|ك" - b25 = r"ک" - c25 = sub(a25, b25, c24) - a26 = r"ﮚ|ﮒ|ﮓ|ﮕ|ﮔ" - b26 = r"گ" - c26 = sub(a26, b26, c25) - a27 = r"ﻝ|ﻞ|ﻠ|ڵ" - b27 = r"ل" - c27 = sub(a27, b27, c26) - a28 = r"ﻡ|ﻤ|ﻢ|ﻣ" - b28 = r"م" - c28 = sub(a28, b28, c27) - a29 = r"ڼ|ﻦ|ﻥ|ﻨ" - b29 = r"ن" - c29 = sub(a29, b29, c28) - a30 = r"ވ|ﯙ|ۈ|ۋ|ﺆ|ۊ|ۇ|ۏ|ۅ|ۉ|ﻭ|ﻮ|ؤ" - b30 = r"و" - c30 = sub(a30, b30, c29) - a31 = r"ﺔ|ﻬ|ھ|ﻩ|ﻫ|ﻪ|ۀ|ە|ة|ہ" - b31 = r"ه" - c31 = sub(a31, b31, c30) - a32 = r"ﭛ|ﻯ|ۍ|ﻰ|ﻱ|ﻲ|ں|ﻳ|ﻴ|ﯼ|ې|ﯽ|ﯾ|ﯿ|ێ|ے|ى|ي" - b32 = r"ی" - c32 = sub(a32, b32, c31) - a33 = r'¬' - b33 = r'‌' - c33 = sub(a33, b33, c32) - pa0 = r'•|·|●|·|・|∙|。|ⴰ' - pb0 = r'.' - pc0 = sub(pa0, pb0, c33) - pa1 = r',|٬|٫|‚|,' - pb1 = r'،' - pc1 = sub(pa1, pb1, pc0) - pa2 = r'؟|ʕ' - pb2 = r'' - pc2 = sub(pa2, pb2, pc1) - na0 = r'۰|٠' - nb0 = r'0' - nc0 = sub(na0, nb0, pc2) - na1 = r'۱|١' - nb1 = r'1' - nc1 = sub(na1, nb1, nc0) - na2 = r'۲|٢' - nb2 = r'2' - nc2 = sub(na2, nb2, nc1) - na3 = r'۳|٣' - nb3 = r'3' - nc3 = sub(na3, nb3, nc2) - na4 = r'۴|٤' - nb4 = r'4' - nc4 = sub(na4, nb4, nc3) - na5 = r'۵' - nb5 = r'5' - nc5 = sub(na5, nb5, nc4) - na6 = r'۶|٦' - nb6 = r'6' - nc6 = sub(na6, nb6, nc5) - na7 = r'۷|٧' - nb7 = r'7' - nc7 = sub(na7, nb7, nc6) - na8 = r'۸|٨' - nb8 = r'8' - nc8 = sub(na8, nb8, nc7) - na9 = r'۹|٩' - nb9 = r'9' - nc9 = sub(na9, nb9, nc8) - np2 = r'²' - nm2 = r'2' - ng2 = sub(np2, nm2, nc9) - ea1 = r'|ِ|ُ|َ|ٍ|ٌ|ً|' - eb1 = r'' - ec1 = sub(ea1, eb1, ng2) - ea1 = r'ـ|_' - eb1 = r'' - ec2 = sub(ea1, eb1, ec1) - Sa1 = r'( )+' - Sb1 = r' ' - Sc1 = sub(Sa1, Sb1, ec2) - Sa2 = r'(\n)+' - Sb2 = r'\n' - Sc2 = sub(Sa2, Sb2, Sc1) - Sd1 = '\u00A0'# No-Break Space (NBSP) character - Sd2 = Sc2.replace(Sd1, '') - - return Sd2.strip() - - def space_correction(self, doc_string): - a00 = r'^(بی|می|نمی)( )' - b00 = r'\1‌' - c00 = sub(a00, b00, doc_string) - a0 = r'( )(می|نمی|بی)( )' - b0 = r'\1\2‌' - c0 = sub(a0, b0, c00) - a1 = r'( )(هایی|ها|های|ایی|هایم|هایت|هایش|هایمان|هایتان|هایشان|ات|ان|ین' \ - r'|انی|بان|ام|ای|یم|ید|اید|اند|بودم|بودی|بود|بودیم|بودید|بودند|ست)( )' - b1 = r'‌\2\3' - c1 = sub(a1, b1, c0) - a2 = r'( )(شده|نشده)( )' - b2 = r'‌\2‌' - c2 = sub(a2, b2, c1) - a3 = r'( )(طلبان|طلب|گرایی|گرایان|شناس|شناسی|گذاری|گذار|گذاران|شناسان|گیری|پذیری|بندی|آوری|سازی|' \ - r'بندی|کننده|کنندگان|گیری|پرداز|پردازی|پردازان|آمیز|سنجی|ریزی|داری|دهنده|آمیز|پذیری' \ - r'|پذیر|پذیران|گر|ریز|ریزی|رسانی|یاب|یابی|گانه|گانه‌ای|انگاری|گا|بند|رسانی|دهندگان|دار)( )' - b3 = r'‌\2\3' - c3 = sub(a3, b3, c2) - return c3 - - def space_correction_plus1(self, doc_string): - out_sentences = '' - for wrd in doc_string.split(' '): - try: - out_sentences = out_sentences + ' ' + self.dic1[wrd] - except KeyError: - out_sentences = out_sentences + ' ' + wrd - return out_sentences - - def space_correction_plus2(self, doc_string): - out_sentences = '' - wrds = doc_string.split(' ') - L = wrds.__len__() - if L < 2: - return doc_string - cnt = 1 - for i in range(0, L - 1): - w = wrds[i] + wrds[i + 1] - try: - out_sentences = out_sentences + ' ' + self.dic2[w] - cnt = 0 - except KeyError: - if cnt == 1: - out_sentences = out_sentences + ' ' + wrds[i] - cnt = 1 - if cnt == 1: - out_sentences = out_sentences + ' ' + wrds[i + 1] - return out_sentences - - def space_correction_plus3(self, doc_string): - # Dict = {'گفتوگو': 'گفت‌وگو'} - out_sentences = '' - wrds = doc_string.split(' ') - L = wrds.__len__() - if L < 3: - return doc_string - cnt = 1 - cnt2 = 0 - for i in 
range(0, L - 2): - w = wrds[i] + wrds[i + 1] + wrds[i + 2] - try: - out_sentences = out_sentences + ' ' + self.dic3[w] - cnt = 0 - cnt2 = 2 - except KeyError: - if cnt == 1 and cnt2 == 0: - out_sentences = out_sentences + ' ' + wrds[i] - else: - cnt2 -= 1 - cnt = 1 - if cnt == 1 and cnt2 == 0: - out_sentences = out_sentences + ' ' + wrds[i + 1] + ' ' + wrds[i + 2] - elif cnt == 1 and cnt2 == 1: - out_sentences = out_sentences + ' ' + wrds[i + 2] - return out_sentences - - def normalize(self, doc_string, new_line_elimination=False, return_dates = False): - normalized_string = gf.normalYehKe(doc_string) - normalized_string = self.sub_alphabets(doc_string) - #normalized_string = self.data_helper.clean_text(normalized_string, new_line_elimination).strip() - - #normalized_string = self.space_correction(self.space_correction_plus1(self.space_correction_plus2(self.space_correction_plus3(normalized_string)))).strip() - - # !!!آئین نامه را به آئین‌نامه تبدیل می کند و دو توکن را به یکی تبدیل می کند - #normalized_string = (self.space_correction_plus1(self.space_correction_plus2(self.space_correction_plus3(normalized_string)))).strip() - - #if self.pinglish_conversion_needed: - #normalized_string = self.pinglish_conversion.pingilish2persian(self.tokenizer.tokenize_words(normalized_string)) - - if self.date_normalizing_needed: - token_list = self.tokenizer.tokenize_words(normalized_string) - # نرمالایز کردن اعداد حروفی و ... - normalized_string, additional_token_index_array = self.date_normalizer.normalize_numbers2(token_list) - # نرمالایز و تشخیص تاریخ ها - normalized_string, dates, recognized_dates = self.date_normalizer.general_normalize_date(normalized_string, additional_token_index_array) - #normalized_string_list = normalized_string.strip().split() - #normalized_string, dates, recognized_dates = self.date_normalizer.normalize_dates(normalized_string_list, additional_token_index_array, token_list_len= len(normalized_string_list), previous_slice_index_array=[0,]) - # کنترل اعدادی که پشت سر هم آمده اند و با واو از هم جدا شده اند - normalized_string, recognized_numbers = self.date_normalizer.handle_continuous_numbers(normalized_string) - # در مواردی مانند وسیصد که واو به عدد سیصد چسبیده، ابتدا این دو را از هم جدا می کنیم - # تا عدد اصلی را به دست بیاوریم. 
در این صورت یک توکن اضافه تولید می شود که با این متد، توکن های اضافی را حذف می کنیم - normalized_string =self.date_normalizer.remove_additional_tokens(normalized_string.split(), additional_token_index_array) - if return_dates: - # مرتب کردن آرایه تاریخ های پیدا شده بر اساس توکن شروع عبارت حاوی تاریخ - recognized_dates.sort(key=lambda x: int(x['start_date_token_index']), reverse=False) - - return normalized_string, dates, recognized_dates, recognized_numbers - else: - return normalized_string - - -class DateNormalizer(): - def __init__(self): - - self.month_dict = { "فروردین": 1,"فروردینماه": 1,'فروردین\u200cماه':1, - "اردیبهشت": 2,"اردیبهشتماه": 2,'اردیبهشت\u200cماه':2, - "خرداد": 3,"خردادماه": 3, - "تیر": 4,"تیرماه": 4, - "مرداد": 5,"مردادماه": 5, - "شهریور": 6,"شهریورماه": 6, - "مهر": 7,"مهرماه": 7, - "آبان": 8,"آبانماه": 8,'آبان\u200cماه':8, - "آذر": 9,"آذرماه": 9, - "دی": 10,"دیماه": 10,'دی\u200cماه':10, - "بهمن": 11,"بهمنماه": 11,'بهمن\u200cماه':11, - "اسفند": 12,"اسفندماه": 12} - self.num_dict = {"صد": 100,"وصد": 100, "یکصد": 100, "ویکصد": 100, "هزار": 1000,"وهزار": 1000, - "یکهزار": 1000,"ویکهزار": 1000, - "میلیون": 1000000,"ملیون": 1000000, "ومیلیون": 1000000,"وملیون": 1000000, - "یکمیلیون": 1000000,"یکملیون": 1000000, - "ویکمیلیون": 1000000,"ویکملیون": 1000000, "دویست": 200,"ودویست": 200, - "ده": 10,"وده": 10, "نه": 9,"ونه": 9, "هشت": 8,"وهشت": 8, "هفت": 7,"وهفت": 7, - "شش": 6,"وشش": 6, "پنج": 5,"وپنج": 5,"چهار": 4,"وچهار": 4, "سه": 3,"وسه": 3, - "دو": 2,"ودو": 2, "یک": 1,"ویک": 1, "یازده": 11,"ویازده": 11, "سیزده": 13, "وسیزده": 13, - "چهارده": 14, "دوازده": 12, "پانزده": 15, "شانزده": 16, "هفده": 17,"هیفده": 17, - "وچهارده": 14, "ودوازده": 12, "وپانزده": 15, "وشانزده": 16, "وهفده": 17,"وهیفده": 17, - "هجده": 18,"هیجده": 18, "نوزده": 19, "بیست": 20, "سی": 30, "چهل": 40, "پنجاه": 50, - "وهجده": 18,"وهیجده": 18, "ونوزده": 19, "وبیست": 20, "وسی": 30, "وچهل": 40, "وپنجاه": 50, - "شصت": 60, "هفتاد": 70, "نود": 90, "سیصد": 300, "چهارصد": 400, - "وشصت": 60, "وهفتاد": 70, "ونود": 90, "وسیصد": 300, "وچهارصد": 400, - "پانصد": 500, "ششصد": 600, "هفتصد": 700, "هشتصد": 800, "نهصد": 900, - "وپانصد": 500, "وششصد": 600, "وهفتصد": 700, "وهشتصد": 800, "ونهصد": 900, - "هشتاد": 80, " ": 0, "میلیارد": 1000000000,"ملیارد": 1000000000, - "یکمیلیارد": 1000000000,"یکملیارد": 1000000000, - "وهشتاد": 80, " ": 0, "ومیلیارد": 1000000000,"وملیارد": 1000000000, - "ویکمیلیارد": 1000000000,"ویکملیارد": 1000000000, - "صدم": 100, "هزارم": 1000, "دویستم": 200, - "وصدم": 100, "وهزارم": 1000, "ودویستم": 200, - "دهم": 10, "نهم": 9, "هشتم": 8, "هفتم": 7, "ششم": 6, "پنجم": 5, - "ودهم": 10, "ونهم": 9, "وهشتم": 8, "وهفتم": 7, "وششم": 6, "وپنجم": 5, - "چهارم": 4, "سوم": 3, "دوم": 2, "یکم": 1, "اول": 1, "یازدهم": 11, "سیزدهم": 13, - "وچهارم": 4, "وسوم": 3, "ودوم": 2, "ویکم": 1, "واول": 1, "ویازدهم": 11, "وسیزدهم": 13, - "چهاردهم": 14, "دوازدهم": 12, "پانزدهم": 15, "شانزدهم": 16, "هفدهم": 17,"هیفدهم": 17, - "وچهاردهم": 14, "ودوازدهم": 12, "وپانزدهم": 15, "وشانزدهم": 16, "وهفدهم": 17,"وهیفدهم": 17, - "هجدهم": 18,"هیجدهم": 18, "نوزدهم": 19, "بیستم": 20, "چهلم": 40, "پنجاهم": 50, - "وهجدهم": 18,"وهیجدهم": 18, "ونوزدهم": 19, "وبیستم": 20, "وچهلم": 40, "وپنجاهم": 50, - "شصتم": 60, "هفتادم": 70, "نودم": 90, "سیصدم": 300, "چهارصدم": 400, - "وشصتم": 60, "وهفتادم": 70, "ونودم": 90, "وسیصدم": 300, "وچهارصدم": 400, - "پانصدم": 500, "ششصدم": 600, "هفتصدم": 700, "هشتصدم": 800, "نهصدم": 900, - "وپانصدم": 500, "وششصدم": 600, "وهفتصدم": 700, "وهشتصدم": 800, "ونهصدم": 900, - "هشتادم": 80,"وهشتادم": 80} - - def find_date_part(self, 
token_list): - import re - for index, element in enumerate(token_list): - # بررسی الگوی تاریخ 1398/12/18 - pattern_date = r'^(\d{4}\s*/\s*([1-9]|0[1-9]|1[0-2])\s*/\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$' - date_pattern = re.compile(pattern_date) - match_date = date_pattern.match(element) - if match_date: - date_parts = match_date.string.split('/') - year = int(re.search(r'\d+', date_parts[0]).group()) - month = int(re.search(r'\d+', date_parts[1]).group()) - day = int(re.search(r'\d+', date_parts[2]).group()) - # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد - if day > 999: - day = int(re.search(r'\d+', date_parts[0]).group()) - year = int(re.search(r'\d+', date_parts[2]).group()) - formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) - return formal_date, index, index, index - - # 24 /12 /1401 بررسی الگوی تاریخ - pattern_date = r'\s*\d{2}\s*/\s*\d{2}\s*/\s*\d{4}\s*' - date_pattern = re.compile(pattern_date) - match_date = date_pattern.match(element) - if match_date: - date_parts = match_date.string.split('/') - year = int(re.search(r'\d+', date_parts[0]).group()) - month = int(re.search(r'\d+', date_parts[1]).group()) - day = int(re.search(r'\d+', date_parts[2]).group()) - # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد - if day > 999: - day = int(re.search(r'\d+', date_parts[0]).group()) - year = int(re.search(r'\d+', date_parts[2]).group()) - formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) - return formal_date, index, index, index - - # بررسی الگوی تاریخ 1402،10،06 - patterndate2 = r'^(\d{4}\s*،\s*([1-9]|0[1-9]|1[0-2])\s*،\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$' - date_pattern = re.compile(patterndate2) - match_date2 = date_pattern.match(element) - if match_date2: - date_parts = match_date2.string.split('،') - year = int(re.search(r'\d+', date_parts[0]).group()) - month = int(re.search(r'\d+', date_parts[1]).group()) - day = int(re.search(r'\d+', date_parts[2]).group()) - # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد - if day > 999: - day = int(re.search(r'\d+', date_parts[0]).group()) - year = int(re.search(r'\d+', date_parts[2]).group()) - formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) - return formal_date, index, index, index - - # بررسی الگوی تاریخ 13ر04ر1345 - patterndate2 = r'^(\d{4}\s*ر\s*([1-9]|0[1-9]|1[0-2])\s*ر\s*([1-9]|0[1-9]|[12][0-9]|3[01]))$' - date_pattern = re.compile(patterndate2) - match_date2 = date_pattern.match(element) - if match_date2: - date_parts = match_date2.string.split('ر') - year = int(re.search(r'\d+', date_parts[0]).group()) - month = int(re.search(r'\d+', date_parts[1]).group()) - day = int(re.search(r'\d+', date_parts[2]).group()) - # اگر مقداری که در روز است، چهار رقمی باشد باید مقدار روز و سال را با هم عوض کرد - if day > 999: - day = int(re.search(r'\d+', date_parts[0]).group()) - year = int(re.search(r'\d+', date_parts[2]).group()) - formal_date = "y" + str(year) + "m" + str(month) + "d" + str(day) - return formal_date, index, index, index - - if element == "/": - if index-1 >= 0 and index+1 < len(token_list) \ - and token_list[index -1].isdigit() and token_list[index+1].isdigit(): - if index+3 < len(token_list) and token_list[index+2] == "/" \ - and token_list[index + 3].isdigit(): - if int(token_list[index-1]) < 1450 and int(token_list[index+1]) < 13 and \ - int(token_list[index+3]) < 32: - formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])] - formal_date = "y" + 
str(formal_date[0]) + "m" + str(formal_date[1]) + "d" + str(formal_date[2]) - #return formal_date, index-1, index+3, index - return formal_date, index, index, index - elif int(token_list[index-1]) < 32 and int(token_list[index+1]) < 13 and \ - int(token_list[index+3]) < 1450: - formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) - #return formal_date, index-1, index+3, index - return formal_date, index, index, index - else: - formal_date = [int(token_list[index-1]), int(token_list[index+1]), int(token_list[index+3])] - formal_date = "num(" + str(formal_date[0]) + "/" + str(formal_date[1]) + "/" + str(formal_date[2]) + ")" - #return formal_date, index-1, index+3, index - return formal_date, index, index, index - elif (int(token_list[index-1]) < 1450 and int(token_list[index-1]) > 1250) and int(token_list[index+1]) < 13: - formal_date = [int(token_list[index-1]), int(token_list[index+ 1]), 0] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) - #return formal_date, index-1 , index+1, index - return formal_date, index , index, index - else: - formal_date = [int(token_list[index-1]), int(token_list[index+ 1])] - formal_date = "num(" + str(formal_date[0]) + "/" + str(formal_date[1]) + ")" - #return formal_date, index-1 , index+1, index - return formal_date, index , index, index - - if element in self.month_dict or element == "سال": - if index + 1 < len(token_list) and index - 1 > -2: - try: - start_date_index = end_date_index = 0 - if(token_list[index + 1] == 'ماه'): - if(token_list[index + 2] == 'سال' and str(token_list[index + 3]).isdigit()): - - if(int(token_list[index + 3])==1000): - # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد - # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم - start_date_index = index - 1 - day = int(token_list[index - 1]) - if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): - day += int(token_list[index - 3])# رقم دهگان روز - start_date_index = index - 3 - year = 1000 - if(len(token_list)>= index + 5): - if(token_list[index + 4]=='و' and token_list[index + 5].isdigit()): - year += int(token_list[index + 5])# رقم صدگان سال - end_date_index = index + 5 - if(len(token_list)>= index + 7): - if(token_list[index + 6]=='و' and token_list[index + 7].isdigit()): - year += int(token_list[index + 7])# رقم دهگان سال - end_date_index = index + 7 - if(len(token_list)>= index + 9): - if(token_list[index + 8]=='و' and token_list[index + 9].isdigit()): - year += int(token_list[index + 9])# رقم یکان سال - end_date_index = index + 9 - - formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 8 تیر ماه سال 1000 و 300 و 20 و 5 - - else: - start_date_index = index - 1 - end_date_index = index + 3 - day = int(token_list[index - 1]) - if(token_list[index - 2]=='و' and int(token_list[index - 1])<10 and int(token_list[index - 3])<31 and token_list[index - 3].isdigit()): - start_date_index = index - 3 - day += int(token_list[index - 3])# رقم دهگان روز - - formal_date = [day, int(self.month_dict[token_list[index]]), int(token_list[index + 3])]# مثلا 20 و 7 تیر ماه سال 1368 - - - formal_date = [day, int(self.month_dict[token_list[index]]), int(token_list[index + 3])]# مثلا 27 تیر ماه سال 1368 - - elif(str(token_list[index + 2]).isdigit()): - if(int(token_list[index + 2])==1000): - - if token_list[index-1].strip() == 'ام': 
# مثلا 30 ام تیر ماه 1000 و 300 و 80 و 2 - start_date_index = index - 2 - day = int(token_list[index - 2]) - else: - # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد - # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم - start_date_index = index - 1 - day = int(token_list[index - 1]) - - if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): - day += int(token_list[index - 3])# رقم دهگان روز - start_date_index = index - 3 - year = 1000 - if(len(token_list)>= index + 4): - if(token_list[index + 3]=='و' and token_list[index + 4].isdigit()): - year += int(token_list[index + 4])# رقم صدگان سال - end_date_index = index + 4 - if(len(token_list)>= index + 6): - if(token_list[index + 5]=='و' and token_list[index + 6].isdigit()): - year += int(token_list[index + 6])# رقم دهگان سال - end_date_index = index + 6 - if(len(token_list)>= index + 8): - if(token_list[index + 7]=='و' and token_list[index + 8].isdigit()): - year += int(token_list[index + 8])# رقم یکان سال - end_date_index = index + 8 - formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 7 تیر ماه 1000 و 300 و 20 و 5 - - else: - formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])]# مثلا 27 تیر ماه سال 1368 - start_date_index = index - 1 - end_date_index = index + 2 - #formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])] # مثلا 27 تیر ماه 1368 - - elif(token_list[index + 1] == 'سال' and (not(token_list[index + 2]).isdigit())): - - formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])] # مثلا 27 تیر سال 1368 - start_date_index = index - 1 - end_date_index = index + 2 - else: - if(token_list[index + 1] == 'سال' and str(token_list[index + 2]).isdigit()): - - if(int(token_list[index + 2])==1000): - - # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد - # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم - start_date_index = index - 1 - day = int(token_list[index - 1]) - if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): - day += int(token_list[index - 3])# رقم دهگان روز - start_date_index = index - 3 - year = 1000 - if(len(token_list)>= index + 4): - if(token_list[index + 3]=='و' and token_list[index + 4].isdigit()): - year += int(token_list[index + 4])# رقم صدگان سال - end_date_index = index + 4 - if(len(token_list)>= index + 6): - if(token_list[index + 5]=='و' and token_list[index + 6].isdigit()): - year += int(token_list[index + 6])# # رقم دهگان سال !!! 
برای تاریخ بین 1399 تا 1410 هم همین روال درست بر می گرداند، هرچند منطق غلطی دارد چون مثلا سال 1402 رقم صدگان ندارد - end_date_index = index + 6 - if(len(token_list)>= index + 8): - if((len(token_list)>= index + 7 and token_list[index + 7]=='و') and (len(token_list)>= index + 8 and token_list[index + 8].isdigit())): - year += int(token_list[index + 8])# رقم یکان سال - end_date_index = index + 8 - - formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 8 تیر ماه سال 1000 و 300 و 20 و 5 - - else: - formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 2])]# مثلا 27 تیر سال 1368 - start_date_index = index - 1 - end_date_index = index + 2 - - elif(str(token_list[index + 1]).isdigit()): - if(int(token_list[index + 1])==1000): - # در این حالت، امکان دارد روز مربوط به این تاریخ هم به صورت حروف در متن قانون نوشته شده باشد - # به همین دلیل ، توکن های قبل از ماه را نیز کنترل می کنیم - start_date_index = index - 1 - day = int(token_list[index - 1]) - if(token_list[index - 2]=='و' and token_list[index - 3].isdigit()): - day += int(token_list[index - 3])# رقم دهگان روز - start_date_index = index - 3 - year = 1000 - if(len(token_list)>= index + 3): - if(token_list[index + 2]=='و' and token_list[index + 3].isdigit()): - year += int(token_list[index + 3])# رقم صدگان سال - end_date_index = index + 3 - if(len(token_list)>= index + 5): - if(token_list[index + 4]=='و' and token_list[index + 5].isdigit()): - year += int(token_list[index + 5])# رقم دهگان سال !!! برای تاریخ بین 1399 تا 1410 هم همین روال درست بر می گرداند، هرچند منطق غلطی دارد چون مثلا سال 1402 رقم صدگان ندارد - end_date_index = index + 5 - if(len(token_list)>= index + 7): - if(token_list[index + 6]=='و' and token_list[index + 7].isdigit()): - year += int(token_list[index + 7])# رقم یکان سال - end_date_index = index + 7 - formal_date = [day, int(self.month_dict[token_list[index]]), year]# مثلا 20 و 7 تیر ماه 1000 و 300 و 20 و 5 - else: - formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 1])]# مثلا 27 تیر 1320‌ - start_date_index = index - 1 - end_date_index = index + 1 - - else: - formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), int(token_list[index + 1])]# مثلا 27 تیر 1365 - start_date_index = index - 1 - end_date_index = index + 1 - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) - if token_list[index - 1] and token_list[index + 1]: - return formal_date, start_date_index, end_date_index, start_date_index - - except Exception as e: - error = e.args[0] - try: - formal_date = [int(token_list[index - 1]), int(self.month_dict[token_list[index]]), 0] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0])#مثلا y1358m12d0 - return formal_date, index-1, index, index-1 - except: - try: - # مثلا سال 1400 - if token_list[index] == "سال": - formal_date = [int(token_list[index + 1]),0, 0] - formal_date = "y" + str(formal_date[0]) + "m" + str(formal_date[1]) + "d" + str(formal_date[2])# y1358m0d0 - return formal_date, index, index+1, index - elif token_list[index+1] == "ماه" and index + 1 < len(token_list): - if index + 2 < len(token_list) and token_list[index + 2].isdigit() and \ - int(token_list[index-2]) < 1450: - formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+2])] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) # y0m12d5 
- return formal_date, index, index+2, index - else: - formal_date = [0,int(self.month_dict[token_list[index]]),0] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) #y0m12d0 - return formal_date, index, index+1, index - elif index + 1 < len(token_list) and token_list[index + 1].isdigit() and int(token_list[index-2]) < 1450: - formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+1])] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) # y0m12d5 - return formal_date, index, index+1, index - elif index-2 >= 0 and index+2 < len(token_list) and \ - token_list[index - 1] == "/" and token_list[index+1] == "/" and \ - token_list[index - 2].isdigit() and int(token_list[index-2]) < 32 and \ - token_list[index + 2].isdigit() and int(token_list[index+2]) < 1450: - formal_date = [int(token_list[index-2]),int(self.month_dict[token_list[index]]),int(token_list[index+2])] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) - return formal_date, index-2, index+2, index-2 - elif index + 2 < len(token_list) and token_list[index + 1] == 'سال' and \ - token_list[index + 2].isdigit() and int(token_list[index-2]) < 1450 : - formal_date = [0,int(self.month_dict[token_list[index]]),int(token_list[index+2])] - formal_date = "y" + str(formal_date[2]) + "m" + str(formal_date[1]) + "d" + str(formal_date[0]) - return formal_date, index, index+1, index - #else: - # print("Name : %s -> after1: %s -> after2: %s -> after3: %s" % (token_list[index], token_list[index+1], token_list[index+2], token_list[index+3])) - except: - pass - - def general_normalize_date(self,normalized_string, additional_token_index_array): - # 24 /12 /1401 جهت بررسی الگوی تاریخ - normalized_string, recognized_dates = DateNormalizer.separated_date_format_finder(normalized_string) - normalized_string_list = normalized_string.strip().split() - # جهت یافتن فرمت های مختلف تاریخ غیر از فرمت بررسی شده در بالا - normalized_string, dates, recognized_dates_2 = \ - self.normalize_dates(normalized_string_list, additional_token_index_array, - token_list_len = len(normalized_string_list), previous_slice_index_array=[0,]) - # تجمیع همه تاریخ ها - for item in recognized_dates_2: - recognized_dates.append(item) - return normalized_string, dates, recognized_dates - - # 24 /12 /1401 متد بررسی الگوی تاریخ - def separated_date_format_finder(normalized_string): - import re - date_pattern = r'\d{1,2} /\d{1,2} /\d{2,4}' - regex = re.compile(date_pattern) - match_dates = regex.finditer(normalized_string) - recognized_dates = [] - for date_item in match_dates: - position = date_item.span() - founded_item = date_item.group() - start_index = date_item.start() + 1 - end_index = date_item.end() - 1 - current_date = founded_item.replace(' ', '') - date_parts = current_date.split('/') - formal_date = "y" + str(date_parts[2]) + "m" + str(date_parts[1]) + "d" + str(date_parts[0]) - pattern_tokens_state = gf.token_state_finder(normalized_string, start_index, end_index) - recognized_dates.append( - { - "original_date" : founded_item, - "date" : formal_date, - "start_index" : start_index, - "end_index" : end_index, - "date_token_index" : pattern_tokens_state["start_token_state"], - "start_date_token_index": pattern_tokens_state["start_token_state"], - "end_date_token_index" : pattern_tokens_state["end_token_state"], - }) - normalized_string_list = normalized_string.strip().split() - for date_item in recognized_dates: - 
normalized_string_list[int(date_item['date_token_index'])] = date_item['date'] - normalized_string_list[int(date_item['start_date_token_index'])+1] = 'tttt' - normalized_string_list[int(date_item['end_date_token_index'])] = 'tttt' - normalized_string_temp = '' - for token in normalized_string_list: - normalized_string_temp = normalized_string_temp + ' ' + token - return normalized_string_temp.strip(), recognized_dates - - def normalize_dates(self, token_list, additional_token_index_array, token_list_len, previous_slice_index_array= []): - try: - finded = self.find_date_part(token_list) - except Exception as e: - finded = None - - filename = 'law_section_errors.txt' - exc_type, exc_value, exc_traceback = sys.exc_info() - frame = exc_traceback.tb_frame - function_array = traceback.extract_tb(exc_traceback) - current_function = function_array[len(function_array)-1] - error = f''' - filename : {current_function.filename} - function : {current_function.name} - err line no : {current_function.lineno} - err line : {current_function.line} - err message : {e} - ''' - gf.save_error(error, filename) - - recognized_dates = [] - if finded != None: - date_part = finded[0] - start_date_token_index = finded[1] - end_date_token_index = finded[2] - date_token_index = finded[3] - befor_date_part = " ".join(x for x in token_list[:start_date_token_index]) - after_date_part = [x for x in token_list[end_date_token_index + 1:]] - previous_slice_index = token_list_len - len(after_date_part) - previous_slice_index_array.append(previous_slice_index) - after_normalized, after_dates, recognized_dates = self.normalize_dates(after_date_part, additional_token_index_array, token_list_len, previous_slice_index_array) - if after_dates == '': - after_dates = [] - if recognized_dates == '': - recognized_dates = [] - after_dates.insert(0, date_part) - previous_slice_index = previous_slice_index_array.pop(len(previous_slice_index_array)-2) - recognized_dates.append( - { - "date" : date_part, - "date_token_index" : date_token_index + previous_slice_index, - "start_date_token_index": start_date_token_index + previous_slice_index, - "end_date_token_index" : end_date_token_index + previous_slice_index - }) - - i = 0 - while((date_token_index - start_date_token_index) > i ): - befor_date_part = ''.join([befor_date_part," tttt"]) # به عنوان توکنی که از جنس تاریخ بوده t - i += 1 - i = 0 - while((end_date_token_index - date_token_index) > i ): - date_part = ''.join([date_part," tttt"]) # به عنوان توکنی که از جنس تاریخ بوده t - i += 1 - - return befor_date_part + " " + date_part + " " + after_normalized, after_dates, recognized_dates - else: - return " ".join(x for x in token_list), '','' - - def list2num(self, numerical_section_list): - value = 1 - l = len(numerical_section_list)>3 - if l : - value = 0 - for index, el in enumerate(numerical_section_list): - if self.is_number(el): - value = self.num_dict[el] - elif el == '000': - #value *= 1000 - pass - elif l == True: - value += float(el) - else: - value *= float(el) - return value - - def convert2num(self, numerical_section_list): - value = 0 - tmp_section_list = [] - for index, el in enumerate(numerical_section_list): - if self.is_number(el) or (el.replace('.', '', 1).isdigit()): - tmp_section_list.append(el) - elif el == "و": - value += self.list2num(tmp_section_list) - tmp_section_list[:] = [] - if len(tmp_section_list) > 0: - value += self.list2num(tmp_section_list) - tmp_section_list[:] = [] - try: - if (value-int(value) == 0): - return int(value) - else: - return value - except: 
- return 0 - - - def is_number(self, word): - return word in self.num_dict - - def find_number_location(self, token_list, addWithVaa = False): - start_index = 0 - number_section =[] - for i , el in enumerate(token_list): - if self.is_number(el) or (el.replace('.', '', 1).isdigit()): - start_index = i - number_section.append(start_index) - break - - i = start_index+1 - while(i < len(token_list)): - if token_list[i] == "و" and (i+1)= 0: - # تشخیص اینکه یک سلسله از اعداد که با واو از هم جدا شده اند تبصره های مربوط به یک ماده نباشند - is_tabsare_numbers = (token_list[token_index-2] + ' ' + token_list[token_index-1]=='تبصره های') - if token_index - 1 >= 0: - # تشخیص اینکه یک سلسله از اعداد که با واو از هم جدا شده اند مواد یک قانون نباشند - is_mavad_numbers = (token_list[token_index-1]=='مواد') - if token_index - 1 >= 0: - # تشخیص اینکه یک سلسله از اعداد که با واو از هم جدا شده اند ماده های یک قانون نباشند - is_madde_numbers = (token_list[token_index-1]=='ماده') - if is_tabsare_numbers or is_mavad_numbers or is_madde_numbers: - flag = False - start_token_index = end_token_index = token_index - number_parts_array.clear() - current_token_index = token_index - number_parts_array.append(token) - if current_token_index + 1 < len(token_list): - while ((flag) and (token_list[current_token_index+1] == 'و' or token_list[current_token_index+1].isdigit())): - number_parts_array.append(token_list[current_token_index+1]) - current_token_index += 1 - end_token_index = current_token_index - if not (current_token_index + 1 < len(token_list)): - break - - final_number = number_completter(number_parts_array) - token_list[token_index] = str(final_number) - recognized_numbers.append({ - 'number_value' : final_number, - 'number_token_list': number_parts_array, - 'start_token_index': start_token_index, - 'end_token_index' : end_token_index - }) - # جایگذاری مقدار رشته بی ارزش به جای توکن های مربوط به عدد جاری - # به منظور اینکه متن اصلی از نظر تعداد توکن تغییر نکند - for i in range(token_index + 1 , len(number_parts_array) + token_index): - token_list[i] = 'nnnn' - converted_string = converted_string + ' ' + token_list[token_index] - if token_index + 1 < len(token_list): - if flag == False and (token_list[token_index+1]=='و'): - flag = False - else: - flag = True - else: - converted_string = converted_string + ' ' + token - - return converted_string, recognized_numbers - - - - - def remove_additional_tokens(self,token_list,additional_token_index_array): - converted_string = '' - for index in additional_token_index_array: - del token_list[index] - - for token in token_list: - converted_string = converted_string + ' ' + token - return converted_string - -class PinglishNormalizer(): - def __init__(self): - self.data_helper = DataHelper() - self.file_dir = os.path.dirname(os.path.realpath(__file__)) + "/" - - self.en_dict_filename = self.file_dir + "resource/tokenizer/enDict" - self.en_dict = self.data_helper.load_var(self.en_dict_filename) - - self.fa_dict_filename = self.file_dir + "resource/tokenizer/faDict" - self.fa_dict = self.data_helper.load_var(self.fa_dict_filename) - - - def pingilish2persian(self, pinglish_words_list): - - for i, word in enumerate(pinglish_words_list): - if word in self.en_dict: - pinglish_words_list[i] = self.en_dict[word]#.decode("utf-8") - #inp = inp.replace(word, enDict[word], 1) - else: - ch = self.characterize(word) - pr = self.map_char(ch) - amir = self.make_word(pr) - for wd in amir: - am = self.escalation(wd) - asd = ''.join(am) - if asd in self.fa_dict: - pinglish_words_list[i] = 
asd#.decode("utf-8") - #inp = inp.replace(word, asd, 1) - inp = " ".join(x for x in pinglish_words_list) - return inp - - def characterize(self, word): - list_of_char = [] - i = 0 - while i < len(word): - char = word[i] - sw_out = self.switcher(char) - if (sw_out == None): - esp_out = None - if(i < len(word) - 1): - esp_out = self.esp_check(word[i], word[i + 1]) - if(esp_out == None): - list_of_char.append(word[i]) - else: - list_of_char.append(esp_out) - i += 1 - else: - list_of_char.append(sw_out) - i += 1 - return list_of_char - - def switcher(self, ch): - switcher = { - "c": None, - "k": None, - "z": None, - "s": None, - "g": None, - "a": None, - "u": None, - "e": None, - "o": None - } - return switcher.get(ch, ch) - - def esp_check(self, char1, char2): - st = char1 + char2 - if (st == "ch"): - return "ch" - elif (st == "kh"): - return "kh" - elif (st == "zh"): - return "zh" - elif (st == "sh"): - return "sh" - elif (st == "gh"): - return "gh" - elif (st == "aa"): - return "aa" - elif (st == "ee"): - return "ee" - elif (st == "oo"): - return "oo" - elif (st == "ou"): - return "ou" - else: - return None - - def map_char(self, word): - listm = [] - sw_out = self.map_switcher(word[0]) - i = 0 - if (sw_out == None): - listm.append(["ا"]) - i += 1 - if (word[0] == "oo"): - listm.append(["او"]) - i += 1 - while i < len(word): - listm.append(self.char_switcher(word[i])) - i += 1 - if word[len(word) - 1] == "e": - listm.append(["ه"]) - elif word[len(word) - 1] == "a": - listm.append(["ا"]) - elif word[len(word) - 1] == "o": - listm.append(["و"]) - elif word[len(word) - 1] == "u": - listm.append(["و"]) - - return listm - - def map_switcher(self, ch): - switcher = { - "a": None, - "e": None, - "o": None, - "u": None, - "ee": None, - - "ou": None - } - return switcher.get(ch, ch) - - def make_word(self, chp): - word_list = [[]] - for char in chp: - word_list_temp = [] - for tmp_word_list in word_list: - for chch in char: - tmp = copy.deepcopy(tmp_word_list) - tmp.append(chch) - word_list_temp.append(tmp) - word_list = word_list_temp - return word_list - - def escalation(self, word): - tmp = [] - i = 0 - t = len(word) - while i < t - 1: - tmp.append(word[i]) - if word[i] == word[i + 1]: - i += 1 - i += 1 - if i != t: - tmp.append(word[i]) - return tmp - - def char_switcher(self, ch): - switcher = { - 'a': ["", "ا"], - 'c': ["ث", "ص", "ص"], - 'h': ["ه", "ح"], - 'b': ["ب"], - 'p': ["پ"], - 't': ["ت", "ط"], - 's': ["س", "ص", "ث"], - 'j': ["ج"], - 'ch': ["چ"], - 'kh': ["خ"], - 'q': ["ق", "غ"], - 'd': ["د"], - 'z': ["ز", "ذ", "ض", "ظ"], - 'r': ["ر"], - 'zh': ["ژ"], - 'sh': ["ش"], - 'gh': [",ق", "غ"], - 'f': ["ف"], - 'k': ["ک"], - 'g': ["گ"], - 'l': ["ل"], - 'm': ["م"], - 'n': ["ن"], - 'v': ["و"], - 'aa': ["ا"], - 'ee': ["ی"], - 'oo': ["و"], - 'ou': ["و"], - 'i': ["ی"], - 'y': ["ی"], - ' ': [""], - 'w': ["و"], - 'e': ["", "ه"], - 'o': ["", "و"] - } - return switcher.get(ch, "") - -# این روال جهت جمع و ضرب کردن بخش های مختلف از صدگان و هزارگان و بالاتر از یک عدد که با حروف در متن وجود دارد کار می کند -def number_completter2(number_parts_array): - zarb_value = 0 - temp_number_array = [] - sum_number_parts = 0 - final_number = 0 - for index,item in enumerate(number_parts_array): - if item.isdigit(): - temp_number_array.append(int(item)) - if index + 1 >= len(number_parts_array): - for item3 in temp_number_array: - final_number += int(item3) - return final_number - current_value = number_parts_array[index+1] - if (current_value.isdigit() and (int(current_value) == 1000 or int(current_value) == 1000000 or 
int(current_value) == 1000000000)):# or int(current_value) == 1000000 - zarb_value = int(number_parts_array[index+1]) - for num_item in temp_number_array: - sum_number_parts += num_item - final_number = sum_number_parts * zarb_value - temp_array2 = [] - x = index + 2 - while x < len(number_parts_array): - # for x in range(index+2,len(number_parts_array)): - temp_array2.append(number_parts_array[x]) - if x + 1 < len(number_parts_array): - if number_parts_array[x+1].isdigit() and (int(number_parts_array[x+1]) == 1000 or int(number_parts_array[x+1]) == 1000000 or int(number_parts_array[x+1]) == 1000000000): - if int(number_parts_array[x+1]) > zarb_value: # 1000000>1000 - zarb_value = int(number_parts_array[x+1]) - # تابع بازگشتی برای محاسبه عدد نهایی - final_number += number_completter2(temp_array2) - final_number *= zarb_value - temp_array2.clear() - zarb_value = 0 - x += 2 # به این دلیل که مقدار ضرب را در آرایه تمپ ذخیره نکند - else: # 1000!>1000000 - zarb_value = int(number_parts_array[x+1]) - temp_num = 0 - if (zarb_value == 1000 or zarb_value == 1000000) and len(number_parts_array) > x + 1: - num_array = number_parts_array[x+2:len(number_parts_array)] - temp_num = number_completter(num_array) - if temp_num == None: - temp_num = 0 - - '''for i in range(x+2,len(number_parts_array)): - if(number_parts_array[i].isdigit()): - temp_num += int(number_parts_array[i]) - i+=1''' - temp_num2 = 1 - for num_item2 in temp_array2: - if(num_item2.isdigit()): - temp_num2 += int(num_item2) - - temp_num2 *= zarb_value - final_number += temp_num + temp_num2 - break - else: - x += 1 # - else: # اگر از رنج آرایه خارج شدیم - temp_num = 0 - # مقادیر موجود در آرایه تمپ را جمع کن - for num_item3 in temp_array2: - if(num_item3.isdigit()): - temp_num += int(num_item3) - # حاصل جمع اعداد موجود در آرایه ذخیره را در عدد نهایی که قبلا داشته ایم ضرب کن و از حلقه خارج شو - final_number *= temp_num - #final_number += temp_num - break - return final_number - -# این روال جهت جمع و ضرب کردن بخش های مختلف از صدگان و هزارگان و بالاتر از یک عدد که با حروف در متن وجود دارد کار می کند -def number_completter(number_parts_array): - zarb_value = 0 - previous_zarb_value = 0 - previous_number_parts_sum = 0 - temp_number_array = [] - number_parts_sum = 0 - final_number = 0 - current_number_part = 0 - for index,item in enumerate(number_parts_array): - if item.isdigit(): - if (not(int(item) == 1000 or int(item) == 1000000 or int(item) == 1000000000)): - temp_number_array.append(item) - continue - elif((int(item) == 1000 or int(item) == 1000000 or int(item) == 1000000000)): - zarb_value = int(item) - for num_item in temp_number_array: - number_parts_sum += int(num_item) - if number_parts_sum == 0 and previous_number_parts_sum == 0: - number_parts_sum = 1 - temp_number_array.clear() - - else:# for example 952 - zarb_value = 1 - for num_item in temp_number_array: - number_parts_sum += int(num_item) - current_number_part = number_parts_sum + previous_number_parts_sum - continue - if previous_zarb_value < zarb_value:# for example 1000 < 1000000000 - current_number_part = previous_number_parts_sum + number_parts_sum - current_number_part = zarb_value * current_number_part - else:# previous_zarb_value > zarb_value - if number_parts_sum == 0: - number_parts_sum = 1 - current_number_part = zarb_value * number_parts_sum - current_number_part += previous_number_parts_sum - - previous_number_parts_sum = current_number_part - current_number_part = 0 - previous_zarb_value = zarb_value - number_parts_sum = 0 - if len(temp_number_array) != 0: - 
remained_parts_sum = 0 - for num_item in temp_number_array: - remained_parts_sum += int(num_item) - final_number = previous_number_parts_sum + remained_parts_sum - return final_number - - final_number = previous_number_parts_sum - return final_number - - - - - - \ No newline at end of file diff --git a/p1_classifier.py b/p1_classifier.py index b3fffe1..68693eb 100644 --- a/p1_classifier.py +++ b/p1_classifier.py @@ -4,20 +4,18 @@ """ from transformers import pipeline from normalizer import cleaning -from elastic_helper import ElasticHelper import transformers import json import datetime import pandas as pd from transformers import AutoTokenizer -print(transformers.__version__) +print(f'transformers version: {transformers.__version__}') # finetuned model for classification path model_checkpoint = './models/classifier/findtuned_classification_hoosh_with_path_v2__30/checkpoint-1680' tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) - -# window_size = tokenizer.model_max_length#512#200 +print(f'Classification Model Loaded: {model_checkpoint}') window_size = 512 """ (یعنی سایز پنجره) به این دلیل که تعداد توکن های ورودی به مدل، محدود به متغیر بالاست @@ -27,8 +25,8 @@ window_size = 512 step_size = 350#100 # تعداد کلاس هایی که بازای هر سکشن از مدل درخواست می کنیم Top_k = 4 - -classifier = pipeline("text-classification", model_checkpoint, framework="pt") +# set device = 0 => to use GPU +classifier = pipeline("text-classification", model_checkpoint, framework="pt", device=0) def get_class(sentences, top_k:int=4): # sentences = cleaning(sentences) @@ -177,14 +175,15 @@ def do_classify(sections): for index, id in enumerate(sections): - source = sections[id]['source'] + source = sections[id] classification_result, classification_status, desc = single_section_classification(id, source) if not classification_status: print(f'id: {id} classification error. 
error description: {desc}') # ساماندهی کلاس های پیش بینی شده در عنوان بهترین کلاس و دیگر کلاسها بر اساس امتیاز تخمین مدل و ذخیره در دیکشنری - new_sections_dict[id] = classification_result + # new_sections_dict[id] = classification_result + sections[id]['ai_codes'] = classification_result """ برای حالت تست که می خواهیم عملکرد مدل کلاسیفایر را ارزیابی کنیم، بدین جهت که تنوعی از قوانین مختلف را بررسی کنیم، عنوان قوانین را ذخیره می کنیم تا از تکرار بررسی سکشن های متعدد از یک قانون پرهیز شود""" # qanon_title = source['qanon_title'] @@ -198,8 +197,8 @@ def do_classify(sections): print(f'end: {datetime.datetime.now()}') print('classification finished!') - classified_sections_dict = new_sections_dict + # classified_sections_dict = new_sections_dict - return classified_sections_dict + return sections diff --git a/p2_ner_recognizer.py b/p2_ner_recognizer.py index ec1f85e..aea486e 100644 --- a/p2_ner_recognizer.py +++ b/p2_ner_recognizer.py @@ -15,7 +15,7 @@ model = "./models/ner/2025-07-22--20-44-37--HooshvareLab--bert-fa-base-uncased-n tagger = SequenceTagger.load(model) print('model read and tagger initialized') -today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}' +today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}' def prepare_data(ner_obj_list): ner_data_list = [] @@ -208,7 +208,7 @@ def do_ner_recognize(sections): ner_data_list = prepare_data(ner_obj_list) sections[id]['ners_v2'] = ner_data_list print(f'ner process: {index+1}/{len_sections}') - print(f'len_sections ner recognization finished!') + print(f'{len_sections} ner recognization finished!') return sections diff --git a/p3_words_embedder.py b/p3_words_embedder.py index 4a00214..a5389d5 100644 --- a/p3_words_embedder.py +++ b/p3_words_embedder.py @@ -6,7 +6,8 @@ import json import datetime import numpy as np -today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}' +date = datetime.datetime.now() +today = f'{date.year}-{date.month}-{date.day}-{date.hour}' model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25 # model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'#87-30 @@ -19,14 +20,16 @@ model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25 model = SentenceTransformer(model_name) def do_word_embedder(sections): - for index, item in enumerate(sections): - embeddings = single_section_embedder(sections[item]['content']) - sections[item]['embeddings'] = embeddings + for index, id in enumerate(sections): + embeddings = single_section_embedder(sections[id]['content']) + sections[id]['embeddings'] = embeddings.tolist() with open(f'./data/embeddings/sections_embeddings_{today}.json', 'w', encoding='utf-8') as output_file: data = json.dumps(sections, ensure_ascii=False) output_file.write(data) + return sections + def single_section_embedder(sentence): """ این متد، متن ورودی را تبدیل به بردار متناظر آن می کند diff --git a/p4_keyword_extractor.py b/p4_keyword_extractor.py index 2fd7fd1..530d59d 100644 --- a/p4_keyword_extractor.py +++ b/p4_keyword_extractor.py @@ -15,7 +15,7 @@ os.environ['HF_HOME'] = "/home/admin/HFHOME" model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct" #model_id = "meta-llama/Llama-3.1-70B-Instruct" -today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}' +today = 
f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}' bnb_config = BitsAndBytesConfig( load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16 @@ -61,7 +61,7 @@ def generate(formatted_prompt): # Gemini Prompt USER_PROMPT = f"""از متن ارائه شده، حداقل {keywords_count} عبارت کلیدی مهم و معنادار را استخراج کنید. خروجی باید به صورت زیر باشد: • یک لیست فارسی -• شماره ترتیبی در ابتدای هر عبارت +• هیچ عدد یا علامت و نمادی در ابتدا یا انتهای هر عبارت کلیدی قرار نگیرد • هر عبارت در یک خط جداگانه • بدون هیچ توضیح اضافی در ابتدا یا انتهای پاسخ موارد زیر در استخراج عبارات کلیدی الزامی است: @@ -94,7 +94,7 @@ def generate(formatted_prompt): max_new_tokens=2048, eos_token_id=terminators, do_sample=True, - temperature=0.6, + temperature=0.1, top_p=0.9, ) response = outputs[0][input_ids.shape[-1]:] @@ -168,9 +168,9 @@ def do_keyword_extract(sections): period_ids_text += f"{id} \n" - print(f"section: {counter}-id: {id}") + print(f"section kw extracting: {counter} - id: {id}") # temp_dict.append(item) - if counter % 1000 == 0: + if counter % 5000 == 0: outputfile = open(f'./data/keyword/sections_kw_llama8b_{str(file_counter)}_{today}.json', "a+", encoding='utf-8') outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=2)) outputfile.close() @@ -196,12 +196,13 @@ def do_keyword_extract(sections): print(f"elapsed time: {(end_time-start_time)/86400} Days!!! ") print("end") - return True + operation_result = True + return operation_result, sections if __name__ == "__main__": print(f'start: {datetime.datetime.now()}') sections = get_sections() - sections = do_keyword_extract(sections) + operation_result = do_keyword_extract(sections) print(f'end: {datetime.datetime.now()}') \ No newline at end of file diff --git a/p5_simplifier.py b/p5_simplifier.py index add1944..f4bf981 100644 --- a/p5_simplifier.py +++ b/p5_simplifier.py @@ -6,7 +6,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer import torch import json -today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}' +today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}' if torch.cuda.is_available(): model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct" @@ -56,17 +56,20 @@ def single_section_representation(content): outputs = model.generate( input_ids, - max_new_tokens=500, + max_new_tokens=2048, eos_token_id=terminators, do_sample=True, - temperature=0.7, + temperature=0.1, top_p=0.85, ) response = outputs[0][input_ids.shape[-1]:] sentences = tokenizer.decode(response, skip_special_tokens=True) + # حذف جملات تکراری + sentences = list(set(sentences)) + result = True - desc = 'operation successful' + desc = 'Operation successful' return result, desc, sentences except Exception as error: @@ -77,7 +80,7 @@ def single_section_representation(content): def do_representation(sections): print(f"start time: {datetime.datetime.now()}") - for index, id in sections: + for index, id in enumerate(sections): result, desc, sentences = single_section_representation(sections[id]['content']) if not result: error_content = f'id: {id} - error: {desc}\n' @@ -85,11 +88,16 @@ def do_representation(sections): file.write(error_content) sections[id]['represented_sentences'] = sentences - + print(f'representation process. 
section {index+1}/{len(sections)} - id: {id}')
+
+    with open(f'./data/represent/sections_represent_llama8b_{today}.json', "w", encoding='utf-8') as outputfile:
+        outputfile.write(json.dumps(sections, ensure_ascii=False, indent = 4))
     print(f"end time: {datetime.datetime.now()}")
     print(" *** finished! *** ")
+    operation_result = True
+    return operation_result, sections
 
 if __name__ == "__main__":
     pass
\ No newline at end of file
diff --git a/readme/readme-classifier.md b/readme/readme-classifier.md
new file mode 100644
index 0000000..e27f201
--- /dev/null
+++ b/readme/readme-classifier.md
@@ -0,0 +1,71 @@
+# Section Classification Script
+
+This project provides a Python script (`p1_classifier.py`) for classifying text sections using a fine-tuned transformer model. The script is designed to suggest the most relevant classes for each section of text, which is useful for legal documents, content categorization, and similar NLP tasks.
+
+## Requirements
+
+Before using this script, please install the required libraries:
+
+```bash
+pip install transformers pandas
+```
+
+You also need a fine-tuned classification model and its tokenizer. Update the `model_checkpoint` path in the script to point to your model.
+
+## How It Works
+
+- The script loads a fine-tuned transformer model for text classification.
+- It processes each section of text, splitting long texts into overlapping windows when they exceed the model's input size.
+- For each section, it predicts the top classes and saves the results.
+
+## Main Functions
+
+- `get_class(sentences, top_k=4)`: Classifies a sentence or text and returns the top `k` classes.
+- `mean_classes(input_classes)`: Aggregates class results from multiple windows of a long text.
+- `get_window_classes(text)`: Splits long texts into windows and aggregates their classification results.
+- `single_section_classification(id, section_source)`: Classifies a single section and returns the best and other suggested classes.
+- `do_classify(sections)`: Classifies all sections in a dictionary and saves the results to a JSON file.
+
+## Usage Example
+
+Suppose you have your sections data as a dictionary:
+
+```python
+sections = {
+    "1": {"content": "First section text", "other_info": {"full_path": "..."}, "qanon_title": "..."},
+    "2": {"content": "Second section text", "other_info": {"full_path": "..."}, "qanon_title": "..."}
+}
+```
+
+You can classify all sections as follows:
+
+```python
+from p1_classifier import do_classify
+
+result = do_classify(sections)
+```
+
+After running, the results will be saved in a JSON file in the `./data/classification/` directory.
+
+## Output Structure
+
+Each section will have a new field `ai_codes` with the classification results:
+
+```json
+"1": {
+    "content": "First section text",
+    "ai_codes": {
+        "best-class": {"label": "ClassA", "score": 0.85},
+        "other-classes": [
+            {"label": "ClassB", "score": 0.10},
+            {"label": "ClassC", "score": 0.05}
+        ]
+    }
+}
+```
+
+## Notes
+
+- Make sure the model path in `model_checkpoint` is correct and the model files are available.
+- The script supports Persian and other languages, depending on your model.
+- The output JSON file will be saved in `./data/classification/`.
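+## Example: Windowed Classification (Illustrative)
+
+The sliding-window behaviour described under "How It Works" can be sketched as follows. The values `window_size=512` and `step_size=350` mirror the constants defined in `p1_classifier.py`, but the helper name `classify_long_text`, the use of the pipeline's `top_k`/`truncation` arguments, and the plain per-label score averaging are simplifications for illustration; see `get_window_classes` and `mean_classes` for the script's actual behaviour.
+
+```python
+# Illustrative sketch only: split a long text into overlapping token windows,
+# classify each window, and average the scores per label across windows.
+from collections import defaultdict
+
+def classify_long_text(text, classifier, tokenizer, window_size=512, step_size=350, top_k=4):
+    tokens = tokenizer.tokenize(text)
+    if not tokens:
+        return []
+    # Overlapping windows; window_size should stay within the model's input limit.
+    windows = [tokens[i:i + window_size] for i in range(0, len(tokens), step_size)]
+    scores = defaultdict(list)
+    for window in windows:
+        window_text = tokenizer.convert_tokens_to_string(window)
+        # truncation=True guards against special tokens pushing past the model limit
+        for pred in classifier(window_text, top_k=top_k, truncation=True):
+            scores[pred["label"]].append(pred["score"])
+    averaged = [{"label": label, "score": sum(s) / len(s)} for label, s in scores.items()]
+    return sorted(averaged, key=lambda x: x["score"], reverse=True)[:top_k]
+```
+
+A call like `classify_long_text(section_text, classifier, tokenizer)` would return a ranked list of `{"label", "score"}` dictionaries comparable to the `ai_codes` entries shown above.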
diff --git a/readme/readme-words-embedder.md b/readme/readme-words-embedder.md
new file mode 100644
index 0000000..eb6dc62
--- /dev/null
+++ b/readme/readme-words-embedder.md
@@ -0,0 +1,70 @@
+# Sentence Embedding Generator
+
+This project provides a Python script (`p3_words_embedder.py`) for generating sentence embeddings using the [Sentence Transformers](https://www.sbert.net) library.
+
+## Requirements
+
+Before using this script, please install the required libraries:
+
+```bash
+pip install sentence-transformers numpy
+```
+
+## How It Works
+
+- The script uses the pre-trained model `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`.
+- There are two main functions:
+  - `single_section_embedder(sentence)`: Takes a sentence (string) and returns its embedding as a vector.
+  - `do_word_embedder(sections)`: Takes a dictionary of sections (each with a `content` field), generates embeddings for each section, and saves the results as a JSON file.
+
+## Usage
+
+### 1. Get Embedding for a Single Sentence
+
+```python
+from p3_words_embedder import single_section_embedder
+
+sentence = "This is a sample sentence."
+embedding = single_section_embedder(sentence)
+print(embedding)
+```
+
+### 2. Generate Embeddings for Multiple Sections and Save to File
+
+Suppose your data is structured like this:
+
+```python
+sections = {
+    "1": {"content": "First section text"},
+    "2": {"content": "Second section text"}
+}
+```
+
+You can generate and save embeddings as follows:
+
+```python
+from p3_words_embedder import do_word_embedder
+
+result = do_word_embedder(sections)
+```
+
+After running, a file named like `sections_embeddings_YEAR-MONTH-DAY-HOUR.json` will be created in the `./data/embeddings/` directory, containing the embeddings for each section.
+
+## Output Structure
+
+The output is a JSON file where each section has its embedding added:
+
+```json
+{
+    "1": {
+        "content": "First section text",
+        "embeddings": [0.123, 0.456, ...]
+    },
+    ...
+}
+```
+
+## Notes
+
+- Make sure the folder `./data/embeddings/` exists before running the script.
+- The script supports Persian and other languages covered by the multilingual model.
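+## Example: Using the Saved Embeddings (Illustrative)
+
+A common downstream use of these vectors is semantic search over sections. The sketch below is illustrative only: it assumes the JSON layout shown above, the file name is hypothetical (real files are date-stamped), and cosine similarity is one reasonable choice rather than something `p3_words_embedder.py` itself provides.
+
+```python
+# Illustrative sketch only: rank sections by cosine similarity to a query.
+import json
+
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
+
+# Hypothetical file name; real files look like sections_embeddings_YEAR-MONTH-DAY-HOUR.json.
+with open('./data/embeddings/sections_embeddings_2025-8-16-14.json', encoding='utf-8') as f:
+    sections = json.load(f)
+
+def cosine(a, b):
+    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+query_vec = model.encode("a sample query sentence")
+ranked = sorted(sections.items(),
+                key=lambda kv: cosine(query_vec, kv[1]['embeddings']),
+                reverse=True)
+for section_id, section in ranked[:5]:
+    print(section_id, round(cosine(query_vec, section['embeddings']), 4))
+```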