first push

2025-01-20 16:35:35 +00:00 · 2025-01-20 16:35:35 +00:00 · e73258bff3
commit e73258bff3
16 changed files with 3331 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
+*.log
+*.pyc
+*.json
--- a/conflict.py
+++ b/conflict.py
@ -0,0 +1,94 @@
+import json
+import os
+import time
+
+# HF_HOME has to be in the environment before `transformers` is imported: the
+# Hugging Face libraries resolve their cache location while they are being
+# imported, so assigning it afterwards leaves the default cache in use.
+os.environ['HF_HOME'] = "/home/admin/HFHOME"
+
+import torch
+from tqdm import tqdm
+from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+#model_id = "meta-llama/Llama-3.1-70B-Instruct"
+
+# Quantize the weights to lower GPU memory usage.
+# 4-bit alternative:
+# bnb_config = BitsAndBytesConfig(
+#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
+# )
+# 8-bit: double quantization, quant type and compute dtype are 4-bit-only
+# options (`bnb_4bit_*`); there are no `bnb_8bit_*` counterparts, so only the
+# switch itself is passed.
+bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=bnb_config
+)
+# Generation stops on either the tokenizer's EOS token or the "<|eot_id|>" token.
+terminators = [
+    tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("<|eot_id|>")
+]
+# Reuse the EOS token id as the padding id during generation.
+model.generation_config.pad_token_id = tokenizer.eos_token_id
+
+# System message sent with every request (see generate()).
+SYS_PROMPT = """
+You receive two Persian legal rule texts and analyze them carefully, explain your answer step by step, to see whether these two rules logically conflict with each other. 
+Finally, state the final conclusion for the presence or absence of conflict with the words "yes" or "no".
+"""
+
+
+def format_prompt(SENTENCE1, SENTENCE2):
+  """Build the user message that presents the two rule texts.
+
+  Args:
+    SENTENCE1: text of the first rule.
+    SENTENCE2: text of the second rule.
+
+  Returns:
+    A single string of the form "Rule 1: <first>. Rule 2: <second>.".
+  """
+  return f"Rule 1: {SENTENCE1}. Rule 2: {SENTENCE2}."
+
+
+
+def generate(formatted_prompt):
+  """Run one sampled chat completion for the given user prompt.
+
+  The prompt is sent together with SYS_PROMPT through the tokenizer's chat
+  template; only the newly generated tokens are decoded and returned.
+
+  Args:
+    formatted_prompt: the user message (see format_prompt()).
+
+  Returns:
+    The decoded completion with special tokens skipped.
+  """
+  # Cap the prompt at 50000 characters to avoid running out of GPU memory.
+  user_text = formatted_prompt[:50000]
+  chat = [
+      {"role": "system", "content": SYS_PROMPT},
+      {"role": "user", "content": user_text},
+  ]
+  # add_generation_prompt=True asks the template to open the assistant turn.
+  prompt_ids = tokenizer.apply_chat_template(
+      chat,
+      add_generation_prompt=True,
+      return_tensors="pt"
+  )
+  prompt_ids = prompt_ids.to(model.device)
+  generated = model.generate(
+      prompt_ids,
+      max_new_tokens=2048,
+      eos_token_id=terminators,
+      do_sample=True,
+      temperature=0.6,
+      top_p=0.9,
+  )
+  # The output sequence starts with the prompt; keep only what follows it.
+  prompt_length = prompt_ids.shape[-1]
+  completion_ids = generated[0][prompt_length:]
+  return tokenizer.decode(completion_ids, skip_special_tokens=True)
+
+
+def conflict(sentence1, sentence2):
+    """Ask the model whether two rule texts conflict.
+
+    Args:
+        sentence1: text of the first rule.
+        sentence2: text of the second rule.
+
+    Returns:
+        The model's answer as returned by generate().
+    """
+    return generate(format_prompt(sentence1, sentence2))
+
+
+if __name__ == "__main__":
+    # Smoke run: check one hard-coded pair of Persian rules and time the call.
+    banner = "*********************************************************"
+    first_rule = "حسین در حال حاضر در وزارت نفت و انرژی به استخدام دولت در آمده است."
+    second_rule = "حسین الان در دانشگاه دولتی مشغول به تحصیل است و هرکسی که در حال تحصیل باشد، از نظر قانون نمی تواند در دولت استخدام شود"
+
+    print('start')
+    start_time = time.time()
+    result = conflict(first_rule, second_rule)
+    end_time = time.time()
+
+    # Frame the answer between two banner lines above and below.
+    print(banner)
+    print(banner)
+    print()
+    print(result)
+    print()
+    print(banner)
+    print(banner)
+
+    print(f"elapsed time:   {end_time-start_time}")
+    print("end")
--- a/data_helper.py
+++ b/data_helper.py
@ -0,0 +1,153 @@
+
+import pickle
+import re
+import string
+import os
+
+
+class DataHelper():
+    """Helpers for text cleaning, pickling variables and building stem lexicons."""
+
+    def __init__(self):
+        pass
+
+    def clean_text(self, text_doc, new_line_elimination):
+        """Strip unwanted characters from a text while leaving its numbers intact.
+
+        Args:
+            text_doc: the text to clean.
+            new_line_elimination: when true, every run of newlines becomes one space.
+
+        Returns:
+            The cleaned text: surrounding whitespace stripped, characters outside
+            the allowed set removed, runs of spaces collapsed to a single space.
+        """
+        text_doc = text_doc.strip()
+
+        # Numbers are swapped for a placeholder so the filtering below cannot
+        # alter them; they are put back, in order, at the end.
+        placeholder = 'floatingpointnumber'
+        number_pattern = r"[-+]?\d*\.\d+|\d+"
+        nums_list = re.findall(number_pattern, text_doc)
+        newstring = re.sub(number_pattern, placeholder, text_doc)
+
+        if new_line_elimination:
+            newstring = re.sub(r'[\n]+', " ", newstring)
+
+        punctuations = r")(}{:؟!-،؛»«.@$&%" + r"/<>?.,:;"
+        latinLettersDigits = r"a-zA-Z0-9"
+        # NOTE(review): inside the character class the "!-،" part of
+        # `punctuations` is read as a range, not three literals, which keeps
+        # far more characters than the list suggests - confirm this is intended.
+        pattern = r'[^' + punctuations + latinLettersDigits + 'آ-ی' + '‌' + r'\d\s:]'
+        newstring = re.sub(pattern, self.eliminate_pattern, newstring)
+
+        newstring = re.sub(r'[ ]+', ' ', newstring)
+
+        for number in nums_list:
+            newstring = newstring.replace(placeholder, number, 1)
+
+        return newstring
+
+    def add_space(self, mystring):
+        """re.sub callback: return the matched text with exactly one space on each side."""
+        matched = mystring.group()
+        return " " + matched.strip(' ') + " "
+
+    def replace_newline_with_dot(self, mystring):
+        """re.sub callback: replace any match with ' . '."""
+        return ' . '
+
+    def eliminate_pattern(self, mystring):
+        """re.sub callback: delete any match."""
+        return ""
+
+    def load_var(self, load_path):
+        """Unpickle and return the object stored at `load_path`.
+
+        NOTE: pickle can execute arbitrary code while loading; only use this on
+        files from a trusted source.
+        """
+        with open(load_path, 'rb') as file:
+            return pickle.load(file)
+
+    def save_var(self, save_path, variable):
+        """Pickle `variable` to `save_path`, printing progress messages."""
+        print("saving vars ...")
+        with open(save_path, 'wb') as file:
+            pickle.dump(variable, file)
+            print("variable saved.")
+
+    def build_stem_dictionary(self, normalizer, verb_tense_path, mokasar_noun_path,
+                              treebank_dir="resource/Persian_Dependency_Treebank/Data/2ndRep"):
+        """Collect noun and verb stems from the treebank and two word-list files.
+
+        Args:
+            normalizer: object whose `sub_alphabets(str)` is applied to every line.
+            verb_tense_path: file with two whitespace-separated verb stems per line.
+            mokasar_noun_path: tab-separated file; column 1 maps to column 2.
+            treebank_dir: directory of tab-separated treebank files (column 3 is
+                the '#'-separated stem field, column 4 the POS tag).
+
+        Returns:
+            (lexicon_stem, verb_stem, verb_tense_map, irregular_noun) where the
+            first two are sets, `verb_tense_map` is the list
+            [first-to-second stem dict, second-to-first stem dict] and
+            `irregular_noun` is a dict. Lines with too few fields are skipped.
+        """
+        lexicon_stem = set()
+        verb_stem = set()
+        verb_p2f_map = {}
+        verb_f2p_map = {}
+
+        def add_verb_pair(first, second):
+            # Record the pair in both directions and remember both stems.
+            verb_p2f_map[first] = second
+            verb_f2p_map[second] = first
+            verb_stem.add(first)
+            verb_stem.add(second)
+
+        for fileName in os.listdir(treebank_dir):
+            file_path = treebank_dir + "/" + fileName
+            with open(file_path, "r", encoding="utf-8") as treebank_file:
+                for line in treebank_file:
+                    fields = normalizer.sub_alphabets(line).split("\t")
+                    # The POS tag is read from fields[3], so four fields are required.
+                    if len(fields) <= 3:
+                        continue
+                    stem_word = [x.strip('\u200c') for x in fields[2].split("#")]
+                    if fields[3] == 'V':
+                        if len(stem_word) == 2 and all(stem_word):
+                            add_verb_pair(stem_word[0], stem_word[1])
+                        elif len(stem_word) == 3 and all(stem_word):
+                            # With three parts the first one is not stored as a stem.
+                            add_verb_pair(stem_word[1], stem_word[2])
+                    else:
+                        lexicon_stem.update(t for t in stem_word if len(t) > 1)
+
+        with open(verb_tense_path, "r", encoding="utf-8") as bon_file:
+            for line in bon_file:
+                parts = normalizer.sub_alphabets(line.strip()).split()
+                parts = [x.strip('\u200c') for x in parts]
+                if len(parts) < 2:
+                    continue
+                add_verb_pair(parts[0], parts[1])
+
+        irregular_noun = {}
+        with open(mokasar_noun_path, "r", encoding="utf-8") as noun_file:
+            for line in noun_file:
+                line = normalizer.sub_alphabets(line).replace("\t\t", "\t")
+                parts = [x.strip('\u200c') for x in line.strip().split("\t")]
+                if len(parts) < 2:
+                    continue
+                irregular_noun[parts[0]] = parts[1]
+                lexicon_stem.add(parts[0])
+
+        verb_tense_map = [verb_p2f_map, verb_f2p_map]
+        return lexicon_stem, verb_stem, verb_tense_map, irregular_noun
--- a/do_extractor.py
+++ b/do_extractor.py
@ -0,0 +1,200 @@
+"""
+This file works with the Parsivar normalizer.
+"""
+from datetime import datetime
+# from elasticsearch import Elasticsearch
+from threading import Thread
+import os
+import os.path
+
+# Point the Hugging Face cache at the shared directory; this has to happen
+# before `transformers` is imported below.
+os.environ['HF_HOME'] = "/home/admin/HFHOME"
+
+import torch
+# from general_functions import normalize_content
+from funcs import write_to_json, read_from_json
+from normalizer import Normalizer
+from tokenizer import *
+
+# Kept after the wildcard import above (as in the original layout) so that a
+# name exported by `tokenizer` cannot shadow any of these.
+import json
+import time
+from tqdm import tqdm
+from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+_normalizer = Normalizer(date_normalizing_needed=True)
+address = os.getcwd()
+
+# Optional id filter for the main loop; it is empty and the check that used it
+# is commented out there.
+# not_have_two_token_kw = read_from_json('../data/not_have_two_token_kw.json')
+# not_have_two_token_kw_list = [item3["id"] for item3 in not_have_two_token_kw]
+not_have_two_token_kw_list = []
+
+#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+#model_id = "PartAI/Dorna2-Llama3.1-8B-Instruct"
+model_id = "meta-llama/Llama-3.1-70B-Instruct"
+
+# Quantize the weights to lower GPU memory usage.
+# 4-bit alternative:
+# bnb_config = BitsAndBytesConfig(
+#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
+# )
+# 8-bit: double quantization, quant type and compute dtype are 4-bit-only
+# options (`bnb_4bit_*`); there are no `bnb_8bit_*` counterparts, so only the
+# switch itself is passed.
+bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+
+print("Model Loading START:")
+print(str(datetime.now()))
+print()
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=bnb_config
+)
+# Generation stops on either the tokenizer's EOS token or the "<|eot_id|>" token.
+terminators = [
+    tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("<|eot_id|>")
+]
+# Reuse the EOS token id as the padding id during generation.
+model.generation_config.pad_token_id = tokenizer.eos_token_id
+print("Model Loading END:")
+print(str(datetime.now()))
+print()
+
+# Input sections; other data files used with this script:
+#   '../data/clean_sections_11k.json', '../data/simplized_sentences_110_2.json',
+#   './data/main_sections_170k_metadata.json'
+sections_list = read_from_json('./data/sections_110.json') # Main File
+
+# Writing results to Elasticsearch is disabled. If it is re-enabled, read the
+# credentials from the environment instead of hard-coding them in this file.
+# index_name_i = 'semantic_search-v10'
+# es = Elasticsearch(
+#     "http://127.0.0.1:6900",
+#     basic_auth=(os.environ["ES_USER"], os.environ["ES_PASSWORD"])
+# )
+
+counter = 0
+total = 0
+remained = 0
+id = ''
+keywords_count = 15
+   
+def generateKeywords(text):
+    """Ask the model for a numbered list of Persian key phrases found in `text`.
+
+    Args:
+        text: the section text to extract key phrases from.
+
+    Returns:
+        The decoded model answer, or None when any step raises; the error is
+        printed so one failing section does not stop a long batch run.
+    """
+    try:
+        # Request about 15 key phrases per 1000 characters, and at least one.
+        keywords_count = max(1, int((len(text) / 1000) * 15))
+
+        messages =  [{"role": "system", "content": "You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text." },
+                     {"role": "user", "content": 
+                    '''Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
+Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".'''
+                    .format(keywords_count)
+        },
+                     {"role": "user", "content": 
+                    '''"متن": {}'''.format(text)
+        },]
+
+        input_ids = tokenizer.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            return_tensors="pt"
+        ).to(model.device)
+
+        # Pad with the EOS id for this call only, matching the module-level
+        # generation config, instead of overwriting that shared config here.
+        # NOTE(review): tokenizer.pad_token_id, used previously, is presumably
+        # unset for this checkpoint - confirm.
+        outputs = model.generate(
+            input_ids,
+            max_new_tokens=256,
+            eos_token_id=terminators,
+            pad_token_id=tokenizer.eos_token_id,
+            do_sample=True,
+            temperature=0.6,
+            top_p=0.85,
+        )
+        # The output sequence starts with the prompt; decode only what follows it.
+        response = outputs[0][input_ids.shape[-1]:]
+        return tokenizer.decode(response, skip_special_tokens=True)
+
+    except Exception as inst:
+        print(type(inst))    # the exception type
+        print(inst.args)     # arguments stored in .args
+        print("Exception: " + str(inst))
+        return None
+
+
+
+if __name__ == "__main__":
+    # Extract key phrases for every section and write them out in batches of 500.
+    start_time = time.time()
+    print("start_time: "+str(datetime.now()))
+
+    try:
+        keywords_dict = []
+        total_sections = len(sections_list)
+
+        count = 1
+        for content_item in sections_list:
+            section_id = content_item['id']
+            # if not section_id in not_have_two_token_kw_list:
+            #     continue
+            content = content_item['content']
+            # Skip sections that are too long (more than 2000 whitespace-separated words).
+            if len(content.split()) > 2000:
+                print("too long content " + str(section_id))
+                continue
+            content = _normalizer.sub_alphabets(content)
+            keywords = generateKeywords(content)
+            # Progress is reported against the number of input sections.
+            print("section " + str(count) + "/" + str(total_sections) + " keyword extracting ... ")
+            keywords_dict.append({
+                    'id':section_id,
+                    'keywords':keywords
+                })
+            if count % 500 == 0:
+                write_to_json(keywords_dict, f"./data/sections_kw_llama_8b_main_{count}.json")
+                keywords_dict = []
+            count+=1
+
+        # Write the last partial batch; nothing is left when the run ended
+        # exactly on a batch boundary.
+        if keywords_dict:
+            write_to_json(keywords_dict, f"./data/sections_kw_llama_8b_main_{count}.json")
+
+    except Exception as inst:
+            print(type(inst))    # the exception type
+            print(inst.args)     # arguments stored in .args
+
+    end_time = time.time()
+    print("end_time: "+ str(datetime.now()))
+    operation_time = (int(end_time-start_time)/60)/60
+    print(f"elapsed time: {operation_time} hours")
+    print("Finished!!!")
--- a/funcs.py
+++ b/funcs.py
@ -0,0 +1,119 @@
+import re
+import os
+import json
+# from pandas import read_excel, DataFrame
+def remove_signs():
+    """Return the text from read_file() with every '(' and ')' removed."""
+    text = read_file()
+    parentheses = r"\(|\)"
+    return re.sub(parentheses, '', text)
+
+def read_file():
+    """Return the UTF-8 text of './data/DATASET_2.txt'.
+
+    Returns:
+        The file content, or '' when the content cannot be read or decoded.
+        A missing file still raises from open().
+    """
+    with open('./data/DATASET_2.txt', 'r', encoding='utf-8') as file:
+        try:
+            return file.read()
+        except (OSError, UnicodeError):
+            # Best effort: unreadable content is reported as empty text.
+            return ''
+
+def read_file_by_address(file_address):
+    """Return the UTF-8 text of the file at `file_address`.
+
+    Returns:
+        The file content, or '' when the content cannot be read or decoded.
+        A missing file still raises from open().
+    """
+    with open(file_address, 'r', encoding='utf-8') as file:
+        try:
+            return file.read()
+        except (OSError, UnicodeError):
+            # Best effort: unreadable content is reported as empty text.
+            return ''
+
+def save_to_file(result):
+    """Append `result` to './data/DATASET_3.txt' (UTF-8), creating the file if needed."""
+    # Append mode already writes at the end of the file, and the `with` block
+    # closes it, so no prior read or explicit close is needed.
+    with open('./data/DATASET_3.txt', 'a', encoding='utf-8') as file:
+        file.write(result)
+
+def save_to_file_by_address(file_address, text):
+    """Append `text` to the file at `file_address` (UTF-8), creating it if needed."""
+    # Append mode already writes at the end of the file, and the `with` block
+    # closes it, so no prior read or explicit close is needed.
+    with open(file_address, 'a', encoding='utf-8') as file:
+        file.write(text)
+        
+
+def read_from_excel(file_address, column_name):
+    """Return the column `column_name` of the Excel file at `file_address`."""
+    # Imported here because the module-level pandas import is commented out;
+    # this keeps the rest of the module usable without pandas.
+    from pandas import read_excel
+
+    # Read the Excel file.
+    data = read_excel(file_address)
+
+    # Extract the content of the requested column.
+    column_data = data[column_name]
+    return column_data
+
+def add_columndata_to_excel(file_address, column_name, columndata):
+    """Add (or replace) a column in an Excel file and save the file in place.
+
+    Args:
+        file_address: path of the Excel file to read and overwrite.
+        column_name: name of the column to set.
+        columndata: values assigned to that column.
+    """
+    # Imported here because the module-level pandas import is commented out;
+    # this keeps the rest of the module usable without pandas.
+    from pandas import read_excel
+
+    # Read the Excel file.
+    data = read_excel(file_address)
+
+    # Add the new column to the data.
+    data[column_name] = columndata
+
+    # Save the data back to the same Excel file.
+    data.to_excel(file_address, index=False)
+
+def write_to_excel(data_dict, file_name_and_address):
+    """Write `data_dict` to an Excel file via a DataFrame and return True."""
+    # Imported here because the module-level pandas import is commented out;
+    # this keeps the rest of the module usable without pandas.
+    from pandas import DataFrame
+
+    df = DataFrame(data_dict)
+
+    # Save the DataFrame as an Excel file, without the index column.
+    df.to_excel(file_name_and_address, index=False)
+    return True
+
+def write_to_json(dict, file_address):
+    """Serialize `dict` as indented, non-ASCII-escaped JSON into `file_address`.
+
+    Returns:
+        True once the file has been written.
+    """
+    # Serialize first, so the target file is only opened after that succeeded.
+    payload = json.dumps(dict, ensure_ascii=False, indent=2)
+
+    with open(file_address, 'w+', encoding='utf-8') as out_file:
+        out_file.write(payload)
+
+    return True
+
+def read_from_json(file_address):
+    """Load a UTF-8 JSON file and return its top-level items as a list.
+
+    Iterating the loaded value means a JSON array yields its elements and a
+    JSON object yields its keys.
+    """
+    with open(file_address, 'r', encoding='utf-8') as json_file:
+        loaded_data = json.load(json_file)
+
+    return list(loaded_data)
+
+
+def separated_date_format_finder(date_ner):
+    """Tell whether `date_ner` contains a date-like digit group.
+
+    The characters '.', '،' and 'ر' are treated as date separators and turned
+    into '/' before the pattern is searched.
+
+    Returns:
+        True when the date pattern matches anywhere in the text, else False.
+    """
+    for separator in ('.', '،', 'ر'):
+        date_ner = date_ner.replace(separator, '/')
+    date_pattern = r'\b(?:(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9])|(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9]|[0-9]{2}))\b'
+    return re.compile(date_pattern).search(date_ner) is not None
+
+if __name__ ==  "__main__":
+        # Running this helper module directly does nothing.
+
+        pass
--- a/general_functions.py
+++ b/general_functions.py
@ -0,0 +1,800 @@
+from normalizer import Normalizer
+from tokenizer import *
+# import jalali
+import re
+from re import sub
+import textwrap
+from html import escape
+from bs4 import BeautifulSoup
+
+# from lxml import etree
+import datetime
+# Module-level Normalizer instance, created with date normalizing switched on;
+# the normalizer* functions below call its normalize() method.
+_normalizer = Normalizer(date_normalizing_needed=True)
+
+# Regex alternations of letter variants and the single letter paired with each.
+# NOTE(review): judging by the names these are Arabic forms of "ye"/"ke" and
+# their Persian replacements; no use of them is visible in this part of the file.
+yeAr = r"ﻱ|ې|ێ|ے|ى|ي|ئ"
+yeFr= r"ی"
+keAr = r"ڭ|ﻚ|ﮎ|ﻜ|ﮏ|ګ|ﻛ|ﮑ|ﮐ|ڪ|ك"
+keFr = r"ک"
+# Tags that must survive HTML stripping are rewritten as "#[#...#]#" first;
+# mark1/mark2 are regexes for those markers and hTag1/hTag2 the characters
+# they are turned back into afterwards.
+mark1 = r'#\[#'
+mark2 = r'#\]#'
+hTag1 = r'<'
+hTag2 = r'>'
+tableTag=["table","tr", "th", "td", "TABLE", "TR", "TH", "TD"]
+# One capture group per tag name, joined into an alternation.
+strTable = '|'.join('(' + tag + ')' for tag in tableTag)
+# The attribute part is a single "[^>]*" run: a quantified group around "[^>]+"
+# backtracks exponentially when a tag is never closed with ">".
+regTable = r'<(?P<slash>\/)*(?P<tag>'+strTable+r')(?P<class>[^>]*)>'
+regTableReplace = r'#[#\g<slash>\g<tag>\g<class>#]#'
+
+
+def isNeedHtml(html):
+    """Return True when `html` contains an opening table tag or a closing row tag.
+
+    The check is a plain substring test for "<TABLE", "<table", "</TR" or "</tr".
+    """
+    table_markers = ("<TABLE", "<table", "</TR", "</tr")
+    return any(marker in html for marker in table_markers)
+
+
+def removeHtmlTags(html, exeptionTag=None):
+    """Strip HTML tags from `html`, optionally keeping some tags in the output.
+
+    Args:
+        html: the HTML text.
+        exeptionTag: tag names (inserted into a regex as-is) to preserve;
+            None or an empty sequence strips every tag.
+
+    Returns:
+        The text content, pieces separated by newlines, with the preserved
+        tags written back as "<...>".
+    """
+    keep_tags = list(exeptionTag) if exeptionTag else []
+
+    if keep_tags:
+        # Hide the tags to keep behind "#[#...#]#" markers so the HTML parser
+        # treats them as text.
+        exceptTags = '|'.join('(' + tag + ')' for tag in keep_tags)
+        # A single "[^>]*" run for the attributes avoids the exponential
+        # backtracking a quantified "[^>]+" group shows on unclosed tags.
+        reg1 = r'<(?P<slash>/)*(?P<tag>' + exceptTags + r')(?P<class>[^>]*)>'
+        html = sub(reg1, regTableReplace, html)
+
+    soup = BeautifulSoup(html, "html.parser")
+    text = soup.get_text("\n", strip=True)
+
+    if keep_tags:
+        # Turn the markers back into angle brackets.
+        text = sub(mark1, hTag1, text)
+        text = sub(mark2, hTag2, text)
+
+    return text
+
+
+def removeHtmlNoTableTag(html):
+    """Strip all HTML tags from `html` except the ones matched by regTable.
+
+    NOTE: the original author recorded that this function has a bug and hangs
+    in test2.py; the regTable substitution is the first place to look.
+    """
+    # Hide the table tags behind "#[#...#]#" markers so the parser keeps them as text.
+    marked_html = sub(regTable, regTableReplace, html)
+    plain_text = BeautifulSoup(marked_html, "html.parser").get_text("\n", strip=True)
+
+    # Turn the markers back into angle brackets.
+    with_open_brackets = sub(mark1, hTag1, plain_text)
+    return sub(mark2, hTag2, with_open_brackets)
+
+def normalizerData(data):
+    """Normalize `data` and list the dates the normalizer found in it.
+
+    Each date string not starting with "num" is passed to jdate2timestamp
+    (not defined in this part of the file). When that fails, the string is
+    re-read as "y<year>m<month>d<day>"; years outside 1201-1549 are dropped
+    and an out-of-range month or day is replaced by 1.
+
+    Returns:
+        (normalTitle, tdates): the normalized text and a list of dicts with
+        the keys "date", "timestamp", "index" (always 0) and "slice" (always "").
+
+    NOTE(review): normalizerDate2 unpacks the same normalize(...,
+    return_dates=True) call into four values, this function into two.
+    """
+    normalTitle, dates = _normalizer.normalize(data, return_dates=True)
+    tdates = []
+    for d in dates:
+        if d.startswith("num"):
+            continue
+        try:
+            tsd = jdate2timestamp(d)
+            cdate = d
+        except Exception:
+            # Fallback: rebuild a "year/month/day" string from the y/m/d form.
+            try:
+                d = d.replace("y", "").replace("m", "/").replace("d", "/")
+                m = re.match(r"^(\d{4})\D(\d{1,2})\D(\d{1,2})$", d)
+                if not m:
+                    continue
+                year, month, day = (int(part) for part in m.groups())
+                if not (year > 1200 and year < 1550):
+                    continue
+                if month < 1 or month > 12:
+                    month = 1
+                if day < 1 or day > 31:
+                    day = 1
+                cdate = str(year) + "/" + str(month) + "/" + str(day)
+                tsd = jdate2timestamp(cdate)
+            except Exception:
+                # Dates that still cannot be converted are skipped.
+                continue
+        tdates.append({"date": cdate, "timestamp": tsd, "index": 0, "slice": ""})
+
+    return normalTitle,tdates
+
+def normalizerDate2(inputString):
+    """Normalize `inputString` and describe each recognized date with its token span.
+
+    For every recognized date not starting with "num", the date string is
+    checked with jdate2timestamp (not defined in this part of the file). When
+    that fails it is re-read as "y<year>m<month>d<day>": years outside
+    1201-1549 are dropped, an out-of-range month or day becomes 1, and a
+    string that does not fit the form keeps its original value.
+
+    Returns:
+        (normalizedString, tdates, recognized_numbers). Each `tdates` entry
+        has the keys "converted_date", "date", "original_date" (the input
+        tokens from the start to the end index, space-joined),
+        "date_token_index", "start_date_token_index" and "end_date_token_index".
+
+    NOTE(review): normalizerData unpacks the same normalize(...,
+    return_dates=True) call into two values, this function into four.
+    """
+    normalizedString, dates, recognized_dates, recognized_numbers = _normalizer.normalize(inputString, return_dates=True)
+    tdates = []
+    # The input is tokenized at most once, when the first date needs it.
+    inputString_token = None
+    for date_item in recognized_dates:
+        date_part              = date_item['date']
+        date_token_index       = date_item['date_token_index']
+        start_date_token_index = date_item['start_date_token_index']
+        end_date_token_index   = date_item['end_date_token_index']
+        if date_part.startswith("num"):
+            continue
+        try:
+            cdate = date_part
+            jdate2timestamp(date_part)
+        except Exception:
+            # Fallback: rebuild a "year/month/day" string from the y/m/d form.
+            try:
+                date_part = date_part.replace("y", "").replace("m", "/").replace("d", "/")
+                m = re.match(r"^(\d{4})\D(\d{1,2})\D(\d{1,2})$", date_part)
+                if m:
+                    year, month, day = (int(part) for part in m.groups())
+                    if not (year > 1200 and year < 1550):
+                        continue
+                    if month < 1 or month > 12:
+                        month = 1
+                    if day < 1 or day > 31:
+                        day = 1
+                    cdate = str(year) + "/" + str(month) + "/" + str(day)
+                    jdate2timestamp(cdate)
+            except Exception:
+                # Dates that still cannot be converted are skipped.
+                continue
+        if inputString_token is None:
+            import tokenizer as t
+            inputString_token = t.Tokenizer.tokenize_words(None,inputString)
+        original_date_part = inputString_token[start_date_token_index:end_date_token_index + 1]
+        original_date = ' '.join(original_date_part).strip()
+        tdates.append({"converted_date": date_item['date'],
+                       "date": cdate ,
+                       "original_date" : original_date,
+                       "date_token_index": date_token_index,
+                       "start_date_token_index": start_date_token_index,
+                       "end_date_token_index":end_date_token_index})
+    return normalizedString,tdates,recognized_numbers
+
+def OtherDateFormatNormalizer(inputString,pattern):
+    """Fold a year written out as separate numbers into a y..m..d.. date token.
+
+    Each occurrence of the selected pattern (see the examples beside the
+    patterns below) is replaced by its first y..m..d.. token with "y0" changed
+    to "y<year>", followed by one space. <year> is the sum of the remaining
+    numbers and of the year part of any further y..m..d.. token.
+
+    Args:
+        inputString: text that already contains y..m..d.. date tokens.
+        pattern: 1, 2, 3 or 4, selecting which written form to look for.
+
+    Returns:
+        The text with every occurrence replaced; unchanged when nothing matches.
+
+    Raises:
+        ValueError: if `pattern` is not one of 1, 2, 3, 4.
+    """
+    patterns = {
+        # y0m4d4 ماه 1000 و 300 و 50 و 4
+        1: r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\s(\d{1,4})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s",
+        # y0m4d4 ماه سال 1000 و 300 و 50 و 4
+        2: r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\s(\d{1,4})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s",
+        # y0m4d4 ماه سال y1000m0d0 و 300 و 50 و 4
+        3: r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\sy(\d{1,4})m(\d{1,2})d(\d{1,2})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s",
+        # y0m3d1 ماه سال y1353m0d0
+        4: r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\sy(\d{1,4})m(\d{1,2})d(\d{1,2})",
+    }
+    if pattern not in patterns:
+        raise ValueError("pattern must be 1, 2, 3 or 4, got " + repr(pattern))
+    regex = re.compile(patterns[pattern])
+    standardDatePattern = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})"
+
+    mainTextTemp = inputString
+    for match in regex.finditer(inputString):
+        foundedPatternTemp = match.group()
+        foundedPattern = foundedPatternTemp
+        if pattern != 1:
+            foundedPattern = foundedPattern.replace('سال','')
+        foundedPattern = foundedPattern.replace('ماه','')
+        tempString = foundedPattern.strip()
+        # The iterator walks the string as it is here; tempString is then
+        # shortened token by token inside the loop.
+        for dateToken in re.finditer(standardDatePattern,tempString):
+            tempPattern = dateToken.group()
+            tempString = tempString.replace(tempPattern,'').strip()
+            tempString = tempString.replace('و','').strip()
+            year = 0
+            for piece in tempString.split():
+                embeddedDate = re.search(standardDatePattern,piece)
+                if embeddedDate:
+                    # Characters 1-4 of a y..m..d.. token are read as its year.
+                    year += int(embeddedDate.group()[1:5])
+                elif piece.isalnum():
+                    year += int(piece)
+            tempPattern = tempPattern.replace('y0','y'+str(year))
+            mainTextTemp = mainTextTemp.replace(foundedPatternTemp,tempPattern+' ')
+    return mainTextTemp
+
+            #foundedPattern = jdate2timestamp(foundedPattern)
+            #convertedText = regex.sub(foundedPattern,convertedText)
+
+def normalizerLongData(data):
+    dates = []
+    if len(data) > 10000:
+        textParts = textwrap.wrap(data, 10000, break_long_words=False)
+        for part in textParts:
+            dates.extend(normalizerData(part))
+    else:
+        dates = normalizerData(data)
+    return dates
+
+# ##################
+# در ویندوز برای اعداد منفی که تاریخهای قبلی بود را خطا می داد
+#  rr = gdt.timestamp()
+# #################
+def jdate2timestamp_old(dt):
+    ndt = dt.replace("y", "")
+    ndt = ndt.replace("m", "/")
+    ndt = ndt.replace("d", "/")
+    gd = jalali.Persian(ndt).gregorian_datetime()
+    # print(gd)
+    ztime = datetime.time(0, 0, 0, 0)
+    gdt = datetime.datetime.combine(gd, ztime)
+    # print(gdt)
+    rr = gdt.timestamp()
+    tst = int( round(rr) * 1000)
+    return tst
+
+def jdate2timestamp(dt):
+    ndt = dt.replace("y", "")
+    ndt = ndt.replace("m", "/")
+    ndt = ndt.replace("d", "/")
+    gd = jalali.Persian(ndt).gregorian_datetime()   
+    base = datetime.date(1970, 1, 1)
+    rr = (gd-base).total_seconds()  
+    tst = int( round(rr) * 1000)
+    return tst
+
+
+
+def getSortTimestamp(ts_date):
+    empty_date = -15000000000
+    ts_ts = empty_date
+    try:
+        if ts_date != "":
+            ts_ts = jdate2timestamp(ts_date)
+    except:
+        ts_ts = empty_date
+
+    return ts_ts
+
+def normalize_content(content):
+    text = _normalizer.sub_alphabets(content)
+    return text
+
+def normalYehKe(text):
+    if(text == None) :
+       return ''
+
+    c1 = sub(yeAr, yeFr, text)
+    c2 = sub(keAr, keFr, c1)
+    c2 = c2.replace('\u00A0', '')    
+    return c2.strip()
+
+_term_list = []
+def setTermList():
+  global _term_list
+  if(_term_list.__len__() > 0):
+    return
+  _term_list = [
+      {
+          "begin": jdate2timestamp("1285/07/14"),
+          "end": jdate2timestamp("1287/04/2"),
+          "term": "مجلس شورای ملی-دوره1",
+          "term_number": 1,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1288/8/24"),
+          "end": jdate2timestamp("1290/10/3"),
+          "term": "مجلس شورای ملی-دوره2",
+          "term_number": 2,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1293/9/14"),
+          "end": jdate2timestamp("1294/8/21"),
+          "term": "مجلس شورای ملی-دوره3",
+          "term_number": 3,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1300/4/1"),
+          "end": jdate2timestamp("1302/3/30"),
+          "term": "مجلس شورای ملی-دوره4",
+          "term_number": 4,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1302/11/22"),
+          "end": jdate2timestamp("1304/11/22"),
+          "term": "مجلس شورای ملی-دوره5",
+          "term_number": 5,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1305/4/19"),
+          "end": jdate2timestamp("1307/5/22"),
+          "term": "مجلس شورای ملی-دوره6",
+          "term_number": 6,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1307/7/19"),
+          "end": jdate2timestamp("1309/8/14"),
+          "term": "مجلس شورای ملی-دوره7",
+          "term_number": 7,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1309/9/24"),
+          "end": jdate2timestamp("1311/10/24"),
+          "term": "مجلس شورای ملی-دوره8",
+          "term_number": 8,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1311/12/24"),
+          "end": jdate2timestamp("1314/1/24"),
+          "term": "مجلس شورای ملی-دوره9",
+          "term_number": 9,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1314/3/15"),
+          "end": jdate2timestamp("1316/3/22"),
+          "term": "مجلس شورای ملی-دوره10",
+          "term_number": 10,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1316/6/20"),
+          "end": jdate2timestamp("1318/6/27"),
+          "term": "مجلس شورای ملی-دوره11",
+          "term_number": 11,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1318/8/3"),
+          "end": jdate2timestamp("1320/8/9"),
+          "term": "مجلس شورای ملی-دوره12",
+          "term_number": 12,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1320/8/22"),
+          "end": jdate2timestamp("1322/9/1"),
+          "term": "مجلس شورای ملی-دوره13",
+          "term_number": 13,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1322/12/16"),
+          "end": jdate2timestamp("1324/12/21"),
+          "term": "مجلس شورای ملی-دوره14",
+          "term_number": 14,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1326/4/25"),
+          "end": jdate2timestamp("1328/5/6"),
+          "term": "مجلس شورای ملی-دوره15",
+          "term_number": 15,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1328/11/20"),
+          "end": jdate2timestamp("1330/11/29"),
+          "term": "مجلس شورای ملی-دوره16",
+          "term_number": 16,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1331/2/7"),
+          "end": jdate2timestamp("1332/8/28"),
+          "term": "مجلس شورای ملی-دوره17",
+          "term_number": 17,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1332/12/27"),
+          "end": jdate2timestamp("1335/1/26"),
+          "term": "مجلس شورای ملی-دوره18",
+          "term_number": 18,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1335/3/10"),
+          "end": jdate2timestamp("1339/3/29"),
+          "term": "مجلس شورای ملی-دوره19",
+          "term_number": 19,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1339/12/2"),
+          "end": jdate2timestamp("1340/2/19"),
+          "term": "مجلس شورای ملی-دوره20",
+          "term_number": 20,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1342/7/14"),
+          "end": jdate2timestamp("1346/7/13"),
+          "term": "مجلس شورای ملی-دوره21",
+          "term_number": 21,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1346/7/14"),
+          "end": jdate2timestamp("1350/6/9"),
+          "term": "مجلس شورای ملی-دوره22",
+          "term_number": 22,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1350/6/9"),
+          "end": jdate2timestamp("1354/6/16"),
+          "term": "مجلس شورای ملی-دوره23",
+          "term_number": 23,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1354/6/17"),
+          "end": jdate2timestamp("1357/11/20"),
+          "term": "مجلس شورای ملی-دوره24",
+          "term_number": 24,
+          "majles_name": "شورای ملی",
+      },
+      {
+          "begin": jdate2timestamp("1359/3/7"),
+          "end": jdate2timestamp("1363/3/6"),
+          "term": "مجلس شورای اسلامی-دوره1",
+          "term_number": 1,
+          "majles_name": "شورای اسلامی",
+      },
+      {
+          "begin": jdate2timestamp("1363/3/7"),
+          "end": jdate2timestamp("1367/3/6"),
+          "term": "مجلس شورای اسلامی-دوره2",
+          "term_number": 2,
+          "majles_name": "شورای اسلامی",
+      },
+      {
+          "begin": jdate2timestamp("1367/3/7"),
+          "end": jdate2timestamp("1371/3/6"),
+          "term": "مجلس شورای اسلامی-دوره3",
+          "term_number": 3,
+          "majles_name": "شورای اسلامی",
+      },
+      {
+          "begin": jdate2timestamp("1371/3/7"),
+          "end": jdate2timestamp("1375/3/11"),
+          "term": "مجلس شورای اسلامی-دوره4",
+          "term_number": 4,
+          "majles_name": "شورای اسلامی",
+      },
+      {
+          "begin": jdate2timestamp("1375/3/12"),
+          "end": jdate2timestamp("1379/3/6"),
+          "term": "مجلس شورای اسلامی-دوره5",
+          "term_number": 5,
+          "majles_name": "شورای اسلامی",
+      },
+      {
+          "begin": jdate2timestamp("1379/3/7"),
+          "end": jdate2timestamp("1383/3/6"),
+          "term": "مجلس شورای اسلامی-دوره6",
+          "term_number": 6,
+          "majles_name": "شورای اسلامی",
+      },
+      {
+          "begin": jdate2timestamp("1383/3/7"),
+          "end": jdate2timestamp("1387/3/6"),
+          "term": "مجلس شورای اسلامی-دوره7",
+          "term_number": 7,
+          "majles_name": "شورای اسلامی",
+      },
+      {
+          "begin": jdate2timestamp("1387/3/7"),
+          "end": jdate2timestamp("1391/3/6"),
+          "term": "مجلس شورای اسلامی-دوره8",
+          "term_number": 8,
+          "majles_name": "شورای اسلامی",
+      },
+      {
+          "begin": jdate2timestamp("1391/3/7"),
+          "end": jdate2timestamp("1395/3/7"),
+          "term": "مجلس شورای اسلامی-دوره9",
+          "term_number": 9,
+          "majles_name": "شورای اسلامی",
+      },
+      {
+          "begin": jdate2timestamp("1395/3/8"),
+          "end": jdate2timestamp("1399/3/6"),
+          "term": "مجلس شورای اسلامی-دوره10",
+          "term_number": 10,
+          "majles_name": "شورای اسلامی",
+      },
+      {
+          "begin": jdate2timestamp("1399/3/7"),
+          "end": jdate2timestamp("1403/3/6"),
+          "term": "مجلس شورای اسلامی-دوره11",
+          "term_number": 11,
+          "majles_name": "شورای اسلامی",
+      },
+  ]
+
+
+def getTermQanon(ts_date_timestamp, ts_ref):
+    setTermList()
+    global _term_list
+    term = ""
+    term_number = 0
+    majles_name = ""
+
+    if ts_ref == "هيات وزيران (دوره فترت)":
+        term = ts_ref
+    if ts_ref == "نخست وزير (مصدق)":
+        term = ts_ref
+    if ts_ref == "وزير عدليه (داور)":
+        term = ts_ref
+    if ts_ref == "شوراي انقلاب جمهوري اسلامي ايران":
+        term = ts_ref
+
+    majles_name = term
+    if term == "":
+        for i in range(len(_term_list) - 1, -1, -1):
+            begin = _term_list[i]["begin"]
+            end = _term_list[i]["end"]
+            if ts_date_timestamp >= begin and ts_date_timestamp <= end:
+                term = _term_list[i]["term"]
+                term_number = _term_list[i]["term_number"]
+                majles_name = _term_list[i]["majles_name"]
+                break
+
+    error = ""
+    if term == "":
+        # if ts_date_timestamp >= _term_list[0]["begin"] and  ts_date_timestamp <= _term_list[len(_term_list)-1]["end"] :
+        if ts_date_timestamp <= _term_list[len(_term_list) - 1]["end"]:
+            for i in range(0, len(_term_list) - 1, 1):
+                end = _term_list[i]["end"]
+                if ts_date_timestamp <= end:
+                    term = _term_list[i]["term"]
+                    term_number = _term_list[i]["term_number"]
+                    majles_name = _term_list[i]["majles_name"]
+                    error = "تاریخ بین دو دوره"
+                    break
+        else:
+            term_number = -1
+            error = "تاریخ خارج از محدوده"
+
+    return term, term_number, majles_name, error
+
+# این متد یک متن و ایندکس آغاز و پایان یک عبارت درون آن متن را دریافت می کند
+# و شماره توکن آغازین و توکن پایانی مربوط به عبارت در متن را بر می گرداند
+def token_state_finder(normalized_section_content, start_index, end_index):
+    before_substring = normalized_section_content[0:start_index-1].strip()
+    pattern_substring = normalized_section_content[start_index-1:end_index+1].strip()
+    before_substring_token_list = before_substring.strip().split()
+    pattern_token_list = pattern_substring.strip().split()
+    start_token_state = len(before_substring_token_list)
+    end_token_state = len(before_substring_token_list) + (len(pattern_token_list)-1)
+    pattern_tokens_state ={
+        "start_token_state": start_token_state,
+        "end_token_state"  : end_token_state
+          }
+    return pattern_tokens_state
+
+def find_number_indexes_in_string(normalized_string,recognized_numbers):
+    complete_recognized_numbers = []
+    for item in recognized_numbers:
+        number_start_index, number_end_index = find_token_indexes_in_string(normalized_string,item['start_token_index'],item['end_token_index'])
+        content = normalized_string.split()
+        # if item['start_token_index']==item['end_token_index']:
+        #     # حذف این بخش و باقی گذاشتن دستور ذیل الز زیر در کفایت درست کار کردن متد بررسی شود
+            
+        #     number_token_list = content[item['start_token_index']]
+        # else:
+        number_token_list = content[item['start_token_index']:item['end_token_index']+1]
+        complete_recognized_numbers.append(
+            {
+            'number_value'      : item['number_value'],
+            'number_token_list' : number_token_list,
+            'start_token_index' : item['start_token_index'],
+            'end_token_index'   : item['end_token_index'],
+            "start_number_state": number_start_index,
+            "end_number_state"  : number_end_index
+            }
+        )
+    return complete_recognized_numbers
+
+# این متد متن اصلی یک متن، توکن آغازین و توکن پایانی مربوط به یک عبارت را می گیرد
+# و ایندکس آغاز و ایندکس پایان متن وارد شده را بر می گرداند
+def find_token_indexes_in_string(normalized_string,start_token_state,end_token_state):
+    before_tokens = normalized_string.split()[0:start_token_state]
+    content_tokens = normalized_string.split()[start_token_state:end_token_state + 1]
+    content_start_index = 0
+    content_end_index = 0
+    # شمردن تعداد کاراکترهای هر توکن در لیست توکن قبل از عدد
+    for token in before_tokens:
+        content_start_index += len(token)
+    # اضافه کردن تعداد فاصله های خالی یا همان اسپیس به عدد ایندکس شروع عدد
+    content_start_index += len(before_tokens) + 1
+       
+    # شمردن تعداد کاراکترهای هر توکن در لیست توکن مربوط به عدد
+    for token in content_tokens:
+        content_end_index += len(token)
+    # اضافه کردن تعداد فاصله های خالی یا همان اسپیس به عدد ایندکس پایان عدد
+    content_end_index += (content_start_index - 1) + (len(content_tokens) - 1)
+
+    return content_start_index, content_end_index
+
+# این متد، متنی را دریافت می کند و الگوهای تعریف شده را در آن جستجو می کند و آرایه ای از عبارات مطابق با هر الگو،
+# شماره ایندکس شروع و پایان هر عبارت، عنوان و محتوای الگو، و شماره توکن شروع و توکن پایانی هر عبارت
+# پیدا شده را بر می گرداند
+def regex_patterns_finder(sectoin_content):
+    regex_patterns = {
+        "asle N asasi": r"اصل\s*شماره\s*(\d+)\s*قانون\s*اساسی\s*جمهوری\s*اسلامی\s*ایران", # اصل شماره فلان قانون اساسی جمهوری اسلامی ایران
+        "qanone asasi": r"(?<!^)\b\sقانون\sاساسی\sجمهوری\sاسلامی\sایران",                  # قانون اساسی جمهوری اسلامی ایران که در اول پاراگراف نباشد
+        "qanone asasi": r"(?<!^)\b\sقانون\sاساسی",                                          # قانون اساسی که در اول پاراگراف نباشد
+        "qanon * mosavvab tarikh"    : r"\bقانون[\s\w/]*مصوب\s((y\d{2,4}m\d{1,2}d\d{1,2})|\d{2,4})",  # قانون * مصوب تاریخ
+        "in qanon"    : r"این\sقانون",                                                    # این قانون
+        "qanone foq"  : r"قانون\sفوق",                                                   # قانون فوق
+        "eslahe qanon": r"قانون\sاصلاح",                                                  # اصلاح قانون
+        "tabsare foq" : r"تبصره\sفوق",                                                   # تبصره فوق
+        "made foq"    : r"ماده\sفوق",                                                     # ماده فوق
+        "made vahede" : r"ماده\sواحده",                                                  # ماده واحده
+        "made vahed"  : r"ماده\sواحد",                                                    # ماده واحد
+        "tabsare N"   : r"^\bتبصره\s*شماره\s*(\d+)\s*",                                    # تبصره شماره فلان که فقط اول پاراگراف باشد
+        "f tabsare N" : r"(?<!^)\bتبصره\sشماره\s(\d+)\s",                                  # تبصره شماره فلان که همه جا غیر از اول پاراگراف باشد
+        "tabsare N"   : r"(?<!^)\bتبصره ?\(? ?\d+? ?\)?[ :.]",                             # تبصره شماره فلان که همه جا غیر از اول پاراگراف باشد
+        "made N"      : r"(?<!^)\bماده ?\(? ?\d+? ?\)?[ :.]",                              #  *** ماده فلان که هرجای پاراگراف غیر از اول آن باشد
+        "f made N"    : r"^\bماده\s*[(]?\s*(\d+)\s*[)]?\s*"                                #  ماده فلان که فقط اول پاراگراف باشد با یا بدون ترکیب عدد با پرانتز
+        }
+          
+    matched_array = []
+    for pattern_key,pattern_value in regex_patterns.items():
+        regex = re.compile(pattern_value)
+        matches = regex.finditer(sectoin_content)
+
+        for match in matches:
+            # انجام عملیات مرتبط با هر الگو در اینجا
+            founded_item = match.group()
+            start_index = match.start() + 1
+            end_index = match.end() - 1
+            pattern_tokens_state = token_state_finder(sectoin_content, start_index, end_index)
+            matched_array.append(
+                {
+                    "founded_item"     : founded_item,
+                    "start_index"      : start_index,
+                    "end_index"        : end_index,
+                    "pattern_key"      : pattern_key,
+                    "pattern_value"    : pattern_value,
+                    "start_token_state": pattern_tokens_state["start_token_state"],
+                    "end_token_state"  : pattern_tokens_state["end_token_state"],
+                }
+            )
+            # convertedText = regex.sub(' wwwwwwwww ',convertedText)
+    # مرتب کردن آرایه بر اساس توکن شروع عبارت
+    matched_array.sort(key=lambda x: int(x['start_token_state']), reverse=False)
+    return matched_array
+
+def change_refrece_tokens(normalized_section_content, recognized_patterns_array):
+    token_list = normalized_section_content.strip().split()
+    for ref_item in recognized_patterns_array:
+        start_token_state = ref_item.get('start_token_state')
+        end_token_state = ref_item.get('end_token_state')
+        for i in range(start_token_state, end_token_state+1):
+            token_list[i] = 'eeee'
+    normalized_section_content = ''
+    for token in token_list:
+        normalized_section_content = ''.join([normalized_section_content, (' ' + token)])
+    return normalized_section_content.strip()
+
+def getMetaData(text):
+    normalized_section_content, recognized_dates, recognized_numbers = normalizerDate2(text.strip())
+    recognized_numbers         = find_number_indexes_in_string(text,recognized_numbers)
+    normalized_section_content = normalized_section_content.strip()
+    recognized_patterns_array  = regex_patterns_finder(normalized_section_content)
+    normalized_section_content  = change_refrece_tokens(normalized_section_content, recognized_patterns_array)
+    nlp_parser = []
+    date_list  = recognized_dates
+    ref_list   = recognized_patterns_array
+    for date_item in date_list:
+        nlp_parser.append({
+                "properties": {
+                                "type"       : "date",
+                                "index_start": int(date_item['start_date_token_index']),
+                                "index_end"  : int(date_item['end_date_token_index']),
+                                "text"       : date_item['original_date'],
+                                "result"     : date_item['converted_date'],
+                                #"timestamp"  : date_item['timestamp'],
+                                "ref_link"   : ''
+                                }      
+                            })
+    for ref_item in ref_list:
+        nlp_parser.append({
+                "properties": {
+                                "type"         : "reference",
+                                "index_start"  : int(ref_item['start_token_state']),
+                                "index_end"    : int(ref_item['end_token_state']),
+                                "text"         : ref_item['founded_item'],
+                                "result"       : ref_item['pattern_value'],
+                                "ref_link"     : ''
+                                }      
+                            })
+    return nlp_parser, normalized_section_content
+
+def save_error(error_text,filename):
+    with open(filename, 'a+', encoding='utf-8') as file:
+        # نوشتن خطا در فایل
+        file.write(error_text + '\n' + 50*'*' + '\n')
+
--- a/keyword_extractor.py
+++ b/keyword_extractor.py
@ -0,0 +1,106 @@
+import json
+from tqdm import tqdm
+import time
+
+import torch
+import os
+from transformers import AutoTokenizer, AutoModel
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+os.environ['HF_HOME'] = "/home/admin/HFHOME"
+
+#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+model_id = "meta-llama/Llama-3.1-70B-Instruct"
+
+# use quantization to lower GPU usage
+# 4 bit:
+# bnb_config = BitsAndBytesConfig(
+#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
+# )
+# 8 bit:
+bnb_config = BitsAndBytesConfig(
+    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=bnb_config
+)
+terminators = [
+    tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("<|eot_id|>")
+]
+model.generation_config.pad_token_id = tokenizer.eos_token_id #tokenizer.pad_token_id
+
+SYS_PROMPT = """You receive a Persian legal text and extract from it the keywords that are most important.
+And you don't need to provide explanations or additional text.
+Put each keyword on a single line."
+"""#  Explain your answer step by step.
+
+
+def format_prompt(SENTENCE):
+  PROMPT = f"Persian legal text: {SENTENCE}."
+  return PROMPT
+
+def generate(formatted_prompt):
+  formatted_prompt = formatted_prompt[:50000] # to avoid GPU OOM
+  messages = [{"role":"system","content":SYS_PROMPT},{"role":"user","content":formatted_prompt}]
+  # tell the model to generate
+  input_ids = tokenizer.apply_chat_template(
+      messages,
+      add_generation_prompt=True,
+      return_tensors="pt"
+  ).to(model.device)
+  outputs = model.generate(
+      input_ids,
+      max_new_tokens=2048,
+      eos_token_id=terminators,
+      do_sample=True,
+      temperature=0.6,
+      top_p=0.9,
+  )
+  response = outputs[0][input_ids.shape[-1]:]
+  return tokenizer.decode(response, skip_special_tokens=True)
+
+def get_rules(sentence):
+    formatted_prompt = format_prompt(sentence)
+    rules = generate(formatted_prompt).split('\n')
+    result =  [r.strip() for r in rules if r.strip()]
+    return result
+
+if __name__ == "__main__":
+    print('start')
+    start_time = time.time()
+    inputfile = open('./data/main_classes_dataset_03.json', "r", encoding='utf-8')
+    data = json.load(inputfile)
+    inputfile.close()
+    counter = 1
+    for c in tqdm(data):
+        for item in tqdm(data[c]): 
+            content = item['content']
+            item['keywords'] = get_rules(content)
+            print(f"section {counter} ...")
+            counter += 1
+
+    outputfile = open('./data/main_keywords_lama70B_dataset_03.json', "w", encoding='utf-8')
+    outputfile.write(json.dumps(data, ensure_ascii=False, indent = 4))
+    outputfile.close()
+    end_time = time.time()
+    print(f"elapsed time:   {end_time-start_time}")
+    print("end")
+    
+    exit()
+    
+    """
+    system prompt version 2 for test:
+    
+    You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text.
+    
+    user prompt version 2 for test:
+    
+    Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
+    Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".
+    """
--- a/llama_givechi.py
+++ b/llama_givechi.py
@ -0,0 +1,236 @@
+"""
+This file works with the Parsivar normalizer.
+"""
+from datetime import datetime
+# from elasticsearch import Elasticsearch
+from threading import Thread
+import torch
+import time
+import json 
+import os.path
+import os
+from funcs import write_to_json, read_from_json
+from normalizer import Normalizer
+from tokenizer import *
+# Shared Normalizer instance for this module, created with date normalization enabled.
+_normalizer = Normalizer(date_normalizing_needed=True)
+# Current working directory; in the visible code it is only referenced by the
+# commented-out dataset paths below.
+address = os.getcwd()
+# Earlier dataset locations, kept for reference:
+# sections_list = read_from_json(address + '/data/clean_sections_11k.json') # Main File
+# sections_list = read_from_json('../data/clean_sections_11k.json') # Main File
+# sections_list = read_from_json('../data/simplized_sentences_110_2.json') # Main File
+# sections_list = read_from_json('./data/main_sections_170k_metadata.json') # Main File
+
+def read_from_json1(file_address):
+    data_dict = []
+    # خواندن اطلاعات از فایل JSON
+    with open(file_address, 'r', encoding='utf-8') as file:
+        loaded_data = json.load(file)
+       
+    # نمایش اطلاعات خوانده شده
+    # for item in loaded_data:
+    #     data_dict.append(item)
+    return loaded_data
+
+sections_list = read_from_json1("./data_givechi/main_classes_dataset_03.json") # Main File
+
+# not_have_two_token_kw = read_from_json('../data/not_have_two_token_kw.json')
+# not_have_two_token_kw_list = [item3["id"] for item3 in not_have_two_token_kw]
+not_have_two_token_kw_list = []
+
+import json
+os.environ['HF_HOME'] = "/home/admin/HFHOME"
+from tqdm import tqdm
+import time
+
+from transformers import AutoTokenizer, AutoModel
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+
+#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+#model_id = "PartAI/Dorna2-Llama3.1-8B-Instruct"
+
+model_id = "meta-llama/Llama-3.1-70B-Instruct"
+
+# use quantization to lower GPU usage
+# 4 bit:
+# bnb_config = BitsAndBytesConfig(
+#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
+# )
+# 8 bit:
+bnb_config = BitsAndBytesConfig(
+    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
+)
+print("Model Loading START:")
+print(str(datetime.now()))
+print()
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=bnb_config
+)
+terminators = [
+    tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("<|eot_id|>")
+]
+model.generation_config.pad_token_id = tokenizer.eos_token_id #tokenizer.pad_token_id
+print("Model Loading END:")
+print(str(datetime.now()))
+print()
+
+# if torch.cuda.is_available():
+#     #model_id = "PartAI/Dorna-Llama3-8B-Instruct"
+#     # model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+    
+#     model_id = "meta-llama/Llama-3.1-70B-Instruct"
+#     model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
+#     tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# index_name_i = 'semantic_search-v10'
+
+# es = Elasticsearch(
+#     "http://127.0.0.1:6900",
+#     # ca_certs="/path/to/http_ca.crt",
+#     basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
+# )
+
+counter = 0
+total = 0
+remained = 0
+id = ''
+keywords_count = 15
+   
+def generateKeywords(law):
+    """Ask the chat model for the propositions and causal rules stated in a legal text.
+
+    Despite its name, the function no longer extracts keywords: it sends a
+    deontic-logic prompt and returns whatever the model answers.
+
+    Args:
+        law: Text of the legal section; it is interpolated verbatim into the
+            user message.
+
+    Returns:
+        The decoded model response (special tokens removed), or ``None`` when
+        ``law`` is ``None`` or when any step raised. Failures are printed, not
+        re-raised, so one bad section does not abort a long batch run.
+    """
+    if law is None:
+        # Nothing to analyse; the caller already treats None as "no result".
+        return None
+
+    system_prompt = '''You are a lawyer and legal expert who can interpret legal texts well, understand the semantic layers of the text and infer logical relationships in the text.
+You can understand Persian and English and interpret expressions in a Legal Context. You are also an expert in Deontic Logic and can understand the logical structure of law and rules in text.
+
+Use uppercase English letters such as A, B, C, etc. to identify all possible propositions. Do not include negative tones such as ""not"" in the propositions. For example, if the sentence is ""It is not bored,"" you should use ""A: bored"" to represent it.
+Next, for each proposition, use the symbol to represent its negative form. For example, the negative form of proposition A can be expressed as ¬A.
+Now, carefully analyze the context and find causal relationship between propositions seriously. A causal expression is only established when the context directly and explicitly supports this relationship. Use arrows (→) to indicate causal relationships, for example, ""If A, then B"", ""B if A"" and ""A causes B"" etc. must be represented as A → B.
+another example, ""if A and not C then B"" should be represented as A ∧ ¬C → B
+if the causal expression is not strict and is somehow loose, do not state it.
+
+Extraction Criteria:
+1- Defintion of a propostion:
+a proposition is an statement which could be true or false. ""I'm an animal"" is a propostion, but ""Ministry of Roads and Urban Development"" is not.
+Defintion of strict caustion:
+A strictly causes B if and only if:
+1. Whenever A occurs, B necessarily occurs.
+2. The occurrence of B is entirely dependent on the occurrence of A.
+3. There are no other factors or conditions that can independently cause B.
+4. The relationship between A and B is invariant and holds in all relevant circumstances.
+
+Additional Instructions:
+Do Not Add Extra Information: Provide only the extracted rules in the specified format without additional commentary or explanations.
+
+Formatting Instructions:
+Language: Output must be in English.
+output only propositions and causal expressions that are explicitly stated in the following text.'''
+
+    try:
+        messages = [
+            {"role": "system", "content": system_prompt},
+            # NOTE(review): the leading double quote is unbalanced; it is kept
+            # because it is part of the prompt the model actually receives.
+            {"role": "user", "content": f'''"this is your text to process:{law}'''},
+        ]
+
+        input_ids = tokenizer.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            return_tensors="pt"
+        ).to(model.device)
+
+        # Stop on either the regular EOS token or the chat end-of-turn token.
+        terminators = [
+            tokenizer.eos_token_id,
+            tokenizer.convert_tokens_to_ids("<|eot_id|>")
+        ]
+
+        # The tokenizer may not define a pad token; a None pad id makes
+        # generate() warn on every call, so fall back to EOS explicitly.
+        pad_token_id = tokenizer.pad_token_id
+        if pad_token_id is None:
+            pad_token_id = tokenizer.eos_token_id
+        model.generation_config.pad_token_id = pad_token_id
+
+        outputs = model.generate(
+            input_ids,
+            max_new_tokens=256,
+            eos_token_id=terminators,
+            do_sample=True,
+            temperature=0.6,
+            top_p=0.85,
+        )
+
+        # Drop the echoed prompt tokens and keep only the newly generated ones.
+        response = outputs[0][input_ids.shape[-1]:]
+        return tokenizer.decode(response, skip_special_tokens=True)
+
+    except Exception as inst:
+        # Deliberate best-effort: report the failure and let the caller go on.
+        print(type(inst))    # the exception type
+        print(inst.args)     # arguments stored in .args
+        print("Exception: " + str(inst))
+        return None
+
+
+
+if __name__ == "__main__":
+    # Batch driver: prompt the model for every section of every class in
+    # `sections_list` and dump the answers to JSON, grouped by class.
+    start_time = time.time()
+    print("start_time: "+str(datetime.now()))
+
+    try:
+        # Denominator of the progress line. Over-long sections are skipped
+        # below, so the final count can end below this total.
+        total_sections = sum(
+            len(section_items) for section_items in sections_list.values()
+        )
+
+        # Every processed section across all classes; feeds the early checkpoint.
+        all_results = []
+        count = 1
+        finall_data = []
+
+        for q_class, current_section_list in sections_list.items():
+            # A fresh list per class: sharing one list across classes made every
+            # class entry in the output hold the sections of all classes.
+            class_results = []
+
+            for content_item in current_section_list:
+                # NOTE: `id` shadows the builtin; kept because it is also a
+                # module-level variable other code may read.
+                id = content_item['id']
+                qanon_id = content_item['qanon_id']
+                content = content_item['content']
+
+                # Skip overly long contents (more than 2000 whitespace-separated words).
+                if len(content.split()) > 2000:
+                    print("too long content " + str(id))
+                    continue
+
+                content = _normalizer.sub_alphabets(content)
+                prompt_result = generateKeywords(content)
+                print("section " + str(count) + "/" + str(total_sections) + " prompting ... ")
+
+                record = {
+                    'id': id,
+                    'qanon-id': qanon_id,
+                    'content': content,
+                    'prompt-result': prompt_result
+                }
+                class_results.append(record)
+                all_results.append(record)
+
+                # Early checkpoint so the output format can be inspected
+                # without waiting for the whole run.
+                if count == 5:
+                    write_to_json(all_results, f"./data_givechi/main_test_{count}.json")
+                count += 1
+
+            finall_data.append({q_class: class_results})
+
+        write_to_json(finall_data, "./data_givechi/main_classes_dataset_result.json")
+
+    except Exception as inst:
+        # Top-level boundary: report the failure, then still print the timing.
+        print(type(inst))    # the exception type
+        print(inst.args)     # arguments stored in .args
+        print("Exception: " + str(inst))
+
+    end_time = time.time()
+    print("end_time: "+ str(datetime.now()))
+    operation_time = (int(end_time-start_time)/60)/60
+    print(f"elapsed time: {operation_time} hours")
+    print("Finished!!!")
--- a/normalizer.py
+++ b/normalizer.py
--- a/resource/normalizer/Dic1_new.txt
+++ b/resource/normalizer/Dic1_new.txt
@ -0,0 +1,74 @@
+بیخبر بی‌خبر
+بیتوجهی بی‌توجهی
+بیطرفانه بی‌طرفانه
+گفتوگو گفت‌وگو
+آنها آن‌ها
+پیشبرد پیش‌برد
+روانشناختی روان‌شناختی
+میباشد می‌باشد
+لذتبخش لذت‌بخش
+میدادند می‌دادند
+مینویسد می‌نویسد
+میبخشد می‌بخشد
+بیقاعده بی‌قاعده
+میباشند می‌باشند
+موافقتنامه موافقت‌نامه
+تخمگذار تخم‌گذار
+پایینترین پایین‌ترین
+گرمکن گرم‌کن
+پیشبینی پیش‌بینی
+برونگرا برون‌گرا
+میدهد می‌دهد
+فیلمبرداری فیلم‌برداری
+آنسوی آن‌سوی
+خدمتدهی خدمت‌دهی
+اینگونه این‌گونه
+کمکرسانی کمک‌رسانی
+کلانشهر کلان‌شهر
+سپردهگذار سپرده‌گذار
+بنیانگذار بنیان‌گذار
+رضایتبخش رضایت‌بخش
+اصلاحطلبان اصلاح‌طلبان
+استخوانبندی استخوان‌بندی
+درونگرا درون‌گرا
+میگردد می‌گردد
+اصلاحطلب اصلاح‌طلب
+میتوان می‌توان
+عملکرد عمل‌کرد
+میروم می‌روم
+بزرگنمایی بزرگ‌نمایی
+همجنس هم‌جنس
+همانطور همان‌طور
+بیشترین بیش‌ترین
+انسانگرایی انسان‌گرایی
+نمیباشند نمی‌باشند
+جانبداری جانب‌داری
+نمیتوانی نمی‌توانی
+قانونگذار قانون‌گذار
+میشدند می‌شدند
+تفاهمنامه تفاهم‌نامه
+آسیبپذیر آسیب‌پذیر
+برونگرایی برون‌گرایی
+جفتگیری جفت‌گیری
+گرانبها گران‌بها
+میشوند می‌شوند
+کلاهبرداری کلاه‌برداری
+جهتیابی جهت‌یابی
+چشمپوشی چشم‌پوشی
+بنیانگذاران بنیان‌گذاران
+میکند می‌کند
+الهامبخش الهام‌بخش
+وقتگیر وقت‌گیر
+پسلرزه پس‌لرزه
+میکنند می‌کنند
+میتواند می‌تواند
+آرامبخش آرام‌بخش
+بینام بی‌نام
+غربزدگی غرب‌زدگی
+بیتفاوت بی‌تفاوت
+بیثباتی بی‌ثباتی
+پاسخگویی پاسخ‌گویی
+میگیرد می‌گیرد
+جمعبندی جمع‌بندی
+میشود می‌شود
+میکنیم می‌کنیم
--- a/resource/normalizer/Dic2_new.txt
+++ b/resource/normalizer/Dic2_new.txt
@ -0,0 +1,112 @@
+مهماننوازی مهمان‌نوازی
+صلیالله صلی‌الله
+موافقتنامه موافقت‌نامه
+اعتراضآمیز اعتراض‌آمیز
+رییسجمهور رییس‌جمهور
+چشمپوشی چشم‌پوشی
+هیئتعلمی هیئت‌علمی
+الزامآور الزام‌آور
+بیمهنامه بیمه‌نامه
+آییننامه آیین‌نامه
+بتنریزی بتن‌ریزی
+تشییعجنازه تشییع‌جنازه
+تامینکنندگان تامین‌کنندگان
+پرسشنامه پرسش‌نامه
+تحتالشعاع تحت‌الشعاع
+شگفتانگیز شگفت‌انگیز
+بزرگنمایی بزرگ‌نمایی
+نیمههادی نیمه‌هادی
+قابلکنترل قابل‌کنترل
+روانپزشکی روان‌پزشکی
+ضربالمثل ضرب‌المثل
+اضافهکاری اضافه‌کاری
+اختلافنظر اختلاف‌نظر
+بینالملل بین‌الملل
+یکطرفه یک‌طرفه
+موجشکن موج‌شکن
+عزتنفس عزت‌نفس
+بیسیم بی‌سیم
+شیبدار شیب‌دار
+دستیابی دست‌یابی
+روانشناختی روان‌شناختی
+عقبنشینی عقب‌نشینی
+بهطور به‌طور
+خطچین خط‌چین
+ادراکشده ادراک‌شده
+خزانهداری خزانه‌داری
+شیمیدرمانی شیمی‌درمانی
+آنسوی آن‌سوی
+نقطهچین نقطه‌چین
+منحصربهفرد منحصربه‌فرد
+درحالتوسعه درحال‌توسعه
+رضایتبخش رضایت‌بخش
+قرضالحسنه قرض‌الحسنه
+هرجومرج هرج‌ومرج
+سیبزمینی سیب‌زمینی
+میلیگرم میلی‌گرم
+نخستوزیر نخست‌وزیر
+تعیینکنندهای تعیین‌کننده‌ای
+طاقتفرسا طاقت‌فرسا
+قابلمشاهده قابل‌مشاهده
+بهوسیله به‌وسیله
+قابلدستیابی قابل‌دستیابی
+الهامبخش الهام‌بخش
+پیدرپی پی‌درپی
+سرمایهداری سرمایه‌داری
+لذتبخش لذت‌بخش
+تخمگذار تخم‌گذار
+گرمکن گرم‌کن
+قابلتوجهی قابل‌توجهی
+فیلمبرداری فیلم‌برداری
+خدمتدهی خدمت‌دهی
+معنیدار معنی‌دار
+کلانشهری کلان‌شهری
+گواهینامه گواهی‌نامه
+همجنس هم‌جنس
+همانطور همان‌طور
+سیستمعامل سیستم‌عامل
+حملونقل حمل‌ونقل
+تفاهمنامه تفاهم‌نامه
+بینالمللی بین‌المللی
+کلاهبرداری کلاه‌برداری
+نرمافزار نرم‌افزار
+مضافالیه مضاف‌الیه
+قطعنامهای قطعنامه‌ای
+پاسخگویی پاسخ‌گویی
+عکسبرداری عکس‌برداری
+پسلرزه پس‌لرزه
+خردهفروشی خرده‌فروشی
+حقوقبشر حقوق‌بشر
+تحلیلگران تحلیل‌گران
+اینگونه این‌گونه
+صرفهجویی صرفه‌جویی
+علیالخصوص علی‌الخصوص
+کلانشهرها کلان‌شهرها
+حاصلضرب حاصل‌ضرب
+اطلاعرسانی اطلاع‌رسانی
+دندانپزشکی دندان‌پزشکی
+پیشبرد پیش‌برد
+ایدهال ایده‌ال
+هیچگاه هیچ‌گاه
+صنایعدستی صنایع‌دستی
+سانتیمتر سانتی‌متر
+پیشبینی پیش‌بینی
+خلیجفارس خلیج‌فارس
+تاریخنگاری تاریخ‌نگاری
+هیچگونه هیچ‌گونه
+راهاندازی راه‌اندازی
+جستوجوی جست‌وجوی
+حاشیهنشینی حاشیه‌نشینی
+رنگآمیزی رنگ‌آمیزی
+جمعآوری جمع‌آوری
+وقتگیر وقت‌گیر
+آرامبخش آرام‌بخش
+غربزدگی غرب‌زدگی
+کلانشهر کلان‌شهر
+نرمافزاری نرم‌افزاری
+بدینوسیله بدین‌وسیله
+جمعبندی جمع‌بندی
+گفتوگو گفت‌وگو
+حملونقل حمل‌ونقل
+آیتالله آیت‌الله
+حجتالاسلام حجت‌الاسلام
--- a/resource/normalizer/Dic3_new.txt
+++ b/resource/normalizer/Dic3_new.txt
@ -0,0 +1,5 @@
+حملونقل حمل‌ونقل
+حجتالاسلاموالمسلمین حجت‌الاسلام‌والمسلمین
+آیتاللهالعظمی آیت‌الله‌العظمی
+گفتوگو گفت‌وگو
+حملونقل حمل‌ونقل
--- a/resource/tokenizer/TokenMerger.pckl
+++ b/resource/tokenizer/TokenMerger.pckl
--- a/resource/tokenizer/enDict
+++ b/resource/tokenizer/enDict
--- a/resource/tokenizer/faDict
+++ b/resource/tokenizer/faDict
--- a/tokenizer.py
+++ b/tokenizer.py
@ -0,0 +1,59 @@
+import re
+
+
+class Tokenizer():
+    """Whitespace word tokenizer and punctuation/newline sentence splitter."""
+
+    # Zero-width non-joiner (U+200C) and right-to-left mark (U+200F): invisible
+    # characters that are meaningless at the edges of a token.
+    _EDGE_MARKS = "\u200c\u200f"
+    # Stand-in that hides numbers while sentences are split, so the dot in a
+    # decimal such as "3.5" is not mistaken for a sentence end.
+    _NUMBER_PLACEHOLDER = 'floatingpointnumber'
+    _NUMBER_PATTERN = r"[-+]?\d*\.\d+|\d+"
+    # Sentence boundaries, applied in this order: runs of terminal punctuation
+    # (with any following newlines), then ':', ';' and the Arabic semicolon
+    # at end of line, then any remaining newlines.
+    _BOUNDARY_PATTERNS = (
+        r'([!\.\?؟]+)[\n]*',
+        r':\n',
+        r';\n',
+        r'؛\n',
+        r'[\n]+',
+    )
+
+    def __init__(self):
+        pass
+
+    def tokenize_words(self, doc_string):
+        """Split *doc_string* on whitespace into word tokens.
+
+        Zero-width non-joiners and right-to-left marks are stripped from both
+        ends of every token, and tokens that are empty afterwards are dropped.
+        Previously such a token re-emitted the preceding token.
+
+        Returns:
+            A list of non-empty token strings, in input order.
+        """
+        stripped_tokens = (
+            raw_token.strip(self._EDGE_MARKS) for raw_token in doc_string.split()
+        )
+        return [token for token in stripped_tokens if token]
+
+    def tokenize_sentences(self, doc_string):
+        """Split *doc_string* into sentences.
+
+        Each boundary is rewritten by :meth:`add_tab` as
+        ``" <punctuation>\\t\\t"`` and the text is then split on the double
+        tab, so punctuation stays attached to its sentence, preceded by a space.
+
+        Returns:
+            A list of the non-empty sentence strings.
+        """
+        # Hide the numbers, remembering them in order of appearance.
+        nums_list = re.findall(self._NUMBER_PATTERN, doc_string)
+        doc_string = re.sub(self._NUMBER_PATTERN, self._NUMBER_PLACEHOLDER, doc_string)
+
+        for boundary_pattern in self._BOUNDARY_PATTERNS:
+            doc_string = re.sub(boundary_pattern, self.add_tab, doc_string)
+
+        # Put the numbers back in one left-to-right pass. A placeholder with no
+        # number left (the literal word was already in the text) stays as it is.
+        remaining_numbers = iter(nums_list)
+        doc_string = re.sub(
+            self._NUMBER_PLACEHOLDER,
+            lambda match: next(remaining_numbers, match.group()),
+            doc_string,
+        )
+
+        return [sentence for sentence in doc_string.split('\t\t') if len(sentence) > 0]
+
+    def add_tab(self, mystring):
+        """Return the replacement text for one sentence-boundary regex match.
+
+        Args:
+            mystring: The ``re.Match`` object handed over by ``re.sub``.
+
+        Returns:
+            The matched text with surrounding spaces and newlines removed,
+            prefixed by one space and followed by two tabs (the split marker).
+        """
+        matched_text = mystring.group()
+        matched_text = matched_text.strip(' ')   # drop spaces around the punctuation
+        matched_text = matched_text.strip('\n')  # drop newlines around the punctuation
+        return " " + matched_text + "\t\t"