first push

commit e73258bff3

3  .gitignore  vendored  Normal file
@@ -0,0 +1,3 @@
*.log
*.pyc
*.json

94  conflict.py  Normal file
@@ -0,0 +1,94 @@
import json
from tqdm import tqdm
import time

import torch
import os
from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

os.environ['HF_HOME'] = "/home/admin/HFHOME"

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "meta-llama/Llama-3.1-70B-Instruct"

# use quantization to lower GPU usage
# 4 bit:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )
# 8 bit (note: the bnb_8bit_* keyword arguments are not standard BitsAndBytesConfig options
# and are most likely ignored; load_in_8bit=True is what takes effect):
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
model.generation_config.pad_token_id = tokenizer.eos_token_id  # tokenizer.pad_token_id

SYS_PROMPT = """
You receive two Persian legal rule texts and analyze them carefully, explain your answer step by step, to see whether these two rules logically conflict with each other.
Finally, state the final conclusion for the presence or absence of conflict with the words "yes" or "no".
"""  # Explain your answer step by step.


def format_prompt(SENTENCE1, SENTENCE2):
    PROMPT = f"Rule 1: {SENTENCE1}. Rule 2: {SENTENCE2}."
    return PROMPT


def generate(formatted_prompt):
    formatted_prompt = formatted_prompt[:50000]  # to avoid GPU OOM
    messages = [{"role": "system", "content": SYS_PROMPT}, {"role": "user", "content": formatted_prompt}]
    # tell the model to generate
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(
        input_ids,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)


def conflict(sentence1, sentence2):
    formatted_prompt = format_prompt(sentence1, sentence2)
    return generate(formatted_prompt)


if __name__ == "__main__":
    print('start')
    start_time = time.time()
    # inputfile = open('./main_rules_lama70B_dataset_02.json', "r", encoding='utf-8')
    result = conflict("حسین در حال حاضر در وزارت نفت و انرژی به استخدام دولت در آمده است.",
                      "حسین الان در دانشگاه دولتی مشغول به تحصیل است و هرکسی که در حال تحصیل باشد، از نظر قانون نمی تواند در دولت استخدام شود")
    end_time = time.time()

    print("*********************************************************")
    print("*********************************************************")
    print()
    print(result)
    print()
    print("*********************************************************")
    print("*********************************************************")

    print(f"elapsed time: {end_time-start_time}")
    print("end")

153  data_helper.py  Normal file
@@ -0,0 +1,153 @@
import pickle
import re
import string
import os


class DataHelper():
    def __init__(self):
        pass

    def clean_text(self, text_doc, new_line_elimination):
        punctuations = r')(}{:؟!،؛»«.' + r"/<>?.,:;"
        punctuations = '[' + punctuations + string.punctuation + ']'
        punctuations = punctuations.replace("@", "")

        text_doc.strip()  # note: str.strip() returns a new string, so this call has no effect as written

        # pattern = ur'\s*@[a-zA-Z0-9]*\s*'
        # tmp = re.findall(pattern, text_doc)
        # newstring = re.sub(pattern, eliminate_pattern, text_doc)

        # finding the numbers
        pattern = r"[-+]?\d*\.\d+|\d+"
        nums_list = re.findall(pattern, text_doc)
        newstring = re.sub(pattern, 'floatingpointnumber', text_doc)

        # pattern = '\s*' + punctuations + '+' + '\s*'
        # tmp = re.findall(pattern, newstring)
        # newstring = re.sub(pattern, self.add_space, newstring)

        # pattern = u'([a-zA-Z0-9]+)(\s*)(' + punctuations + u')(\s*)([a-zA-Z0-9]+)'
        # rep = ur'\1\3\5'
        # tmp = re.findall(pattern, newstring)
        # newstring = re.sub(pattern, rep, newstring)

        pattern = r'[\n]+'
        tmp = re.findall(pattern, newstring)
        if new_line_elimination:
            newstring = re.sub(pattern, " ", newstring)
        else:
            # newstring = re.sub(pattern, "\n", newstring)
            pass

        punctuations = r")(}{:؟!-،؛»«.@$&%" + r"/<>?.,:;"
        latinLettersDigits = r"a-zA-Z0-9"
        pattern = r'[^' + punctuations + latinLettersDigits + 'آ-ی' + '' + '\d\s:]'
        tmp = re.findall(pattern, newstring)
        newstring = re.sub(pattern, self.eliminate_pattern, newstring)

        pattern = r'[ ]+'
        tmp = re.findall(pattern, newstring)
        newstring = re.sub(pattern, ' ', newstring)

        for number in nums_list:
            pattern = 'floatingpointnumber'
            newstring = re.sub(pattern, number, newstring, 1)

        return newstring

    def add_space(self, mystring):
        mystring = mystring.group()  # this method returns the string matched by re
        mystring = mystring.strip(' ')  # omitting the whitespace around the punctuation
        mystring = " " + mystring + " "  # adding a space before and after the punctuation
        return mystring

    def replace_newline_with_dot(self, mystring):
        return ' . '

    def eliminate_pattern(self, mystring):
        return ""

    def load_var(self, load_path):
        file = open(load_path, 'rb')
        variable = pickle.load(file)
        file.close()
        return variable

    def save_var(self, save_path, variable):
        print("saving vars ...")
        file = open(save_path, 'wb')
        pickle.dump(variable, file)
        print("variable saved.")
        file.close()

    def build_stem_dictionary(self, normalizer, verb_tense_path, mokasar_noun_path):
        path_dir = "resource/Persian_Dependency_Treebank/Data/2ndRep"
        lexicon_stem = set()
        verb_stem = set()
        # verb_tense_map = {}
        verb_p2f_map = {}
        verb_f2p_map = {}
        for fileName in os.listdir(path_dir):
            file_path = path_dir + "/" + fileName
            with open(file_path, "r") as input:
                input_content = input.readlines()
                for el in input_content:
                    el = normalizer.sub_alphabets(el)
                    el = el.split("\t")
                    if (len(el) > 2):
                        if (el[3] == 'V'):
                            tmp_pos = "V"
                        else:
                            tmp_pos = "N"
                        stem_word = el[2]
                        stem_word = stem_word.split("#")
                        stem_word = [x.strip('\u200c') for x in stem_word]
                        if (tmp_pos == "V" and len(stem_word) == 2):
                            if (len(stem_word[0]) != 0 and len(stem_word[1]) != 0):
                                verb_p2f_map[stem_word[0]] = stem_word[1]
                                verb_f2p_map[stem_word[1]] = stem_word[0]
                                verb_stem.add(stem_word[0])
                                verb_stem.add(stem_word[1])
                        if (tmp_pos == 'V' and len(stem_word) == 3):
                            if (len(stem_word[0]) != 0 and len(stem_word[1]) != 0 and len(stem_word[2]) != 0):
                                # verb_prifix.add(stem_word[0])
                                verb_p2f_map[stem_word[1]] = stem_word[2]
                                verb_f2p_map[stem_word[2]] = stem_word[1]
                                verb_stem.add(stem_word[1])
                                verb_stem.add(stem_word[2])
                        for t in stem_word:
                            if len(t) > 1:
                                if (tmp_pos == 'N'):
                                    lexicon_stem.add(t)

        with open(verb_tense_path, "r") as bon_file:
            bon_file_content = bon_file.readlines()
            for el in bon_file_content:
                el = el.strip()
                el = normalizer.sub_alphabets(el)
                el = el.split()
                el = [x.strip('\u200c') for x in el]

                verb_p2f_map[el[0]] = el[1]
                verb_f2p_map[el[1]] = el[0]
                verb_stem.add(el[0])
                verb_stem.add(el[1])

        irregular_noun = {}
        with open(mokasar_noun_path, "r") as input:
            input_content = input.readlines()
            for el in input_content:
                el = normalizer.sub_alphabets(el)
                el = el.replace("\t\t", "\t")
                el = el.strip().split("\t")
                el = [x.strip('\u200c') for x in el]
                irregular_noun[el[0]] = el[1]
                lexicon_stem.add(el[0])

        verb_tense_map = [verb_p2f_map, verb_f2p_map]
        return lexicon_stem, verb_stem, verb_tense_map, irregular_noun
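
A minimal usage sketch for the cleaner above (not part of the commit; the sample string and flag value are hypothetical):

    from data_helper import DataHelper

    helper = DataHelper()
    # collapse newlines into spaces, strip unexpected characters, and restore the original numbers
    cleaned = helper.clean_text("ماده 12 -  متن   نمونه\n\nبا 3.5 درصد", new_line_elimination=True)
    print(cleaned)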

200  do_extractor.py  Normal file
@@ -0,0 +1,200 @@
"""
This file works with the Parsivar normalizer.
"""
from datetime import datetime
# from elasticsearch import Elasticsearch
from threading import Thread
import torch
import time
import json
import os.path
import os
os.environ['HF_HOME'] = "/home/admin/HFHOME"
# from general_functions import normalize_content
from funcs import write_to_json, read_from_json

from normalizer import Normalizer
from tokenizer import *
_normalizer = Normalizer(date_normalizing_needed=True)
address = os.getcwd()
# sections_list = read_from_json(address + '/data/clean_sections_11k.json') # Main File
# sections_list = read_from_json('../data/clean_sections_11k.json') # Main File
# sections_list = read_from_json('../data/simplized_sentences_110_2.json') # Main File
# sections_list = read_from_json('./data/main_sections_170k_metadata.json') # Main File

# not_have_two_token_kw = read_from_json('../data/not_have_two_token_kw.json')
# not_have_two_token_kw_list = [item3["id"] for item3 in not_have_two_token_kw]
not_have_two_token_kw_list = []

import json
from tqdm import tqdm
import time

from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
os.environ['HF_HOME'] = "/home/admin/HFHOME"

# model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "PartAI/Dorna2-Llama3.1-8B-Instruct"

model_id = "meta-llama/Llama-3.1-70B-Instruct"

# use quantization to lower GPU usage
# 4 bit:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )
# 8 bit:
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
)
print("Model Loading START:")
print(str(datetime.now()))
print()
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
model.generation_config.pad_token_id = tokenizer.eos_token_id  # tokenizer.pad_token_id
print("Model Loading END:")
print(str(datetime.now()))
print()

sections_list = read_from_json('./data/sections_110.json')  # Main File
# if torch.cuda.is_available():
#     # model_id = "PartAI/Dorna-Llama3-8B-Instruct"
#     # model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#     model_id = "meta-llama/Llama-3.1-70B-Instruct"
#     model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
#     tokenizer = AutoTokenizer.from_pretrained(model_id)

# index_name_i = 'semantic_search-v10'

# es = Elasticsearch(
#     "http://127.0.0.1:6900",
#     # ca_certs="/path/to/http_ca.crt",
#     basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
# )

counter = 0
total = 0
remained = 0
id = ''
keywords_count = 15

def generateKeywords(text):
    global remained
    try:
        keywords_count = (len(text) / 1000) * 15
        keywords_count = int(keywords_count)
        if keywords_count == 0:
            keywords_count = 1

        # messages = [{"role": "system", "content": "تو یک وکیل حقوق دان هستی و باید بتوانی متن های قانونی و حقوقی را بدون تغییر اصطلاحات فنی، به صورتی توضیح دهی که افراد غیر حقوق دان، معنای متن را درک کنند. " },
        #             {"role": "user", "content":
        #              '''از "متن" حداقل {} عبارت های کلیدی مهم و پراهمیت را استخراج کن و عبارت های کلیدی را در قالب لیست به زبان فارسی چاپ کن و هر کلید عبارت کلیدی را در یک خط جدید قرار بده و هیچ گونه توضیحی در ابتدا یا انتهای پاسخ، اضافه نکن.
        #              هر عبارت کلیدی دارای یک شماره ترتیبی در ابتدای آن باشد. عبارت های کلیدی، دقیقا در متن موجود باشد. بسیار مهم و ضروری است که طول هر عبارت کلیدی حداقل دو توکن داشته باشد و عبارت کلیدی یک توکنی قابل قبول نیست. تاکید می کنم که هیچ عبارت کلیدی نباید فقط یک توکن داشته باشد. نام سازمان ها و نهادها و اشخاص حقوقی، حتما به عنوان عبارت کلیدی درنظر گرفته شود. هیچ عبارت کلیدی، فعل یا حرف اضافه نباشد و فقط شامل اسم هایی باشد که به هم اضافه شده اند. هیچ عبارت کلیدی نباید با حرف اضافه یا حرف «و» تمام شود. ضروری است که عبارت های کلیدی شامل ماده، بند، تبصره یا تاریخ ها نباشند.'''
        #              .format(keywords_count)
        #             },
        #             {"role": "user", "content":
        #              '''"متن": {}'''.format(text)
        #             },]
        messages = [{"role": "system", "content": "You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text."},
                    {"role": "user", "content":
                     '''Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
                     Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".'''
                     .format(keywords_count)
                    },
                    {"role": "user", "content":
                     '''"متن": {}'''.format(text)
                    },]

        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]
        model.generation_config.pad_token_id = tokenizer.pad_token_id

        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.85,
        )
        # lock0.release()
        response = outputs[0][input_ids.shape[-1]:]
        keywords = tokenizer.decode(response, skip_special_tokens=True)
        # lock1.acquire()
        # resp = es.update(index=index_name_i, id=id, doc={"content_keywords-llama3-str": str(keywords)})

        return keywords

    except Exception as inst:
        print(type(inst))   # the exception type
        print(inst.args)    # arguments stored in .args
        print("Exception: " + str(inst))


if __name__ == "__main__":
    start_time = time.time()
    print("start_time: " + str(datetime.now()))

    try:
        keywords_dict = []

        count = 1
        for content_item in sections_list:
            id = content_item['id']
            # if not id in not_have_two_token_kw_list:
            #     continue
            content = content_item['content']
            content_len = len(content.split())
            # set aside overly long contents
            if content_len > 2000:
                print("too long content " + str(id))
                continue
            content = _normalizer.sub_alphabets(content)
            keywords = generateKeywords(content)
            print("section " + str(count) + "/" + str(len(not_have_two_token_kw_list)) + " keyword extracting ... ")
            keywords_dict.append({
                'id': id,
                'keywords': keywords
            })
            if count % 500 == 0:
                write_to_json(keywords_dict, f"./data/sections_kw_llama_8b_main_{count}.json")
                keywords_dict = []
            count += 1

        write_to_json(keywords_dict, f"./data/sections_kw_llama_8b_main_{count}.json")

    except Exception as inst:
        print(type(inst))   # the exception type
        print(inst.args)    # arguments stored in .args

    end_time = time.time()
    print("end_time: " + str(datetime.now()))
    operation_time = (int(end_time - start_time) / 60) / 60
    print(f"elapsed time: {operation_time} hours")
    print(f"Finished!!!")
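
The number of requested key phrases in generateKeywords scales with the input length: roughly 15 per 1,000 characters, with a floor of one. A small standalone sketch of that calculation (illustrative values only):

    def requested_keywords(text: str) -> int:
        # 15 key phrases per 1000 characters, at least 1
        count = int((len(text) / 1000) * 15)
        return max(count, 1)

    print(requested_keywords("a" * 2500))  # -> 37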

119  funcs.py  Normal file
@@ -0,0 +1,119 @@
import re
import os
import json
# from pandas import read_excel, DataFrame   # note: the Excel helpers below need this import

def remove_signs():
    str = read_file()
    # lines =
    pattern = r"\(|\)"
    str = re.sub(pattern, '', str)
    # str = re.sub(')','', str)
    # str = re.sub('/','', str)

    return str

def read_file():
    with open('./data/DATASET_2.txt', 'r', encoding='utf-8') as file:
        text = ''
        try:
            text = str(file.read())
        except:
            pass
    return text

def read_file_by_address(file_address):
    with open(file_address, 'r', encoding='utf-8') as file:
        text = ''
        try:
            text = str(file.read())
        except:
            pass
    return text

def save_to_file(result):
    with open('./data/DATASET_3.txt', 'a+', encoding='utf-8') as file:
        previous_result = ''
        try:
            previous_result = file.read()
        except:
            pass
        file.write(result)
        file.close()

def save_to_file_by_address(file_address, text):
    with open(file_address, 'a+', encoding='utf-8') as file:
        previous_result = ''
        try:
            previous_result = file.read()
        except:
            pass
        file.write(text)
        file.close()


def read_from_excel(file_address, column_name):
    # read the Excel file
    data = read_excel(file_address)

    # extract the content of the requested column
    column_data = data[column_name]
    return column_data

def add_columndata_to_excel(file_address, column_name, columndata):

    # read the Excel file
    data = read_excel(file_address)

    # add the new column to the data
    data[column_name] = columndata

    # save the data back to the Excel file
    data.to_excel(file_address, index=False)

def write_to_excel(data_dict, file_name_and_address):
    df = DataFrame(data_dict)

    # save the DataFrame as an Excel file
    df.to_excel(file_name_and_address, index=False)
    return True

def write_to_json(dict, file_address):

    # convert the dictionary to JSON format
    json_data = json.dumps(dict, indent=2, ensure_ascii=False)

    # save the file
    with open(file_address, 'w+', encoding='utf-8') as file:
        file.write(json_data)

    return True

def read_from_json(file_address):
    data_dict = []
    # read the data from the JSON file
    with open(file_address, 'r', encoding='utf-8') as file:
        loaded_data = json.load(file)

    # collect the items that were read
    for item in loaded_data:
        data_dict.append(item)
    return data_dict


def separated_date_format_finder(date_ner):
    result = False
    date_ner = date_ner.replace('.', '/')
    date_ner = date_ner.replace('،', '/')
    date_ner = date_ner.replace('ر', '/')
    # date_pattern = r'\d{1,2} /\d{1,2} /\d{2,4}|\d{1,2}/\d{1,2}/\d{2,4}|\d{2,4} /\d{1,2} /\d{1,2}|\d{2,4}/\d{1,2}/\d{1,2}'
    date_pattern = r'\b(?:(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9])|(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9]|[0-9]{2}))\b'
    regex = re.compile(date_pattern)
    match_dates = regex.finditer(date_ner)
    for date_item in match_dates:
        result = True
        break
    return result

if __name__ == "__main__":

    pass
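
A small round-trip sketch for the JSON helpers above (not part of the commit; the path is hypothetical):

    from funcs import write_to_json, read_from_json

    records = [{"id": 1, "content": "متن نمونه"}]
    write_to_json(records, "./data/example.json")   # pretty-printed, ensure_ascii=False
    assert read_from_json("./data/example.json") == records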

800  general_functions.py  Normal file
@@ -0,0 +1,800 @@
from normalizer import Normalizer
from tokenizer import *
# import jalali   # note: jdate2timestamp() below calls jalali.Persian, so this import is needed for date conversion
import re
from re import sub
import textwrap
from html import escape
from bs4 import BeautifulSoup

# from lxml import etree
import datetime
# enumerate(token_list):
_normalizer = Normalizer(date_normalizing_needed=True)

yeAr = r"ﻱ|ې|ێ|ے|ى|ي|ئ"
yeFr = r"ی"
keAr = r"ڭ|ﻚ|ﮎ|ﻜ|ﮏ|ګ|ﻛ|ﮑ|ﮐ|ڪ|ك"
keFr = r"ک"
mark1 = r'#\[#'
mark2 = r'#\]#'
hTag1 = r'<'
hTag2 = r'>'
tableTag = ["table", "tr", "th", "td", "TABLE", "TR", "TH", "TD"]
strTable = ''
for tag in tableTag:
    if strTable != '':
        strTable += '|'
    strTable += '(' + tag + ')'
regTable = r'<(?P<slash>\/)*(?P<tag>' + strTable + ')(?P<class>[^>]+)*>'
regTableReplace = r'#[#\g<slash>\g<tag>\g<class>#]#'


def isNeedHtml(html):
    if "<TABLE" in html or "<table" in html or "</TR" in html or "</tr" in html:
        return True
    else:
        return False


def removeHtmlTags(html, exeptionTag=[]):
    # reg1 = r'<[^>]+>'

    if exeptionTag.__len__:
        exceptTags = ''
        for tag in exeptionTag:
            if exceptTags != '':
                exceptTags += '|'
            exceptTags += '(' + tag + ')'
        reg1 = r'<(?P<slash>/)*(?P<tag>' + exceptTags + ')(?P<class>[^>]+)*>'
        html = sub(reg1, regTableReplace, html)

    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text("\n", strip=True)

    if exeptionTag.__len__:
        text = sub(mark1, hTag1, text)
        text = sub(mark2, hTag2, text)

    return text


def removeHtmlNoTableTag(html):

    # known issue: this has errors and hangs in test2.py
    html = sub(regTable, regTableReplace, html)
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text("\n", strip=True)

    text = sub(mark1, hTag1, text)
    text = sub(mark2, hTag2, text)

    return text

def normalizerData(data):
    global _normalizer
    normalTitle, dates = _normalizer.normalize(data, return_dates=True)
    tdates = []
    for d in dates:
        if not d.startswith("num"):
            try:
                tsd = jdate2timestamp(d)
                cdate = d
            except:
                try:
                    d = d.replace("y", "")
                    d = d.replace("m", "/")
                    d = d.replace("d", "/")
                    m = re.match(r"^(\d{4})\D(\d{1,2})\D(\d{1,2})$", d)
                    if m:
                        [year, month, day] = [int(m.group(1)), int(m.group(2)), int(m.group(3))]
                        if year > 1200 and year < 1550:
                            if month < 1 or month > 12:
                                month = 1
                            if day < 1 or day > 31:
                                day = 1
                            cdate = str(year) + "/" + str(month) + "/" + str(day)
                            tsd = jdate2timestamp(cdate)
                        else:
                            # cdate = "1403/03/03"
                            # tsd = jdate2timestamp(cdate)
                            continue
                    else:
                        # cdate = "1403/03/03"
                        # tsd = jdate2timestamp(cdate)
                        continue
                except:
                    # print("Error in:"+ d +" for id: " + id)
                    # cdate = "1403/03/03"
                    # tsd = jdate2timestamp(cdate)
                    continue
            tdates.append({"date": cdate, "timestamp": tsd, "index": 0, "slice": ""})

    return normalTitle, tdates

def normalizerDate2(inputString):
    global _normalizer
    normalizedString, dates, recognized_dates, recognized_numbers = _normalizer.normalize(inputString, return_dates=True)
    tdates = []
    for date_item in recognized_dates:
        date_part = date_item['date']
        date_token_index = date_item['date_token_index']
        start_date_token_index = date_item['start_date_token_index']
        end_date_token_index = date_item['end_date_token_index']
        if not date_part.startswith("num"):
            try:
                cdate = date_part
                tsd = jdate2timestamp(date_part)
            except:
                try:
                    date_part = date_part.replace("y", "")
                    date_part = date_part.replace("m", "/")
                    date_part = date_part.replace("d", "/")
                    m = re.match(r"^(\d{4})\D(\d{1,2})\D(\d{1,2})$", date_part)
                    if m:
                        [year, month, day] = [int(m.group(1)), int(m.group(2)), int(m.group(3))]
                        if year > 1200 and year < 1550:
                            if month < 1 or month > 12:
                                month = 1
                            if day < 1 or day > 31:
                                day = 1
                            cdate = str(year) + "/" + str(month) + "/" + str(day)
                            tsd = jdate2timestamp(cdate)
                        else:
                            # cdate = "1403/03/03"
                            # tsd = jdate2timestamp(cdate)
                            continue
                    # else:
                    #     # cdate = "1403/03/03"
                    #     # tsd = jdate2timestamp(cdate)
                    #     continue
                except:
                    # print("Error in:"+ d +" for id: " + id)
                    # cdate = "1403/03/03"
                    # tsd = jdate2timestamp(cdate)
                    continue
            import tokenizer as t
            inputString_token = t.Tokenizer.tokenize_words(None, inputString)
            # if start_date_token_index == end_date_token_index:
            #     end_date_token_index += 1
            #     original_date_part = inputString_token[start_date_token_index:end_date_token_index]
            # else:
            original_date_part = inputString_token[start_date_token_index:end_date_token_index + 1]
            original_date = ''
            for part in original_date_part:
                original_date = original_date + ' ' + part
            original_date = original_date.strip()
            tdates.append({"converted_date": date_item['date'],
                           "date": cdate,
                           "original_date": original_date,
                           # "timestamp": tsd,
                           "date_token_index": date_token_index,
                           "start_date_token_index": start_date_token_index,
                           "end_date_token_index": end_date_token_index})
    '''
    for d in dates:
        if not d.startswith("num"):
            try:
                tsd = jdate2timestamp(d)
                cdate = d
            except:
                try:
                    d = d.replace("y", "")
                    d = d.replace("m", "/")
                    d = d.replace("d", "/")
                    m = re.match(r"^(\d{4})\D(\d{1,2})\D(\d{1,2})$", d)
                    if m:
                        [year, month, day] = [int(m.group(1)), int(m.group(2)), int(m.group(3))]
                        if year > 1200 and year < 1550:
                            if month < 1 or month > 12:
                                month = 1
                            if day < 1 or day > 31:
                                day = 1
                            cdate = str(year) + "/" + str(month) + "/" + str(day)
                            tsd = jdate2timestamp(cdate)
                        else:
                            # cdate = "1403/03/03"
                            # tsd = jdate2timestamp(cdate)
                            continue
                    else:
                        # cdate = "1403/03/03"
                        # tsd = jdate2timestamp(cdate)
                        continue
                except:
                    # print("Error in:"+ d +" for id: " + id)
                    # cdate = "1403/03/03"
                    # tsd = jdate2timestamp(cdate)
                    continue
            tdates.append({"date": cdate, "timestamp": tsd, "index": 0, "slice": ""})'''
    return normalizedString, tdates, recognized_numbers

def OtherDateFormatNormalizer(inputString, pattern):
    mainTextTemp = inputString
    regex_pattern_Mah = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\s(\d{1,4})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s"  # y0m4d4 ماه 1000 و 300 و 50 و 4
    regex_pattern_MahSal = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\s(\d{1,4})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s"  # y0m4d4 ماه سال 1000 و 300 و 50 و 4
    regex_pattern_MahSal2 = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\sy(\d{1,4})m(\d{1,2})d(\d{1,2})\sو\s(\d{1,3})\sو\s(\d{1,2})\sو\s(\d{1})\s"  # y0m4d4 ماه سال y1000m0d0 و 300 و 50 و 4
    regex_pattern_MahSal3 = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})\sماه\sسال\sy(\d{1,4})m(\d{1,2})d(\d{1,2})"  # y0m3d1 ماه سال y1353m0d0

    if (pattern == 1):
        regex = re.compile(regex_pattern_Mah)
    elif (pattern == 2):
        regex = re.compile(regex_pattern_MahSal)
    elif (pattern == 3):
        regex = re.compile(regex_pattern_MahSal2)
    elif (pattern == 4):
        regex = re.compile(regex_pattern_MahSal3)

    matches = regex.finditer(inputString)
    for match in matches:
        foundedPattern = match.group()
        foundedPatternTemp = match.group()
        if (pattern == 1):
            foundedPattern = foundedPattern.replace('ماه', '')
        else:
            foundedPattern = foundedPattern.replace('سال', '')
            foundedPattern = foundedPattern.replace('ماه', '')
        foundedPattern = foundedPattern.strip()
        tempString = foundedPattern
        standardDatePattern = r"y(\d{1,4})m(\d{1,2})d(\d{1,2})"
        # regex = re.compile(regex_pattern_Mah)
        matchItems = re.finditer(standardDatePattern, tempString)
        for item in matchItems:
            tempPattern = item.group()
            tempString = tempString.replace(tempPattern, '')
        tempString = tempString.strip()
        tempString = tempString.replace('و', '')
        tempString = tempString.strip()
        tempArray = tempString.split()
        year = 0
        for item in tempArray:
            dateMatch = re.finditer(standardDatePattern, item)
            regexFlag = True
            for dateItem in dateMatch:
                yearStr = dateItem.group()[1:5]
                year += int(yearStr)
                regexFlag = False
                break
            if (item.isalnum() and regexFlag):
                year += int(item)
        tempPattern = tempPattern.replace('y0', 'y' + str(year))
        mainTextTemp = mainTextTemp.replace(foundedPatternTemp, tempPattern + ' ')
    return mainTextTemp

    # foundedPattern = jdate2timestamp(foundedPattern)
    # convertedText = regex.sub(foundedPattern,convertedText)

def normalizerLongData(data):
    dates = []
    if len(data) > 10000:
        textParts = textwrap.wrap(data, 10000, break_long_words=False)
        for part in textParts:
            dates.extend(normalizerData(part))
    else:
        dates = normalizerData(data)
    return dates

# ##################
# On Windows, rr = gdt.timestamp() raised errors for negative values (dates before 1970)
# #################
def jdate2timestamp_old(dt):
    ndt = dt.replace("y", "")
    ndt = ndt.replace("m", "/")
    ndt = ndt.replace("d", "/")
    gd = jalali.Persian(ndt).gregorian_datetime()
    # print(gd)
    ztime = datetime.time(0, 0, 0, 0)
    gdt = datetime.datetime.combine(gd, ztime)
    # print(gdt)
    rr = gdt.timestamp()
    tst = int(round(rr) * 1000)
    return tst

def jdate2timestamp(dt):
    ndt = dt.replace("y", "")
    ndt = ndt.replace("m", "/")
    ndt = ndt.replace("d", "/")
    gd = jalali.Persian(ndt).gregorian_datetime()
    base = datetime.date(1970, 1, 1)
    rr = (gd - base).total_seconds()
    tst = int(round(rr) * 1000)
    return tst


def getSortTimestamp(ts_date):
    empty_date = -15000000000
    ts_ts = empty_date
    try:
        if ts_date != "":
            ts_ts = jdate2timestamp(ts_date)
    except:
        ts_ts = empty_date

    return ts_ts

def normalize_content(content):
    text = _normalizer.sub_alphabets(content)
    return text

def normalYehKe(text):
    if (text == None):
        return ''

    c1 = sub(yeAr, yeFr, text)
    c2 = sub(keAr, keFr, c1)
    c2 = c2.replace('\u00A0', '')
    return c2.strip()

_term_list = []
def setTermList():
    global _term_list
    if (_term_list.__len__() > 0):
        return
    _term_list = [
        {"begin": jdate2timestamp("1285/07/14"), "end": jdate2timestamp("1287/04/2"), "term": "مجلس شورای ملی-دوره1", "term_number": 1, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1288/8/24"), "end": jdate2timestamp("1290/10/3"), "term": "مجلس شورای ملی-دوره2", "term_number": 2, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1293/9/14"), "end": jdate2timestamp("1294/8/21"), "term": "مجلس شورای ملی-دوره3", "term_number": 3, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1300/4/1"), "end": jdate2timestamp("1302/3/30"), "term": "مجلس شورای ملی-دوره4", "term_number": 4, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1302/11/22"), "end": jdate2timestamp("1304/11/22"), "term": "مجلس شورای ملی-دوره5", "term_number": 5, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1305/4/19"), "end": jdate2timestamp("1307/5/22"), "term": "مجلس شورای ملی-دوره6", "term_number": 6, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1307/7/19"), "end": jdate2timestamp("1309/8/14"), "term": "مجلس شورای ملی-دوره7", "term_number": 7, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1309/9/24"), "end": jdate2timestamp("1311/10/24"), "term": "مجلس شورای ملی-دوره8", "term_number": 8, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1311/12/24"), "end": jdate2timestamp("1314/1/24"), "term": "مجلس شورای ملی-دوره9", "term_number": 9, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1314/3/15"), "end": jdate2timestamp("1316/3/22"), "term": "مجلس شورای ملی-دوره10", "term_number": 10, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1316/6/20"), "end": jdate2timestamp("1318/6/27"), "term": "مجلس شورای ملی-دوره11", "term_number": 11, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1318/8/3"), "end": jdate2timestamp("1320/8/9"), "term": "مجلس شورای ملی-دوره12", "term_number": 12, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1320/8/22"), "end": jdate2timestamp("1322/9/1"), "term": "مجلس شورای ملی-دوره13", "term_number": 13, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1322/12/16"), "end": jdate2timestamp("1324/12/21"), "term": "مجلس شورای ملی-دوره14", "term_number": 14, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1326/4/25"), "end": jdate2timestamp("1328/5/6"), "term": "مجلس شورای ملی-دوره15", "term_number": 15, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1328/11/20"), "end": jdate2timestamp("1330/11/29"), "term": "مجلس شورای ملی-دوره16", "term_number": 16, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1331/2/7"), "end": jdate2timestamp("1332/8/28"), "term": "مجلس شورای ملی-دوره17", "term_number": 17, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1332/12/27"), "end": jdate2timestamp("1335/1/26"), "term": "مجلس شورای ملی-دوره18", "term_number": 18, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1335/3/10"), "end": jdate2timestamp("1339/3/29"), "term": "مجلس شورای ملی-دوره19", "term_number": 19, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1339/12/2"), "end": jdate2timestamp("1340/2/19"), "term": "مجلس شورای ملی-دوره20", "term_number": 20, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1342/7/14"), "end": jdate2timestamp("1346/7/13"), "term": "مجلس شورای ملی-دوره21", "term_number": 21, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1346/7/14"), "end": jdate2timestamp("1350/6/9"), "term": "مجلس شورای ملی-دوره22", "term_number": 22, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1350/6/9"), "end": jdate2timestamp("1354/6/16"), "term": "مجلس شورای ملی-دوره23", "term_number": 23, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1354/6/17"), "end": jdate2timestamp("1357/11/20"), "term": "مجلس شورای ملی-دوره24", "term_number": 24, "majles_name": "شورای ملی"},
        {"begin": jdate2timestamp("1359/3/7"), "end": jdate2timestamp("1363/3/6"), "term": "مجلس شورای اسلامی-دوره1", "term_number": 1, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1363/3/7"), "end": jdate2timestamp("1367/3/6"), "term": "مجلس شورای اسلامی-دوره2", "term_number": 2, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1367/3/7"), "end": jdate2timestamp("1371/3/6"), "term": "مجلس شورای اسلامی-دوره3", "term_number": 3, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1371/3/7"), "end": jdate2timestamp("1375/3/11"), "term": "مجلس شورای اسلامی-دوره4", "term_number": 4, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1375/3/12"), "end": jdate2timestamp("1379/3/6"), "term": "مجلس شورای اسلامی-دوره5", "term_number": 5, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1379/3/7"), "end": jdate2timestamp("1383/3/6"), "term": "مجلس شورای اسلامی-دوره6", "term_number": 6, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1383/3/7"), "end": jdate2timestamp("1387/3/6"), "term": "مجلس شورای اسلامی-دوره7", "term_number": 7, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1387/3/7"), "end": jdate2timestamp("1391/3/6"), "term": "مجلس شورای اسلامی-دوره8", "term_number": 8, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1391/3/7"), "end": jdate2timestamp("1395/3/7"), "term": "مجلس شورای اسلامی-دوره9", "term_number": 9, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1395/3/8"), "end": jdate2timestamp("1399/3/6"), "term": "مجلس شورای اسلامی-دوره10", "term_number": 10, "majles_name": "شورای اسلامی"},
        {"begin": jdate2timestamp("1399/3/7"), "end": jdate2timestamp("1403/3/6"), "term": "مجلس شورای اسلامی-دوره11", "term_number": 11, "majles_name": "شورای اسلامی"},
    ]


def getTermQanon(ts_date_timestamp, ts_ref):
    setTermList()
    global _term_list
    term = ""
    term_number = 0
    majles_name = ""

    if ts_ref == "هيات وزيران (دوره فترت)":
        term = ts_ref
    if ts_ref == "نخست وزير (مصدق)":
        term = ts_ref
    if ts_ref == "وزير عدليه (داور)":
        term = ts_ref
    if ts_ref == "شوراي انقلاب جمهوري اسلامي ايران":
        term = ts_ref

    majles_name = term
    if term == "":
        for i in range(len(_term_list) - 1, -1, -1):
            begin = _term_list[i]["begin"]
            end = _term_list[i]["end"]
            if ts_date_timestamp >= begin and ts_date_timestamp <= end:
                term = _term_list[i]["term"]
                term_number = _term_list[i]["term_number"]
                majles_name = _term_list[i]["majles_name"]
                break

    error = ""
    if term == "":
        # if ts_date_timestamp >= _term_list[0]["begin"] and ts_date_timestamp <= _term_list[len(_term_list)-1]["end"] :
        if ts_date_timestamp <= _term_list[len(_term_list) - 1]["end"]:
            for i in range(0, len(_term_list) - 1, 1):
                end = _term_list[i]["end"]
                if ts_date_timestamp <= end:
                    term = _term_list[i]["term"]
                    term_number = _term_list[i]["term_number"]
                    majles_name = _term_list[i]["majles_name"]
                    error = "تاریخ بین دو دوره"  # date falls between two terms
                    break
        else:
            term_number = -1
            error = "تاریخ خارج از محدوده"  # date out of range

    return term, term_number, majles_name, error

# This method takes a text plus the start and end character index of a phrase inside it,
# and returns the numbers of the first and last token of that phrase in the text.
def token_state_finder(normalized_section_content, start_index, end_index):
    before_substring = normalized_section_content[0:start_index-1].strip()
    pattern_substring = normalized_section_content[start_index-1:end_index+1].strip()
    before_substring_token_list = before_substring.strip().split()
    pattern_token_list = pattern_substring.strip().split()
    start_token_state = len(before_substring_token_list)
    end_token_state = len(before_substring_token_list) + (len(pattern_token_list) - 1)
    pattern_tokens_state = {
        "start_token_state": start_token_state,
        "end_token_state": end_token_state
    }
    return pattern_tokens_state

def find_number_indexes_in_string(normalized_string, recognized_numbers):
    complete_recognized_numbers = []
    for item in recognized_numbers:
        number_start_index, number_end_index = find_token_indexes_in_string(normalized_string, item['start_token_index'], item['end_token_index'])
        content = normalized_string.split()
        # if item['start_token_index']==item['end_token_index']:
        #     # check whether removing this branch and keeping only the line below is enough for the method to work correctly
        #     number_token_list = content[item['start_token_index']]
        # else:
        number_token_list = content[item['start_token_index']:item['end_token_index'] + 1]
        complete_recognized_numbers.append(
            {
                'number_value': item['number_value'],
                'number_token_list': number_token_list,
                'start_token_index': item['start_token_index'],
                'end_token_index': item['end_token_index'],
                "start_number_state": number_start_index,
                "end_number_state": number_end_index
            }
        )
    return complete_recognized_numbers

# This method takes the original text plus the first and last token number of a phrase,
# and returns the start and end character index of that phrase.
def find_token_indexes_in_string(normalized_string, start_token_state, end_token_state):
    before_tokens = normalized_string.split()[0:start_token_state]
    content_tokens = normalized_string.split()[start_token_state:end_token_state + 1]
    content_start_index = 0
    content_end_index = 0
    # count the characters of every token that comes before the number
    for token in before_tokens:
        content_start_index += len(token)
    # add the number of spaces to the start index of the number
    content_start_index += len(before_tokens) + 1

    # count the characters of every token that belongs to the number
    for token in content_tokens:
        content_end_index += len(token)
    # add the number of spaces to the end index of the number
    content_end_index += (content_start_index - 1) + (len(content_tokens) - 1)

    return content_start_index, content_end_index

# This method takes a text, searches it for the patterns defined below, and returns an array with
# the phrases matching each pattern, the start and end character index of each phrase,
# the pattern key and value, and the start and end token number of each phrase.
def regex_patterns_finder(sectoin_content):
    regex_patterns = {
        "asle N asasi": r"اصل\s*شماره\s*(\d+)\s*قانون\s*اساسی\s*جمهوری\s*اسلامی\s*ایران",  # article N of the Constitution of the Islamic Republic of Iran
        "qanone asasi": r"(?<!^)\b\sقانون\sاساسی\sجمهوری\sاسلامی\sایران",  # "Constitution of the Islamic Republic of Iran", not at the start of the paragraph
        "qanone asasi": r"(?<!^)\b\sقانون\sاساسی",  # "Constitution", not at the start of the paragraph (duplicate key: this entry overrides the previous one)
        "qanon * mosavvab tarikh": r"\bقانون[\s\w/]*مصوب\s((y\d{2,4}m\d{1,2}d\d{1,2})|\d{2,4})",  # law ... enacted on a date
        "in qanon": r"این\sقانون",  # this law
        "qanone foq": r"قانون\sفوق",  # the above law
        "eslahe qanon": r"قانون\sاصلاح",  # amendment of the law
        "tabsare foq": r"تبصره\sفوق",  # the above note
        "made foq": r"ماده\sفوق",  # the above article
        "made vahede": r"ماده\sواحده",  # single article
        "made vahed": r"ماده\sواحد",  # single article (variant)
        "tabsare N": r"^\bتبصره\s*شماره\s*(\d+)\s*",  # note number N, only at the start of the paragraph
        "f tabsare N": r"(?<!^)\bتبصره\sشماره\s(\d+)\s",  # note number N, anywhere except the start of the paragraph
        "tabsare N": r"(?<!^)\bتبصره ?\(? ?\d+? ?\)?[ :.]",  # note N, anywhere except the start of the paragraph (duplicate key: this entry overrides the one above)
        "made N": r"(?<!^)\bماده ?\(? ?\d+? ?\)?[ :.]",  # *** article N, anywhere except the start of the paragraph
        "f made N": r"^\bماده\s*[(]?\s*(\d+)\s*[)]?\s*"  # article N, only at the start of the paragraph, with or without parentheses around the number
    }

    matched_array = []
    for pattern_key, pattern_value in regex_patterns.items():
        regex = re.compile(pattern_value)
        matches = regex.finditer(sectoin_content)

        for match in matches:
            # handle each matched pattern here
            founded_item = match.group()
            start_index = match.start() + 1
            end_index = match.end() - 1
            pattern_tokens_state = token_state_finder(sectoin_content, start_index, end_index)
            matched_array.append(
                {
                    "founded_item": founded_item,
                    "start_index": start_index,
                    "end_index": end_index,
                    "pattern_key": pattern_key,
                    "pattern_value": pattern_value,
                    "start_token_state": pattern_tokens_state["start_token_state"],
                    "end_token_state": pattern_tokens_state["end_token_state"],
                }
            )
        # convertedText = regex.sub(' wwwwwwwww ',convertedText)
    # sort the array by the start token of each phrase
    matched_array.sort(key=lambda x: int(x['start_token_state']), reverse=False)
    return matched_array

def change_refrece_tokens(normalized_section_content, recognized_patterns_array):
    token_list = normalized_section_content.strip().split()
    for ref_item in recognized_patterns_array:
        start_token_state = ref_item.get('start_token_state')
        end_token_state = ref_item.get('end_token_state')
        for i in range(start_token_state, end_token_state + 1):
            token_list[i] = 'eeee'
    normalized_section_content = ''
    for token in token_list:
        normalized_section_content = ''.join([normalized_section_content, (' ' + token)])
    return normalized_section_content.strip()

def getMetaData(text):
    normalized_section_content, recognized_dates, recognized_numbers = normalizerDate2(text.strip())
    recognized_numbers = find_number_indexes_in_string(text, recognized_numbers)
    normalized_section_content = normalized_section_content.strip()
    recognized_patterns_array = regex_patterns_finder(normalized_section_content)
    normalized_section_content = change_refrece_tokens(normalized_section_content, recognized_patterns_array)
    nlp_parser = []
    date_list = recognized_dates
    ref_list = recognized_patterns_array
    for date_item in date_list:
        nlp_parser.append({
            "properties": {
                "type": "date",
                "index_start": int(date_item['start_date_token_index']),
                "index_end": int(date_item['end_date_token_index']),
                "text": date_item['original_date'],
                "result": date_item['converted_date'],
                # "timestamp": date_item['timestamp'],
                "ref_link": ''
            }
        })
    for ref_item in ref_list:
        nlp_parser.append({
            "properties": {
                "type": "reference",
                "index_start": int(ref_item['start_token_state']),
                "index_end": int(ref_item['end_token_state']),
                "text": ref_item['founded_item'],
                "result": ref_item['pattern_value'],
                "ref_link": ''
            }
        })
    return nlp_parser, normalized_section_content

def save_error(error_text, filename):
    with open(filename, 'a+', encoding='utf-8') as file:
        # write the error to the file
        file.write(error_text + '\n' + 50 * '*' + '\n')
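
A hedged usage sketch for the metadata extractor above (not part of the commit; it assumes the local normalizer and tokenizer packages are importable and that jalali is available for jdate2timestamp):

    from general_functions import getMetaData

    section = "ماده 5 قانون فوق مصوب 1375/3/12 ..."
    nlp_parser, normalized = getMetaData(section)
    for entry in nlp_parser:
        props = entry["properties"]
        print(props["type"], props["text"], "->", props["result"])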
106
keyword_extractor.py
Normal file
106
keyword_extractor.py
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
import json
|
||||||
|
from tqdm import tqdm
|
||||||
|
import time
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import os
|
||||||
|
from transformers import AutoTokenizer, AutoModel
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
||||||
|
|
||||||
|
os.environ['HF_HOME'] = "/home/admin/HFHOME"
|
||||||
|
|
||||||
|
#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||||
|
model_id = "meta-llama/Llama-3.1-70B-Instruct"
|
||||||
|
|
||||||
|
# use quantization to lower GPU usage
|
||||||
|
# 4 bit:
|
||||||
|
# bnb_config = BitsAndBytesConfig(
|
||||||
|
# load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
|
||||||
|
# )
|
||||||
|
# 8 bit:
|
||||||
|
bnb_config = BitsAndBytesConfig(
|
||||||
|
load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
|
||||||
|
)
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
model_id,
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
device_map="auto",
|
||||||
|
quantization_config=bnb_config
|
||||||
|
)
|
||||||
|
terminators = [
|
||||||
|
tokenizer.eos_token_id,
|
||||||
|
tokenizer.convert_tokens_to_ids("<|eot_id|>")
|
||||||
|
]
|
||||||
|
model.generation_config.pad_token_id = tokenizer.eos_token_id #tokenizer.pad_token_id
|
||||||
|
|
||||||
|
SYS_PROMPT = """You receive a Persian legal text and extract from it the keywords that are most important.
|
||||||
|
And you don't need to provide explanations or additional text.
|
||||||
|
Put each keyword on a single line."
|
||||||
|
"""# Explain your answer step by step.
|
||||||
|
|
||||||
|
|
||||||
|
def format_prompt(SENTENCE):
|
||||||
|
PROMPT = f"Persian legal text: {SENTENCE}."
|
||||||
|
return PROMPT
|
||||||
|
|
||||||
|
def generate(formatted_prompt):
|
||||||
|
formatted_prompt = formatted_prompt[:50000] # to avoid GPU OOM
|
||||||
|
messages = [{"role":"system","content":SYS_PROMPT},{"role":"user","content":formatted_prompt}]
|
||||||
|
# tell the model to generate
|
||||||
|
input_ids = tokenizer.apply_chat_template(
|
||||||
|
messages,
|
||||||
|
add_generation_prompt=True,
|
||||||
|
return_tensors="pt"
|
||||||
|
).to(model.device)
|
||||||
|
outputs = model.generate(
|
||||||
|
input_ids,
|
||||||
|
max_new_tokens=2048,
|
||||||
|
eos_token_id=terminators,
|
||||||
|
do_sample=True,
|
||||||
|
temperature=0.6,
|
||||||
|
top_p=0.9,
|
||||||
|
)
|
||||||
|
response = outputs[0][input_ids.shape[-1]:]
|
||||||
|
return tokenizer.decode(response, skip_special_tokens=True)
|
||||||
|
|
||||||
|
def get_keywords(sentence):
|
||||||
|
formatted_prompt = format_prompt(sentence)
|
||||||
|
rules = generate(formatted_prompt).split('\n')
|
||||||
|
result = [r.strip() for r in rules if r.strip()]
|
||||||
|
return result
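# A hedged usage sketch (the example sentence and printed keywords are illustrative):
#   keywords = get_keywords("وزارت نفت موظف است گزارش سالانه خود را به مجلس ارائه کند.")
#   print(keywords)   # e.g. ['وزارت نفت', 'گزارش سالانه', 'مجلس']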
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
print('start')
|
||||||
|
start_time = time.time()
|
||||||
|
inputfile = open('./data/main_classes_dataset_03.json', "r", encoding='utf-8')
|
||||||
|
data = json.load(inputfile)
|
||||||
|
inputfile.close()
|
||||||
|
counter = 1
|
||||||
|
for c in tqdm(data):
|
||||||
|
for item in tqdm(data[c]):
|
||||||
|
content = item['content']
|
||||||
|
item['keywords'] = get_keywords(content)
|
||||||
|
print(f"section {counter} ...")
|
||||||
|
counter += 1
|
||||||
|
|
||||||
|
outputfile = open('./data/main_keywords_lama70B_dataset_03.json', "w", encoding='utf-8')
|
||||||
|
outputfile.write(json.dumps(data, ensure_ascii=False, indent = 4))
|
||||||
|
outputfile.close()
|
||||||
|
end_time = time.time()
|
||||||
|
print(f"elapsed time: {end_time-start_time}")
|
||||||
|
print("end")
|
||||||
|
|
||||||
|
exit()
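# A hedged sketch of the structure written to main_keywords_lama70B_dataset_03.json
# (class labels and values are illustrative; each section simply gains a "keywords" list):
#   {
#       "<class label>": [
#           {"content": "...", "keywords": ["وزارت نفت", "گزارش سالانه", "..."]},
#           ...
#       ]
#   }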
|
||||||
|
|
||||||
|
"""
|
||||||
|
system prompt version 2 for test:
|
||||||
|
|
||||||
|
You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text.
|
||||||
|
|
||||||
|
user prompt version 2 for test:
|
||||||
|
|
||||||
|
Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
|
||||||
|
Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".
|
||||||
|
"""
|
236
llama_givechi.py
Normal file
|
@ -0,0 +1,236 @@
|
||||||
|
"""
|
||||||
|
This file works with the Parsivar normalizer.
|
||||||
|
"""
|
||||||
|
from datetime import datetime
|
||||||
|
# from elasticsearch import Elasticsearch
|
||||||
|
from threading import Thread
|
||||||
|
import torch
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
import os.path
|
||||||
|
import os
|
||||||
|
from funcs import write_to_json, read_from_json
|
||||||
|
from normalizer import Normalizer
|
||||||
|
from tokenizer import *
|
||||||
|
_normalizer = Normalizer(date_normalizing_needed=True)
|
||||||
|
address = os.getcwd()
|
||||||
|
# sections_list = read_from_json(address + '/data/clean_sections_11k.json') # Main File
|
||||||
|
# sections_list = read_from_json('../data/clean_sections_11k.json') # Main File
|
||||||
|
# sections_list = read_from_json('../data/simplized_sentences_110_2.json') # Main File
|
||||||
|
# sections_list = read_from_json('./data/main_sections_170k_metadata.json') # Main File
|
||||||
|
|
||||||
|
def read_from_json1(file_address):
# read the data from the JSON file
with open(file_address, 'r', encoding='utf-8') as file:
loaded_data = json.load(file)
return loaded_data
|
||||||
|
|
||||||
|
sections_list = read_from_json1("./data_givechi/main_classes_dataset_03.json") # Main File
|
||||||
|
|
||||||
|
# not_have_two_token_kw = read_from_json('../data/not_have_two_token_kw.json')
|
||||||
|
# not_have_two_token_kw_list = [item3["id"] for item3 in not_have_two_token_kw]
|
||||||
|
not_have_two_token_kw_list = []
|
||||||
|
|
||||||
|
import json
|
||||||
|
os.environ['HF_HOME'] = "/home/admin/HFHOME"
|
||||||
|
from tqdm import tqdm
|
||||||
|
import time
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer, AutoModel
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
||||||
|
|
||||||
|
#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||||
|
#model_id = "PartAI/Dorna2-Llama3.1-8B-Instruct"
|
||||||
|
|
||||||
|
model_id = "meta-llama/Llama-3.1-70B-Instruct"
|
||||||
|
|
||||||
|
# use quantization to lower GPU usage
|
||||||
|
# 4 bit:
|
||||||
|
# bnb_config = BitsAndBytesConfig(
|
||||||
|
# load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
|
||||||
|
# )
|
||||||
|
# 8 bit:
|
||||||
|
# note: the double-quant / quant-type / compute-dtype knobs exist only for 4-bit loading;
# for 8-bit, load_in_8bit=True is the whole configuration
bnb_config = BitsAndBytesConfig(
load_in_8bit=True
)
|
||||||
|
print("Model Loading START:")
|
||||||
|
print(str(datetime.now()))
|
||||||
|
print()
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
model_id,
|
||||||
|
torch_dtype=torch.bfloat16,
|
||||||
|
device_map="auto",
|
||||||
|
quantization_config=bnb_config
|
||||||
|
)
|
||||||
|
terminators = [
|
||||||
|
tokenizer.eos_token_id,
|
||||||
|
tokenizer.convert_tokens_to_ids("<|eot_id|>")
|
||||||
|
]
|
||||||
|
model.generation_config.pad_token_id = tokenizer.eos_token_id #tokenizer.pad_token_id
|
||||||
|
print("Model Loading END:")
|
||||||
|
print(str(datetime.now()))
|
||||||
|
print()
|
||||||
|
|
||||||
|
# if torch.cuda.is_available():
|
||||||
|
# #model_id = "PartAI/Dorna-Llama3-8B-Instruct"
|
||||||
|
# # model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
||||||
|
|
||||||
|
# model_id = "meta-llama/Llama-3.1-70B-Instruct"
|
||||||
|
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
|
||||||
|
# tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||||
|
|
||||||
|
# index_name_i = 'semantic_search-v10'
|
||||||
|
|
||||||
|
# es = Elasticsearch(
|
||||||
|
# "http://127.0.0.1:6900",
|
||||||
|
# # ca_certs="/path/to/http_ca.crt",
|
||||||
|
# basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
|
||||||
|
# )
|
||||||
|
|
||||||
|
counter = 0
|
||||||
|
total = 0
|
||||||
|
remained = 0
|
||||||
|
id = ''
|
||||||
|
keywords_count = 15
|
||||||
|
|
||||||
|
def generateKeywords(law):
|
||||||
|
global remained
|
||||||
|
try:
|
||||||
|
# aim for roughly 15 keywords per 1000 characters, with a floor of 1
# (e.g. a 3000-character law gives int(3.0 * 15) = 45);
# note that only the commented-out Persian prompt below actually uses this value
keywords_count = max(1, int((len(law) / 1000) * 15))
|
||||||
|
|
||||||
|
|
||||||
|
# messages = [{"role": "system", "content": "تو یک وکیل حقوق دان هستی و باید بتوانی متن های قانونی و حقوقی را بدون تغییر اصطلاحات فنی، به صورتی توضیح دهی که افراد غیر حقوق دان، معنای متن را درک کنند. " },
|
||||||
|
# {"role": "user", "content":
|
||||||
|
# '''از "متن" حداقل {} عبارت های کلیدی مهم و پراهمیت را استخراج کن و عبارت های کلیدی را در قالب لیست به زبان فارسی چاپ کن و هر کلید عبارت کلیدی را در یک خط جدید قرار بده و هیچ گونه توضیحی در ابتدا یا انتهای پاسخ، اضافه نکن.
|
||||||
|
# هر عبارت کلیدی دارای یک شماره ترتیبی در ابتدای آن باشد. عبارت های کلیدی، دقیقا در متن موجود باشد. بسیار مهم و ضروری است که طول هر عبارت کلیدی حداقل دو توکن داشته باشد و عبارت کلیدی یک توکنی قابل قبول نیست. تاکید می کنم که هیچ عبارت کلیدی نباید فقط یک توکن داشته باشد. نام سازمان ها و نهادها و اشخاص حقوقی، حتما به عنوان عبارت کلیدی درنظر گرفته شود. هیچ عبارت کلیدی، فعل یا حرف اضافه نباشد و فقط شامل اسم هایی باشد که به هم اضافه شده اند. هیچ عبارت کلیدی نباید با حرف اضافه یا حرف «و» تمام شود. ضروری است که عبارت های کلیدی شامل ماده، بند، تبصره یا تاریخ ها نباشند.'''
|
||||||
|
# .format(keywords_count)
|
||||||
|
# },
|
||||||
|
# {"role": "user", "content":
|
||||||
|
# '''"متن": {}'''.format(text)
|
||||||
|
# },]
|
||||||
|
messages = [{"role": "system", "content": '''You are a lawyer and legal expert who can interpret legal texts well, understand the semantic layers of the text and infer logical relationships in the text.
|
||||||
|
You can understand Persian and English and interpret expressions in a Legal Context. You are also an expert in Deontic Logic and can understand the logical structure of law and rules in text.
|
||||||
|
|
||||||
|
Use uppercase English letters such as A, B, C, etc. to identify all possible propositions. Do not include negative tones such as "not" in the propositions. For example, if the sentence is "It is not bored," you should use "A: bored" to represent it.
|
||||||
|
Next, for each proposition, use the symbol ¬ to represent its negative form. For example, the negative form of proposition A can be expressed as ¬A.
|
||||||
|
Now, carefully analyze the context and find causal relationships between propositions seriously. A causal expression is only established when the context directly and explicitly supports this relationship. Use arrows (→) to indicate causal relationships; for example, "If A, then B", "B if A" and "A causes B" etc. must be represented as A → B.
|
||||||
|
another example, ""if A and not C then B"" should be represented as A ∧ ¬C → B
|
||||||
|
If the causal expression is not strict but only loose, do not state it.
|
||||||
|
|
||||||
|
Extraction Criteria:
|
||||||
|
1- Definition of a proposition:
|
||||||
|
A proposition is a statement which could be true or false. "I'm an animal" is a proposition, but "Ministry of Roads and Urban Development" is not.
|
||||||
|
2- Definition of strict causation:
|
||||||
|
A strictly causes B if and only if:
|
||||||
|
1. Whenever A occurs, B necessarily occurs.
|
||||||
|
2. The occurrence of B is entirely dependent on the occurrence of A.
|
||||||
|
3. There are no other factors or conditions that can independently cause B.
|
||||||
|
4. The relationship between A and B is invariant and holds in all relevant circumstances.
|
||||||
|
|
||||||
|
Additional Instructions:
|
||||||
|
Do Not Add Extra Information: Provide only the extracted rules in the specified format without additional commentary or explanations.
|
||||||
|
|
||||||
|
Formatting Instructions:
|
||||||
|
Language: Output must be in English.
|
||||||
|
output only propositions and causal expressions that are explicitly stated in the following text.''' },
|
||||||
|
{"role": "user", "content": f'''"this is your text to process:{law}'''
|
||||||
|
},]
|
||||||
|
|
||||||
|
|
||||||
|
input_ids = tokenizer.apply_chat_template(
|
||||||
|
messages,
|
||||||
|
add_generation_prompt=True,
|
||||||
|
return_tensors="pt"
|
||||||
|
).to(model.device)
|
||||||
|
|
||||||
|
terminators = [
|
||||||
|
tokenizer.eos_token_id,
|
||||||
|
tokenizer.convert_tokens_to_ids("<|eot_id|>")
|
||||||
|
]
|
||||||
|
model.generation_config.pad_token_id = tokenizer.eos_token_id  # Llama has no dedicated pad token; fall back to EOS as above
|
||||||
|
|
||||||
|
|
||||||
|
outputs = model.generate(
|
||||||
|
input_ids,
|
||||||
|
max_new_tokens=256,
|
||||||
|
eos_token_id=terminators,
|
||||||
|
do_sample=True,
|
||||||
|
temperature=0.6,
|
||||||
|
top_p=0.85,
|
||||||
|
)
|
||||||
|
#lock0.release()
|
||||||
|
response = outputs[0][input_ids.shape[-1]:]
|
||||||
|
keywords = tokenizer.decode(response, skip_special_tokens=True)
|
||||||
|
#lock1.acquire()
|
||||||
|
# resp = es.update(index=index_name_i, id=id, doc={"content_keywords-llama3-str": str(keywords)})
|
||||||
|
|
||||||
|
|
||||||
|
return keywords
|
||||||
|
|
||||||
|
except Exception as inst:
|
||||||
|
print(type(inst)) # the exception type
|
||||||
|
print(inst.args) # arguments stored in .args
|
||||||
|
print("Exception: " + str(inst))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
start_time = time.time()
|
||||||
|
print("start_time: "+str(datetime.now()))
|
||||||
|
|
||||||
|
try:
|
||||||
|
keywords_dict = []
|
||||||
|
|
||||||
|
count = 1
|
||||||
|
|
||||||
|
finall_data = []
|
||||||
|
for q_class, current_section_list in sections_list.items():
|
||||||
|
|
||||||
|
|
||||||
|
for content_item in current_section_list:
|
||||||
|
id = content_item['id']
|
||||||
|
qanon_id = content_item['qanon_id']
|
||||||
|
# if not id in not_have_two_token_kw_list:
|
||||||
|
# continue
|
||||||
|
content = content_item['content']
|
||||||
|
content_len = len(content.split())
|
||||||
|
# skip sections whose content is too long
|
||||||
|
if content_len > 2000:
|
||||||
|
print("too long content " + str(id))
|
||||||
|
continue
|
||||||
|
content = _normalizer.sub_alphabets(content)
|
||||||
|
prompt_result = generateKeywords(content)
|
||||||
|
print("section " + str(count) + "/" + str(len(not_have_two_token_kw_list)) + " prompting ... ")
|
||||||
|
keywords_dict.append({
|
||||||
|
'id':id,
|
||||||
|
'qanon-id':qanon_id,
|
||||||
|
'content':content,
|
||||||
|
'prompt-result':prompt_result
|
||||||
|
})
|
||||||
|
if count ==5:
|
||||||
|
write_to_json(keywords_dict, f"./data_givechi/main_test_{count}.json")
|
||||||
|
# keywords_dict = []
|
||||||
|
count+=1
|
||||||
|
# finall_data.append({'class':q_class, 'sections': keywords_dict})
|
||||||
|
finall_data.append({q_class : keywords_dict})
|
||||||
|
|
||||||
|
write_to_json(finall_data, "./data_givechi/main_classes_dataset_result.json")
|
||||||
|
|
||||||
|
except Exception as inst:
|
||||||
|
print(type(inst)) # the exception type
|
||||||
|
print(inst.args) # arguments stored in .args
|
||||||
|
|
||||||
|
end_time = time.time()
|
||||||
|
print("end_time: "+ str(datetime.now()))
|
||||||
|
operation_time = (int(end_time-start_time)/60)/60
|
||||||
|
print(f"elapsed time: {operation_time} hours")
|
||||||
|
print(f"Finished!!!")
|
1370
normalizer.py
Normal file
File diff suppressed because it is too large
74
resource/normalizer/Dic1_new.txt
Normal file
|
@ -0,0 +1,74 @@
|
||||||
|
بیخبر بیخبر
|
||||||
|
بیتوجهی بیتوجهی
|
||||||
|
بیطرفانه بیطرفانه
|
||||||
|
گفتوگو گفتوگو
|
||||||
|
آنها آنها
|
||||||
|
پیشبرد پیشبرد
|
||||||
|
روانشناختی روانشناختی
|
||||||
|
میباشد میباشد
|
||||||
|
لذتبخش لذتبخش
|
||||||
|
میدادند میدادند
|
||||||
|
مینویسد مینویسد
|
||||||
|
میبخشد میبخشد
|
||||||
|
بیقاعده بیقاعده
|
||||||
|
میباشند میباشند
|
||||||
|
موافقتنامه موافقتنامه
|
||||||
|
تخمگذار تخمگذار
|
||||||
|
پایینترین پایینترین
|
||||||
|
گرمکن گرمکن
|
||||||
|
پیشبینی پیشبینی
|
||||||
|
برونگرا برونگرا
|
||||||
|
میدهد میدهد
|
||||||
|
فیلمبرداری فیلمبرداری
|
||||||
|
آنسوی آنسوی
|
||||||
|
خدمتدهی خدمتدهی
|
||||||
|
اینگونه اینگونه
|
||||||
|
کمکرسانی کمکرسانی
|
||||||
|
کلانشهر کلانشهر
|
||||||
|
سپردهگذار سپردهگذار
|
||||||
|
بنیانگذار بنیانگذار
|
||||||
|
رضایتبخش رضایتبخش
|
||||||
|
اصلاحطلبان اصلاحطلبان
|
||||||
|
استخوانبندی استخوانبندی
|
||||||
|
درونگرا درونگرا
|
||||||
|
میگردد میگردد
|
||||||
|
اصلاحطلب اصلاحطلب
|
||||||
|
میتوان میتوان
|
||||||
|
عملکرد عملکرد
|
||||||
|
میروم میروم
|
||||||
|
بزرگنمایی بزرگنمایی
|
||||||
|
همجنس همجنس
|
||||||
|
همانطور همانطور
|
||||||
|
بیشترین بیشترین
|
||||||
|
انسانگرایی انسانگرایی
|
||||||
|
نمیباشند نمیباشند
|
||||||
|
جانبداری جانبداری
|
||||||
|
نمیتوانی نمیتوانی
|
||||||
|
قانونگذار قانونگذار
|
||||||
|
میشدند میشدند
|
||||||
|
تفاهمنامه تفاهمنامه
|
||||||
|
آسیبپذیر آسیبپذیر
|
||||||
|
برونگرایی برونگرایی
|
||||||
|
جفتگیری جفتگیری
|
||||||
|
گرانبها گرانبها
|
||||||
|
میشوند میشوند
|
||||||
|
کلاهبرداری کلاهبرداری
|
||||||
|
جهتیابی جهتیابی
|
||||||
|
چشمپوشی چشمپوشی
|
||||||
|
بنیانگذاران بنیانگذاران
|
||||||
|
میکند میکند
|
||||||
|
الهامبخش الهامبخش
|
||||||
|
وقتگیر وقتگیر
|
||||||
|
پسلرزه پسلرزه
|
||||||
|
میکنند میکنند
|
||||||
|
میتواند میتواند
|
||||||
|
آرامبخش آرامبخش
|
||||||
|
بینام بینام
|
||||||
|
غربزدگی غربزدگی
|
||||||
|
بیتفاوت بیتفاوت
|
||||||
|
بیثباتی بیثباتی
|
||||||
|
پاسخگویی پاسخگویی
|
||||||
|
میگیرد میگیرد
|
||||||
|
جمعبندی جمعبندی
|
||||||
|
میشود میشود
|
||||||
|
میکنیم میکنیم
|
112
resource/normalizer/Dic2_new.txt
Normal file
|
@ -0,0 +1,112 @@
|
||||||
|
مهماننوازی مهماننوازی
|
||||||
|
صلیالله صلیالله
|
||||||
|
موافقتنامه موافقتنامه
|
||||||
|
اعتراضآمیز اعتراضآمیز
|
||||||
|
رییسجمهور رییسجمهور
|
||||||
|
چشمپوشی چشمپوشی
|
||||||
|
هیئتعلمی هیئتعلمی
|
||||||
|
الزامآور الزامآور
|
||||||
|
بیمهنامه بیمهنامه
|
||||||
|
آییننامه آییننامه
|
||||||
|
بتنریزی بتنریزی
|
||||||
|
تشییعجنازه تشییعجنازه
|
||||||
|
تامینکنندگان تامینکنندگان
|
||||||
|
پرسشنامه پرسشنامه
|
||||||
|
تحتالشعاع تحتالشعاع
|
||||||
|
شگفتانگیز شگفتانگیز
|
||||||
|
بزرگنمایی بزرگنمایی
|
||||||
|
نیمههادی نیمههادی
|
||||||
|
قابلکنترل قابلکنترل
|
||||||
|
روانپزشکی روانپزشکی
|
||||||
|
ضربالمثل ضربالمثل
|
||||||
|
اضافهکاری اضافهکاری
|
||||||
|
اختلافنظر اختلافنظر
|
||||||
|
بینالملل بینالملل
|
||||||
|
یکطرفه یکطرفه
|
||||||
|
موجشکن موجشکن
|
||||||
|
عزتنفس عزتنفس
|
||||||
|
بیسیم بیسیم
|
||||||
|
شیبدار شیبدار
|
||||||
|
دستیابی دستیابی
|
||||||
|
روانشناختی روانشناختی
|
||||||
|
عقبنشینی عقبنشینی
|
||||||
|
بهطور بهطور
|
||||||
|
خطچین خطچین
|
||||||
|
ادراکشده ادراکشده
|
||||||
|
خزانهداری خزانهداری
|
||||||
|
شیمیدرمانی شیمیدرمانی
|
||||||
|
آنسوی آنسوی
|
||||||
|
نقطهچین نقطهچین
|
||||||
|
منحصربهفرد منحصربهفرد
|
||||||
|
درحالتوسعه درحالتوسعه
|
||||||
|
رضایتبخش رضایتبخش
|
||||||
|
قرضالحسنه قرضالحسنه
|
||||||
|
هرجومرج هرجومرج
|
||||||
|
سیبزمینی سیبزمینی
|
||||||
|
میلیگرم میلیگرم
|
||||||
|
نخستوزیر نخستوزیر
|
||||||
|
تعیینکنندهای تعیینکنندهای
|
||||||
|
طاقتفرسا طاقتفرسا
|
||||||
|
قابلمشاهده قابلمشاهده
|
||||||
|
بهوسیله بهوسیله
|
||||||
|
قابلدستیابی قابلدستیابی
|
||||||
|
الهامبخش الهامبخش
|
||||||
|
پیدرپی پیدرپی
|
||||||
|
سرمایهداری سرمایهداری
|
||||||
|
لذتبخش لذتبخش
|
||||||
|
تخمگذار تخمگذار
|
||||||
|
گرمکن گرمکن
|
||||||
|
قابلتوجهی قابلتوجهی
|
||||||
|
فیلمبرداری فیلمبرداری
|
||||||
|
خدمتدهی خدمتدهی
|
||||||
|
معنیدار معنیدار
|
||||||
|
کلانشهری کلانشهری
|
||||||
|
گواهینامه گواهینامه
|
||||||
|
همجنس همجنس
|
||||||
|
همانطور همانطور
|
||||||
|
سیستمعامل سیستمعامل
|
||||||
|
حملونقل حملونقل
|
||||||
|
تفاهمنامه تفاهمنامه
|
||||||
|
بینالمللی بینالمللی
|
||||||
|
کلاهبرداری کلاهبرداری
|
||||||
|
نرمافزار نرمافزار
|
||||||
|
مضافالیه مضافالیه
|
||||||
|
قطعنامهای قطعنامهای
|
||||||
|
پاسخگویی پاسخگویی
|
||||||
|
عکسبرداری عکسبرداری
|
||||||
|
پسلرزه پسلرزه
|
||||||
|
خردهفروشی خردهفروشی
|
||||||
|
حقوقبشر حقوقبشر
|
||||||
|
تحلیلگران تحلیلگران
|
||||||
|
اینگونه اینگونه
|
||||||
|
صرفهجویی صرفهجویی
|
||||||
|
علیالخصوص علیالخصوص
|
||||||
|
کلانشهرها کلانشهرها
|
||||||
|
حاصلضرب حاصلضرب
|
||||||
|
اطلاعرسانی اطلاعرسانی
|
||||||
|
دندانپزشکی دندانپزشکی
|
||||||
|
پیشبرد پیشبرد
|
||||||
|
ایدهال ایدهال
|
||||||
|
هیچگاه هیچگاه
|
||||||
|
صنایعدستی صنایعدستی
|
||||||
|
سانتیمتر سانتیمتر
|
||||||
|
پیشبینی پیشبینی
|
||||||
|
خلیجفارس خلیجفارس
|
||||||
|
تاریخنگاری تاریخنگاری
|
||||||
|
هیچگونه هیچگونه
|
||||||
|
راهاندازی راهاندازی
|
||||||
|
جستوجوی جستوجوی
|
||||||
|
حاشیهنشینی حاشیهنشینی
|
||||||
|
رنگآمیزی رنگآمیزی
|
||||||
|
جمعآوری جمعآوری
|
||||||
|
وقتگیر وقتگیر
|
||||||
|
آرامبخش آرامبخش
|
||||||
|
غربزدگی غربزدگی
|
||||||
|
کلانشهر کلانشهر
|
||||||
|
نرمافزاری نرمافزاری
|
||||||
|
بدینوسیله بدینوسیله
|
||||||
|
جمعبندی جمعبندی
|
||||||
|
گفتوگو گفتوگو
|
||||||
|
حملونقل حملونقل
|
||||||
|
آیتالله آیتالله
|
||||||
|
حجتالاسلام حجتالاسلام
|
5
resource/normalizer/Dic3_new.txt
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
حملونقل حملونقل
|
||||||
|
حجتالاسلاموالمسلمین حجتالاسلاموالمسلمین
|
||||||
|
آیتاللهالعظمی آیتاللهالعظمی
|
||||||
|
گفتوگو گفتوگو
|
||||||
|
حملونقل حملونقل
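Each line of these Dic*_new.txt files appears to hold a whitespace-separated pair mapping a surface spelling to its normalized form (the two columns look identical here because the zero-width non-joiner does not display). A minimal loader sketch under that assumption; the function name and path are illustrative, not part of the repo:

def load_normalizer_dict(path):
    # each non-empty line: "<variant> <normalized form>"
    mapping = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) == 2:
                mapping[parts[0]] = parts[1]
    return mapping

# e.g. mapping = load_normalizer_dict('resource/normalizer/Dic1_new.txt')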
|
BIN
resource/tokenizer/TokenMerger.pckl
Normal file
Binary file not shown.
BIN
resource/tokenizer/enDict
Normal file
Binary file not shown.
BIN
resource/tokenizer/faDict
Normal file
Binary file not shown.
59
tokenizer.py
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class Tokenizer():
|
||||||
|
def __init__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def tokenize_words(self, doc_string):
|
||||||
|
token_list = doc_string.strip().split()
|
||||||
|
#token_list = [x.strip("\u200c").strip('\u200f') for x in token_list if len(x.strip("\u200c")) != 0]
|
||||||
|
new_token_list = []
for token in token_list:
# strip zero-width non-joiner and right-to-left mark characters; skip tokens made up only of them
new_token = token.strip("\u200c").strip("\u200f")
if len(new_token) != 0:
new_token_list.append(new_token)
return new_token_list
|
||||||
|
|
||||||
|
def tokenize_sentences(self, doc_string):
|
||||||
|
#finding the numbers
|
||||||
|
pattern = r"[-+]?\d*\.\d+|\d+"
|
||||||
|
nums_list = re.findall(pattern, doc_string)
|
||||||
|
doc_string = re.sub(pattern, 'floatingpointnumber', doc_string)
|
||||||
|
|
||||||
|
pattern = r'([!\.\?؟]+)[\n]*'
|
||||||
|
tmp = re.findall(pattern, doc_string)
|
||||||
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
||||||
|
|
||||||
|
pattern = r':\n'
|
||||||
|
tmp = re.findall(pattern, doc_string)
|
||||||
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
||||||
|
|
||||||
|
pattern = r';\n'
|
||||||
|
tmp = re.findall(pattern, doc_string)
|
||||||
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
||||||
|
|
||||||
|
pattern = r'؛\n'
|
||||||
|
tmp = re.findall(pattern, doc_string)
|
||||||
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
||||||
|
|
||||||
|
pattern = r'[\n]+'
|
||||||
|
doc_string = re.sub(pattern, self.add_tab, doc_string)
|
||||||
|
|
||||||
|
for number in nums_list:
|
||||||
|
pattern = 'floatingpointnumber'
|
||||||
|
doc_string = re.sub(pattern, number, doc_string, 1)
|
||||||
|
|
||||||
|
doc_string = doc_string.split('\t\t')
|
||||||
|
doc_string = [x for x in doc_string if len(x) > 0]
|
||||||
|
return doc_string
|
||||||
|
|
||||||
|
def add_tab(self, mystring):
|
||||||
|
mystring = mystring.group()  # the substring matched by the regex
mystring = mystring.strip(' ')  # omit the whitespace around the punctuation
mystring = mystring.strip('\n')  # omit the newlines around the punctuation
mystring = " " + mystring + "\t\t"  # add a space before the punctuation and a double tab after it as a sentence separator
|
||||||
|
return mystring
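# A hedged usage sketch of the Tokenizer above (the sample text is illustrative):
if __name__ == "__main__":
    _tok = Tokenizer()
    sample = "ماده 1 این قانون از تاریخ 1400 اجرا می شود. آیا استثنایی دارد؟ خیر!"
    print(_tok.tokenize_sentences(sample))   # sentences split at ., ؟ and ! boundaries
    print(_tok.tokenize_words(sample))       # whitespace tokens with ZWNJ/RLM stripped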
|