Compare commits
1 Commit

Author | SHA1 | Date
---|---|---
| a431551d3f |
67 ner_dataset/REF_finder.py Normal file
@@ -0,0 +1,67 @@
# In the name of God

import json

from elastic_helper import ElasticHelper

# Load the reference dataset and the zipped Elasticsearch section export.
Read = open(".\\data\\DATASET140402_no_arefـoutput.json", "r", encoding='utf8')
RefList = json.loads(Read.read())
path = ".\\data\\mj_qa_section-v02.zip"
eh_obj = ElasticHelper()
sections = eh_obj.iterateJsonFile(path, True)

all_ref_list = []
find_refs_list = []
not_find_refs_list = []

# Collect (id, content) pairs for every section in the export.
for index, item in enumerate(sections):
    ref_id = item['id']
    source = item['source']
    content = source['content']
    all_ref_list.append([ref_id, content.strip()])

n = 1
for item in RefList:
    refID2, Content2, ner_list = item['id'], item['content'].strip(), item['ner']
    x = 0
    for refID1, Content in all_ref_list:
        if len(ner_list) == 0:
            x = 1
            continue
        else:
            if Content2 == Content and x == 0:
                find_refs_list.append([refID1, refID2, Content])
                print(f"REF ID {refID2} Found ! ... ")
                x = 1

    if x == 0:
        not_find_refs_list.append(refID2)

    print(f"{n} OF {len(RefList)} searched ...")
    n += 1

with open("foundfind_refs_list.json", "w", encoding="utf8") as f:
    json.dump(find_refs_list, f, indent=4, ensure_ascii=False)

txt = ''
for id_ in not_find_refs_list:
    txt += f"{id_}\n"

with open("not_found_ids.txt", "w", encoding="utf8") as file:
    # write the data to the file
    file.write(txt)

print("finish!")
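
For context (not part of the commit): REF_finder.py reads item['id'], item['content'], and item['ner'] from each dataset record, so the JSON it loads is presumably a list of objects shaped roughly as in the sketch below; the concrete values are purely illustrative.

    # Hypothetical record shape inferred from the fields REF_finder.py accesses.
    sample_ref = {
        "id": 12345,       # dataset reference id (illustrative value)
        "content": "...",  # Persian section text compared against the export
        "ner": [],         # an empty NER list makes the script set x = 1 and skip matching
    }
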
BIN ner_dataset/__pycache__/elastic_helper.cpython-313.pyc Normal file
Binary file not shown.
BIN ner_dataset/data/mj_qa_section-v02.zip Normal file
Binary file not shown.
677 ner_dataset/elastic_helper.py Normal file
@@ -0,0 +1,677 @@
import zipfile
import sys
import os
import json
from time import sleep
from elasticsearch import Elasticsearch, helpers


class ElasticHelper():

    counter = 0
    total = 0
    id = ""
    path_mappings = os.getcwd() + '/repo/_other/'

    # def __init__(self, es_url="http://127.0.0.1:6900", es_pass="", es_user="elastic", path_mappings = ""):
    #
    #     if path_mappings :
    #         self.path_mappings = path_mappings
    #
    #     if es_pass == '' :
    #         self.es = Elasticsearch(es_url)
    #     else:
    #         self.es = Elasticsearch(
    #             es_url,
    #             http_auth=(es_user, es_pass),
    #         )
    #
    #     print(es_url)
    #     print(self.es)
    #
    #     self.success_connect = False
    #     for a in range(0, 10):
    #         try:
    #             if not self.es.ping():
    #                 print('elastic not ping, sleep 30 s : ', a)
    #                 sleep(5)
    #                 continue
    #             else:
    #                 self.success_connect = True
    #                 break
    #         except Exception as e:
    #             break
    #     if not self.success_connect:
    #         print('******', 'not access to elastic service')
    #         return
    #
    #     self.counter = 0
    #     self.total = 0
    #     self.id = ""

    def get_doctument(self, index_name, id):
        res = self.es.get(index=index_name, id=id)
        return res

    def exist_doctument(self, index_name, id):
        res = self.es.exists(index=index_name, id=id)
        return res

    def update_index_doc(self, is_update_state, index_name_o, eid, data):
        if is_update_state:
            resp = self.es.update(index=index_name_o, id=eid, doc=data)
            # resp = self.es.update(index=index_name_o, id=eid, body={'doc': data})
        else:
            resp = self.es.index(index=index_name_o, id=eid, document=data)
        return resp

    def exportToJsonForAI(self, path_back, index_name, out_name='', body={}, fields=[]):
        print('*' * 50, ' start backup -->', index_name)
        self.counter = 0
        sid = None

        out = out_name
        if out_name == '':
            out = index_name

        fout = open(path_back + "/" + out + '.json', 'a+', encoding='utf-8')

        s_res = self.es.search(
            index=index_name,
            scroll='5m',
            size=1000,
            body=body
        )
        self.total = s_res["hits"]["total"]['value']

        print('start index = %s' % index_name)
        print('total = %d' % self.total)

        sid = s_res['_scroll_id']
        scroll_size = len(s_res['hits']['hits'])
        file_count = 1
        out_json = []
        while scroll_size > 0:
            "Scrolling..."
            self.counter += scroll_size
            print("progress -> %.2f %%" % ((self.counter / self.total) * 100))
            #############################
            for item in s_res['hits']['hits']:

                if fields:
                    item2 = {}
                    item2['id'] = item['_id']
                    for kf in fields:
                        # print(kf)
                        if kf in item['_source']:
                            # print(item['_source'][kf])
                            item2[kf] = item['_source'][kf]
                            # exit()
                else:
                    item2 = item

                out_json.append(item2)

            s_res = self.es.scroll(scroll_id=sid, scroll='2m', request_timeout=100000)
            sid = s_res['_scroll_id']
            scroll_size = len(s_res['hits']['hits'])

        sid = None
        text = json.dumps(out_json, ensure_ascii=False)
        fout.write(text)

        ##############################

    def backupIndexToZipfile(self, path_back, index_name, out_name='', body={}, byzip=True, fields=[], noFields=[]):
        print('*' * 50, ' start backup -->', index_name)
        self.counter = 0
        sid = None

        out = out_name
        if out_name == '':
            out = index_name

        if body == {}:
            s_res = self.es.search(
                index=index_name,
                scroll='5m',
                size=1000
            )
        else:
            s_res = self.es.search(
                index=index_name,
                scroll='5m',
                size=1000,
                body=body
            )

        self.total = s_res["hits"]["total"]['value']
        if self.total == 0:
            print('total index_name by query = %d' % self.total)
            return False

        if byzip:
            fout = zipfile.ZipFile(path_back + "/" + out + '.zip', 'w')
        else:
            fout = open(path_back + "/" + out + '.json', 'a+', encoding='utf-8')

        print('start index = %s' % index_name)
        print('total = %d' % self.total)

        sid = s_res['_scroll_id']
        scroll_size = len(s_res['hits']['hits'])
        file_count = 1
        while scroll_size > 0:
            "Scrolling..."
            self.counter += scroll_size
            print("progress -> %.2f %%" % ((self.counter / self.total) * 100))
            #############################
            out_json = []
            for item in s_res['hits']['hits']:
                if fields:
                    item2 = {}
                    item2['id'] = item['_id']
                    item2['_source'] = {}
                    for kf in fields:
                        if kf in item['_source']:
                            item2['_source'][kf] = item['_source'][kf]
                else:
                    item2 = item

                if noFields:
                    for kf in noFields:
                        if kf in item2['_source']:
                            del item2['_source'][kf]

                out_json.append(item2)

            text = json.dumps(out_json, ensure_ascii=False)
            out_json = []
            if byzip:
                filename = out + str(file_count) + '.json'
                file_count += 1
                fout.writestr(filename, text.encode('utf-8'), zipfile.ZIP_DEFLATED)
            else:
                fout.write(text)

            ##############################
            s_res = self.es.scroll(scroll_id=sid, scroll='2m', request_timeout=100000)
            sid = s_res['_scroll_id']
            scroll_size = len(s_res['hits']['hits'])
        sid = None
        fout.close()

    def restorFileToElastic(self, path_back, index_name, app_key='', queryDelete=True, map_name=''):
        if not os.path.exists(path_back):
            print(' **** error *** path not exist: ', path_back)
            return False

        file_path = path_back + '/' + index_name + '.zip'
        if not os.path.exists(file_path):
            return False

        if queryDelete:
            # if the index already exists, ask the user whether to delete it
            if self.deleteIndex(index_name):
                self.createIndex(index_name, app_key, map_name)
                self.zipFileToElastic(file_path, index_name)
        else:  # if it already exists, it is skipped and nothing is done
            self.createIndex(index_name, app_key, map_name)
            self.zipFileToElastic(file_path, index_name)

    def restorFileToElastic2(self, path_file, index_name, app_key='', queryDelete=True, map_name=''):
        if not os.path.exists(path_file):
            print(' **** error *** path not exist: ', path_file)
            return False

        file_path = path_file
        if not os.path.exists(file_path):
            return False

        if queryDelete:
            # if the index already exists, ask the user whether to delete it
            if self.deleteIndex(index_name):
                self.createIndex(index_name, app_key, map_name)
                self.zipFileToElastic(file_path, index_name)
        else:  # if it already exists, it is skipped and nothing is done
            self.createIndex(index_name, app_key, map_name)
            self.zipFileToElastic(file_path, index_name)

    def renameElasticIndex(self, index_name_i, index_name_o, app_key='', map_name=''):

        if self.createIndex(index_name_o, app_key, map_name):
            res = self.es.reindex(
                body={
                    "source": {"index": index_name_i},
                    "dest": {"index": index_name_o}
                },
                wait_for_completion=False)

            print(type(res))
            print(res)

            taskid = res["task"] if res["task"] else ""
            # tasks = client.TasksClient(self.es)
            tasks = self.es.tasks
            while True:
                res = tasks.get(task_id=taskid)
                if res["completed"]:
                    break

                # print(res["task"])
                print('----', index_name_o, ' imported : ', res["task"]["status"]["total"], ' / ', res["task"]["status"]["created"])
                sleep(1)
            print('----', index_name_o, ' completed')

    def deleteIndex(self, index_name):
        if not self.es.indices.exists(index=index_name):
            print(' ' * 10, " for delete NOT exist index :", index_name)
            return True

        question = 'Is DELETE elastic index (' + index_name + ') ? '
        if self.query_yes_no(question):
            self.es.indices.delete(index=index_name)
            print('%' * 10, " Finish DELETE index :", index_name)
            return True
        else:
            return False

    def query_yes_no(self, question, default="no"):
        valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
        if default is None:
            prompt = " [y/n] "
        elif default == "yes":
            prompt = " [Y/n] "
        elif default == "no":
            prompt = " [y/N] "
        else:
            raise ValueError("invalid default answer: '%s'" % default)

        while True:
            print('%' * 10, ' question ', '%' * 10, '\n')
            sys.stdout.write(question + prompt)
            choice = input().lower()
            if default is not None and choice == "":
                return valid[default]
            elif choice in valid:
                return valid[choice]
            else:
                sys.stdout.write("Please enter one of the following: 'yes' or 'no' (or 'y' or 'n').\n")

    def createIndexIfNotExist(self, index_name_o, mapping_o=""):
        try:
            if not self.es.indices.exists(index=index_name_o):
                response = self.es.indices.create(index=index_name_o, body=mapping_o)
                # print out the response:
                print("create index response:", response)
        except:
            print("....... index exist ! ... not created")

    def createIndex(self, index_name, app_key='', map_name=''):

        path_base = self.path_mappings
        path_mapping1 = path_base + 'general/'
        if app_key == '':
            app_key = 'tavasi'
        path_mapping2 = path_base + app_key + '/'

        if map_name == '':
            map_name = index_name

        if self.es.indices.exists(index=index_name):
            print("============== exist index :", index_name)
            return True

        if map_name == 'mj_rg_section' or map_name == 'semantic_search':
            map_name = 'mj_qa_section'
        elif map_name[-3:] == '_ai':
            # strip the trailing '_ai' suffix to fall back to the base mapping name
            map_name = map_name[:-3]
        print(map_name)

        mapping_file_path = path_mapping1 + map_name + '.json'
        print("mapping_file_path : ", mapping_file_path)
        if not os.path.isfile(mapping_file_path):
            mapping_file_path = path_mapping2 + map_name + '.json'

        print("mapping_file_path : ", mapping_file_path)

        # Create Index With Mapping
        if os.path.isfile(mapping_file_path):
            mapping_file = open(mapping_file_path, 'r', encoding='utf-8')
            mapping_file_read = mapping_file.read()
            mapping_data = json.loads(mapping_file_read)
            mapping_file.close()
            if self.es.indices.exists(index=index_name):
                print("============== exist index :", index_name)
            else:
                self.es.indices.create(index=index_name, body=mapping_data)
            return True
        else:
            print('*** error not find mapping file elastic : *******', mapping_file_path)
            return False

    def updateBulkList(self, listData, index_name):
        chunk_size = 1000
        raise_on_error = False
        raise_on_exception = False
        stats_only = True
        yield_ok = False

        actions = []
        for item in listData:
            actions.append({
                "_op_type": "update",
                "_index": index_name,
                "_id": item['_id'],
                "doc": item['_source']
            })
        helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok)

    def importBulkList(self, listData, index_name):
        chunk_size = 100000
        raise_on_error = False
        raise_on_exception = False
        stats_only = True
        yield_ok = False

        for item in listData:
            actions = [{
                "_op_type": "index",
                "_index": index_name,
                "_id": item['_id'],
                "_source": item['_source']
            }]
            helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok)

    def importJsonDataToElastic(self, jsonData, index_name, fields=[]):
        chunk_size = 1000
        raise_on_error = False
        raise_on_exception = False
        stats_only = True
        yield_ok = False

        actions = []

        for item in jsonData:
            id = item['_id'] if item['_id'] else item['id']
            source = item['_source']
            if fields:
                source = {}
                for col in fields:
                    if col in item['_source']:
                        source[col] = item['_source'][col]

            actions.append({
                "_op_type": "index",
                "_index": index_name,
                "_id": id,
                "_source": source
            })
        helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok)

    def fileToElastic(self, file_path, index_name, limit_pack=-1, fields=[]):
        if not os.path.exists(file_path):
            print("file zip:", file_path, " not exist")
            return
        print("index:", index_name, '=>', file_path)
        self.counter = 0
        with open(file_path) as file:
            data = json.loads(file.read())
            self.importJsonDataToElastic(data, index_name, fields)

        self.es.indices.refresh(index=index_name)
        print(self.es.cat.count(index=index_name, format="json"))

    def zipFileToElastic(self, file_path, index_name, limit_pack=-1, fields=[]):
        if not os.path.exists(file_path):
            print("file zip:", file_path, " not exist for import to elastic : ", index_name)
            return

        fileNo = 0
        with zipfile.ZipFile(file_path, 'r') as zObject:
            fileNo += 1
            print("=" * 10, " zip fileNo: ", fileNo, " - ( ", index_name, " ) | File Numbers:", len(zObject.namelist()), "=" * 10)

            packNo = 0
            self.counter = 0
            for filename in zObject.namelist():
                packNo += 1
                if limit_pack != -1:
                    if packNo > limit_pack:
                        print('limit_data ', index_name, ' ', limit_pack)
                        break

                print("index:", index_name, '=>', filename)
                with zObject.open(filename) as file:
                    data = json.loads(file.read())
                    self.importJsonDataToElastic(data, index_name, fields)

        self.es.indices.refresh(index=index_name)
        print(self.es.cat.count(index=index_name, format="json"))
        print(" END Of Import to elastic ", index_name, "\n")

    def iterateJsonFile(self, file_path, isZip=True, limit_pack=-1):
        if not os.path.exists(file_path):
            print("file zip:", file_path, " not exist iterateJsonFile ")
            return

        if isZip:
            fileNo = 0
            with zipfile.ZipFile(file_path, 'r') as zObject:
                fileNo += 1
                print("=" * 10, " zip fileNo: ", fileNo, " iterateJsonFile - | File Numbers:", len(zObject.namelist()), "=" * 10)

                packNo = 0
                self.counter = 0
                for filename in zObject.namelist():
                    packNo += 1
                    if limit_pack != -1:
                        if packNo > limit_pack:
                            print('limit_data iterateJsonFile ', limit_pack)
                            break

                    print("index iterateJsonFile :", '=>', filename)
                    with zObject.open(filename) as file:
                        data = json.loads(file.read())
                        # Yield each entry
                        # yield data
                        yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in data)
        else:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.loads(file.read())
                # Yield each entry
                # yield from (hit for hit in data)
                # return data
                yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in data)

    def es_iterate_all_documents(self, index, body="", pagesize=250, scroll_timeout="25m", **kwargs):
        """
        Helper to iterate ALL values from a single index
        Yields all the documents.
        """
        is_first = True
        while True:
            # Scroll next
            if is_first:  # Initialize scroll
                # result = self.es.search(index=index, scroll="2m", **kwargs, body={
                #     "size": pagesize
                # })
                if body:
                    result = self.es.search(
                        index=index,
                        scroll=scroll_timeout,
                        **kwargs,
                        size=pagesize,
                        body=body
                    )
                else:
                    result = self.es.search(
                        index=index,
                        scroll=scroll_timeout,
                        **kwargs,
                        size=pagesize
                    )

                self.total = result["hits"]["total"]["value"]
                if self.total > 0:
                    print("total = %d" % self.total)
                is_first = False
            else:
                # result = es.scroll(body={
                #     "scroll_id": scroll_id,
                #     "scroll": scroll_timeout
                # })
                result = self.es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)

            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            self.counter += len(hits)
            if self.total > 0:
                print("progress -> %.2f %%" % ((self.counter / self.total) * 100))
            # Stop after no more docs
            if not hits:
                break
            # Yield each entry
            yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)

    def moveCustomFileds(self, index_name_i, index_name_o, fields=[], renameFileds={}):
        try:
            body = {}
            list = []
            try:
                list = self.es_iterate_all_documents(index_name_i)
            except Exception as e:
                print(e)

            count = 0
            for mentry in list:
                count += 1

                entry = mentry["source"]
                id = mentry["id"]
                # print(id)
                eid = id

                if (count % 100) == 0:
                    print("%s -> %.2f " % (id, (count / self.total) if self.total > 0 else 0))

                data_filled = False
                data = {}
                for col in fields:

                    if '.' in col:
                        cols = col.split('.')
                        subsource = entry
                        for sub in cols:
                            dCol = subsource.get(sub, None)
                            if dCol:
                                subsource = dCol
                            else:
                                break
                    else:
                        dCol = entry.get(col, None)

                    if dCol is None:
                        continue

                    if col in renameFileds:
                        data[renameFileds[col]] = dCol
                    else:
                        data[col] = dCol

                    data_filled = True

                if not data_filled:
                    continue

                try:
                    resp = self.update_index_doc(True, index_name_o, eid, data)
                except Exception as e:
                    print(e)
                    # save_error(id, e)

        except Exception as e:
            # print("1111")
            print(e)
            # save_error(id, e)

    def mappingIndex(self, index_name_i):
        # Mappings can only be changed through Kibana;
        # it cannot be done from Python.
        # Instead, create a new index with the desired mapping and reindex into it.
        pass

    def updateByQueryIndex(self, index_name_i, body):
        ## sample
        # body = {
        #     "script": {
        #         "inline": "ctx._source.Device='Test'",
        #         "lang": "painless"
        #     },
        #     "query": {
        #         "match": {
        #             "Device": "Boiler"
        #         }
        #     }
        # }
        try:
            self.es.update_by_query(body=body, index=index_name_i)

        except Exception as e:
            print(e)
            # save_error(id, e)

    def deleteByQueryIndex(self, index_name_i, body):
        ## sample
        # body = {
        #     "query": {
        #         "match": {
        #             "Device": "Boiler"
        #         }
        #     }
        # }
        try:
            self.es.delete_by_query(index=index_name_i, body=body)

        except Exception as e:
            print(e)
            # save_error(id, e)

    def delete_by_ids(self, index_name_i, ids):
        try:
            # ids = ['test1', 'test2', 'test3']

            query = {"query": {"terms": {"_id": ids}}}
            res = self.es.delete_by_query(index=index_name_i, body=query)
            print(res)

        except Exception as e:
            print(e)
            # save_error(id, e)
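
Usage note (not part of the commit): iterateJsonFile is the only ElasticHelper method the scripts in this compare actually call, and it never touches self.es, so it works even though the constructor that connects to Elasticsearch is commented out. A minimal sketch of streaming records out of the bundled zip export, assuming the zip sits at the path used by REF_finder.py:

    from elastic_helper import ElasticHelper

    eh = ElasticHelper()
    # Each yielded item has the shape {"id": ..., "source": {...}}.
    for item in eh.iterateJsonFile(".\\data\\mj_qa_section-v02.zip", True):
        print(item["id"], len(item["source"]["content"]))
        break  # peek at the first record only in this sketch
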
334 ner_dataset/find_law.py Normal file
@@ -0,0 +1,334 @@
# In the name of God


from elastic_helper import ElasticHelper
from thefuzz import fuzz
import json


Read = open('.\\data\\DATASET140402_no_arefـoutput.json', "r", encoding='utf8')
RefList = json.loads(Read.read())
path = ".\\data\\mj_qa_section-v02.zip"
eh_obj = ElasticHelper()
sections = eh_obj.iterateJsonFile(path, True)

# The ids file apparently stores one id on every other line, so read the alternating lines.
no_found_id = []
txt_file = open(".\\no_find_txt.txt", "r", encoding="utf8")
n = 0
for line in txt_file:
    if n != 0:
        no_found_id.append(int(line.strip()))
        n = 0
        continue
    n = 1

all_law_dict = []
for index, item in enumerate(sections):
    ref_id = item['id']
    source = item['source']
    content = source['content'].strip()
    all_law_dict.append({"id": ref_id, "caption": content, "approve_date": source['ts_date']})


def law_dict_saver(law_id, start_token_index, end_token_index, found_law_list, law_captions, matched_string, original_string, multi_flag):

    dict = {
        "law_id": law_id,
        "start_token_index": start_token_index,
        "end_token_index": end_token_index,
        "found_law_list": found_law_list,
        "law_captions": law_captions,
        "matched_string": matched_string,
        "original_string": original_string,
        "multi_flag": multi_flag
    }
    return dict


def remove_latest_added_token(text):
    temp = text.strip().split(' ')
    temp.pop()
    text = ''
    for token in temp:
        text = text + ' ' + token

    return text.strip()


def law_recognizer(text, law_dict):

    i = 0

    normalized_content = text
    text_token_list = normalized_content.strip().split()
    matched_token_index_list = []

    # collect possible law titles in the text based on the keyword "قانون" (law)
    for index, token in enumerate(text_token_list):
        if 'قانون' in token:
            matched_token_index_list.append(index)

    content_token_list = []
    law_token_list = []
    for index, item in enumerate(matched_token_index_list):
        # if the item is not the last element of the array ...

        end = 12  # how many tokens to inspect, i.e. how many times the matching loop runs

        if item < len(text_token_list):
            # store the next nine tokens as possible complementary phrases of the law title
            if item + end < len(text_token_list):
                for i in range(end):
                    if item + (i + 1) >= len(text_token_list):
                        break
                    content_token_list.append(text_token_list[item + (i + 1)])
                i = 0
            # otherwise store the remaining tokens (fewer than nine) up to the end of the array
            else:
                j = 0
                while j < len(text_token_list) - index:
                    if item + (j + 1) >= len(text_token_list) - index:
                        break
                    content_token_list.append(text_token_list[item + (j + 1)])
                    j += 1
                j = 0
            law_token_list.append({
                'start_token_index': item,
                'law_token': content_token_list
            })

            if len(content_token_list) < end:  # if the number of words chosen for inspection exceeds the sentence length
                end = len(content_token_list)  # shrink the inspection window to the number of available tokens

            content_token_list = []
    matched_law_list = []
    c = 0

    for key, law_value in enumerate(law_token_list):
        c += 1
        law_token = law_value['law_token']
        start_token_index = law_value['start_token_index']
        end_token_index = 0
        found_law_list_1 = []
        found_law_list_2 = []
        found_law_list_3 = []
        # if the number of tokens of the candidate law title is zero,
        # leave the loop and move on to checking the next law title
        if len(law_token) < 1:
            break

        # first, look up the first token of the candidate phrase in the law captions stored in the database;
        # in the following steps, up to nine rounds, tokens are appended one by one and compared against the captions
        law_section = law_token[0]
        for index, value in enumerate(law_dict):
            # the law caption we are currently comparing the text against
            id = value['id']
            current_caption = value['caption']
            current_approve_date = value['approve_date']
            # check whether the candidate phrase occurs in the law caption
            if current_caption.__contains__(law_section):
                # get the first token of the law caption
                current_law_first_token = current_caption.strip().split(' ')[0]
                # if the first token of the caption is the word "قانون" (law), skip it,
                # because the candidate token lists do not include the word "قانون" itself
                if current_law_first_token == 'قانون':
                    current_law_first_token = current_caption.strip().split(' ')[1]
                if law_section == current_law_first_token:
                    # if the substring occurs in the caption, store that caption in a list;
                    # in the next step the candidate text is compared against this narrower list
                    found_law_list_1.append({"id": id, "caption": current_caption, "approve_date": current_approve_date})
            else:
                continue


        X = 0
        FoundLawList = []
        OldFoundLawList = []
        NewFoundLawList = []
        while X < end - 1:
            # for x in range(end):


            X += 1
            if X == 1:  # the first token is handled by this branch
                if len(found_law_list_1) == 0:
                    # X = X+1
                    continue
                else:
                    # X = X+1
                    if len(found_law_list_1) == 1:
                        found_law = []
                        found_law.append(found_law_list_1.pop())
                        k = 0
                        matched_string = ''
                        found_law_caption = found_law[0]['caption'].strip()
                        if found_law_caption.startswith('قانون'):
                            found_law_caption = found_law_caption[5:]
                        found_law_caption_tokens = found_law_caption.strip().split()
                        for k in range(len(law_token)):
                            if k >= len(found_law_caption_tokens):
                                break
                            if law_token[k] == found_law_caption_tokens[k]:
                                matched_string += law_token[k] + ' '
                            else:
                                end_token_index = start_token_index + len(matched_string.strip().split())
                                found_law_dict = law_dict_saver(found_law[0]['id'], start_token_index, end_token_index, found_law, found_law[0]['caption'], matched_string.strip(), law_token, False)
                                matched_law_list.append(found_law_dict)

                                break

                        end_token_index = start_token_index + len(matched_string.strip().split())
                        found_law_dict = law_dict_saver(found_law[0]['id'], start_token_index, end_token_index, found_law, found_law[0]['caption'], matched_string.strip(), law_token, False)
                        matched_law_list.append(found_law_dict)

                        continue

                    if len(law_token) < 2:
                        continue

                    law_section = law_token[0] + ' ' + law_token[1]
                    for value in found_law_list_1:
                        id = value['id']
                        current_caption = value['caption']
                        current_approve_date = value['approve_date']
                        rate = fuzz.token_set_ratio(current_caption, law_section)
                        if rate == 100:
                            found_law_list_2.append({"id": id, "caption": current_caption, "approve_date": current_approve_date})

                    FoundLawList = found_law_list_1
                    NewFoundLawList = found_law_list_2
                    continue

            OldFoundLawList = FoundLawList
            FoundLawList = NewFoundLawList
            NewFoundLawList = []

            if X == int(end - 1):  # the last word is handled by this branch
                if len(FoundLawList) == 0:
                    # more than one match was found in the previous step, but now the number of matches has dropped to zero
                    if len(OldFoundLawList) > 1 and len(OldFoundLawList) < 6:
                        # review this case carefully
                        # sort from the oldest to the newest approval date
                        sorted_found_law_list = sorted(OldFoundLawList, key=lambda x: x['approve_date'])
                        found_law = sorted_found_law_list.pop()
                        end_token_index = start_token_index + len(law_section.strip().split())
                        # the most recently appended token has to be dropped again,
                        # because no law caption was found once that token is included
                        law_section = remove_latest_added_token(law_section)
                        found_law_dict = law_dict_saver(found_law['id'], start_token_index, end_token_index, sorted_found_law_list, found_law['caption'], law_section, law_token, True)
                        matched_law_list.append(found_law_dict)
                    continue
                else:
                    if len(FoundLawList) == 1:
                        sorted_found_law_list = sorted(FoundLawList, key=lambda x: x['approve_date'])
                        found_law = []
                        found_law.append(FoundLawList.pop())
                        end_token_index = start_token_index + len(law_section.strip().split())
                        found_law_dict = law_dict_saver(found_law[0]['id'], start_token_index, end_token_index, found_law, found_law[0]['caption'], law_section, law_token, False)
                        matched_law_list.append(found_law_dict)

                    elif len(FoundLawList) > 1 and len(FoundLawList) < 6:
                        sorted_found_law_list = sorted(OldFoundLawList, key=lambda x: x['approve_date'])
                        found_law = sorted_found_law_list.pop()
                        end_token_index = start_token_index + len(law_section.strip().split())
                        found_law_dict = law_dict_saver(found_law['id'], start_token_index, end_token_index, FoundLawList, found_law['caption'], law_section, law_token, True)
                        matched_law_list.append(found_law_dict)
                    break

            if len(FoundLawList) == 0:
                # more than one match was found in the previous step, but now the number of matches has dropped to zero
                if len(OldFoundLawList) > 1 and len(OldFoundLawList) < 6:
                    # review this case carefully
                    # sort from the oldest to the newest approval date
                    sorted_found_law_list = sorted(OldFoundLawList, key=lambda x: x['approve_date'])
                    found_law = sorted_found_law_list.pop()
                    end_token_index = start_token_index + len(law_section.strip().split())
                    # the most recently appended token has to be dropped again,
                    # because no law caption was found once that token is included
                    law_section = remove_latest_added_token(law_section)
                    found_law_dict = law_dict_saver(found_law['id'], start_token_index, end_token_index, sorted_found_law_list, found_law['caption'], law_section, law_token, True)
                    matched_law_list.append(found_law_dict)
                continue
            else:
                if len(FoundLawList) == 1:
                    found_law = []
                    found_law.append(FoundLawList.pop())  # = found_law_list_2.pop()
                    end_token_index = start_token_index + len(law_section.strip().split())
                    found_law_dict = law_dict_saver(found_law[0]['id'], start_token_index, end_token_index, found_law, found_law[0]['caption'], law_section, law_token, False)
                    matched_law_list.append(found_law_dict)
                    # if the caption search has already narrowed down to a single unique match, only this one caption is stored
                    continue

            if len(law_token) < X + 1:
                continue
            law_section += ' ' + law_token[X]  # X = new token
            # law_section = List_Law_tokens
            for value in FoundLawList:
                id = value['id']
                current_caption = value['caption']
                current_approve_date = value['approve_date']
                rate = fuzz.token_set_ratio(current_caption, law_section)

                if rate == 100:

                    NewFoundLawList.append({"id": id, "caption": current_caption, "approve_date": current_approve_date})

            # OldFoundLawList = FoundLawList
            # FoundLawList = NewFoundLawList
            # NewFoundLawList = []

    if matched_law_list:
        for law_item in matched_law_list:
            temp_list = []
            found_list = law_item['found_law_list']
            for item in found_list:
                temp_list.append(item['caption'] + '#' + str(item['id']) + '#' + item['approve_date'])

            law_item['found_law_list'] = temp_list
    return matched_law_list, law_token_list


n = 0
not_found_ids = []
all_laws_founded = []
for section in RefList:

    refID, Content, ner_list = section['id'], section['content'].strip(), section['ner']
    print(f"ID {refID} is searching... ")
    if refID in no_found_id:
        matched_law_list, law_token_list = law_recognizer(Content, all_law_dict)
        matched_law_list_ids = []
        matched_law_list_content = []
        for law in matched_law_list:
            matched_law_list_ids.append(law['law_id'])
            matched_law_list_content.append(law['law_captions'])
        if len(matched_law_list) != 0:
            n += 1
            all_laws_founded.append({"dataset-REF": {"id": refID, "content": Content},
                                     "All-REF": {"id": matched_law_list_ids, "content": matched_law_list_content}})

        else:
            not_found_ids.append(refID)


txt = ''
for id_ in not_found_ids:
    txt += f"{id_}\n"

with open("not_found_idsX.txt", "w", encoding="utf8") as file:
    # write the data to the file
    file.write(txt)

with open("founded_lawsX.json", "w", encoding="utf8") as f:
    json.dump(all_laws_founded, f, indent=4, ensure_ascii=False)

print(f"{n} Laws Found ! ")
print(f"{len(not_found_ids)} Laws Not Found ! ")
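
Side note on the matching criterion (not part of the commit): law_recognizer treats fuzz.token_set_ratio(...) == 100 as "the growing candidate phrase is still contained in the law caption", which works because thefuzz's token_set_ratio returns 100 whenever the token set of one string is a subset of the other's. A small illustration with a hypothetical caption:

    from thefuzz import fuzz

    caption = "قانون مدیریت خدمات کشوری"   # hypothetical law caption
    phrase = "مدیریت خدمات"                # candidate phrase built token by token

    print(fuzz.token_set_ratio(caption, phrase))           # 100: phrase tokens are a subset of the caption tokens
    print(fuzz.token_set_ratio(caption, "مدیریت بحران"))   # below 100: "بحران" does not appear in the caption
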
1413 ner_dataset/normalizer.py Normal file
File diff suppressed because it is too large