6 changed files with 0 additions and 2491 deletions
--- a/ner_dataset/REF_finder.py
+++ b/ner_dataset/REF_finder.py
@ -1,67 +0,0 @@
-# بسم الله
-
-
-import json
-from elastic_helper import ElasticHelper 
-
-
-
-Read = open (".\data\DATASET140402_no_arefـoutput.json","r",encoding='utf8')
-RefList = json.loads(Read.read())
-path = ".\data\mj_qa_section-v02.zip"
-eh_obj = ElasticHelper()
-sections = eh_obj.iterateJsonFile(path, True)
-
-all_ref_list = []
-find_refs_list = []
-not_find_refs_list = []
-
-for index, item in enumerate(sections):
-    ref_id = item['id']
-    source = item['source']
-    content = source['content']
-    all_ref_list.append([ref_id,content.strip()])
-    
-    
-    
-n=1
-for item in RefList :
-    refID2 , Content2 , ner_list = item['id'],item['content'].strip(),item['ner']
-    x=0
-    for refID1 , Content in all_ref_list:
-        
-        if len(ner_list)==0:
-            x=1
-            continue
-        else:
-            if Content2 == Content and x == 0:
-                find_refs_list.append([refID1,refID2,Content])
-                print(f"REF ID {refID2} Found ! ... ")
-                x = 1
-    
-    if x == 0:
-        not_find_refs_list.append(refID2)
-        
-    print(f"{n} OF {len(RefList)} searched ...")
-    n+=1
-    
-    
-    
-    
-
-
-with open("foundfind_refs_list.json", "w" , encoding="utf8") as f:
-    json.dump(find_refs_list, f, indent=4, ensure_ascii=False )
-
-
-txt=''
-for id_ in not_find_refs_list:
-    txt+=f"{id_}\n"
-
-with open("not_found_ids.txt", "w",encoding="utf8") as file:
-        # نوشتن داده‌ها در فایل
-        file.write(txt)
-
-
-print("finish!")
-    
--- a/ner_dataset/pycache/elastic_helper.cpython-313.pyc
+++ b/ner_dataset/pycache/elastic_helper.cpython-313.pyc
--- a/ner_dataset/data/mj_qa_section-v02.zip
+++ b/ner_dataset/data/mj_qa_section-v02.zip
--- a/ner_dataset/elastic_helper.py
+++ b/ner_dataset/elastic_helper.py
@ -1,677 +0,0 @@
-import zipfile
-import sys
-import os
-import json
-from time import sleep
-from elasticsearch import Elasticsearch,helpers
-
-class ElasticHelper():
-    
-    counter = 0
-    total = 0
-    id = ""
-    path_mappings = os.getcwd() + '/repo/_other/'
-    
-    # def __init__(self, es_url="http://127.0.0.1:6900", es_pass="", es_user="elastic", path_mappings = ""):
-
-        # if path_mappings : 
-        #    self.path_mappings = path_mappings
-
-        # if es_pass == '' :
-        #     self.es = Elasticsearch(es_url)
-        # else:
-        #     self.es = Elasticsearch(
-        #         es_url,
-        #         http_auth=(es_user, es_pass),
-        #     )
-        
-        # print(es_url)
-        # print(self.es)
-
-        # self.success_connect = False
-        # for a in range(0,10):
-        #     try :
-        #         if not self.es.ping():
-        #             print('elastic not ping, sleep 30 s   : ', a)
-        #             sleep(5)
-        #             continue
-        #         else:
-        #             self.success_connect = True
-        #             break
-
-        #     except Exception as e:
-        #         break
-        # if not self.success_connect :
-        #     print('******','not access to elastic service')
-        #     return
-
-
-        # self.counter = 0
-        # self.total = 0
-        # self.id = ""
-
-
-    def get_doctument(self, index_name, id):
-        res = self.es.get(index=index_name, id=id)
-        return res
-    
-    def exist_doctument(self, index_name, id):
-        res = self.es.exists(index=index_name, id=id)
-        return res
-
-    def update_index_doc(self, is_update_state, index_name_o, eid, data):
-        if is_update_state:
-            resp = self.es.update(index=index_name_o, id=eid, doc=data)
-            # resp = self.es.update(index=index_name_o, id=eid, body={'doc':data})
-        else:
-            resp = self.es.index(index=index_name_o, id=eid, document=data)
-        return resp    
-
-    
-    def exportToJsonForAI(self, path_back, index_name, out_name= '', body={}, fields=[]) :
-        print('*' * 50, ' start backup -->', index_name)
-        self.counter = 0
-        sid = None
-
-        out = out_name
-        if out_name == '' :
-           out = index_name 
-
-        fout = open( path_back + "/"+ out + '.json', 'a+' , encoding='utf-8')
-
-        s_res = self.es.search(
-            index=index_name,
-            scroll='5m',
-            size=1000,
-            body=body
-        )
-        self.total = s_res["hits"]["total"]['value']
-
-        print('start index = %s' % index_name)
-        print('total = %d' % self.total)
-
-        sid = s_res['_scroll_id']
-        scroll_size = len(s_res['hits']['hits'])
-        file_count = 1
-        out_json = []
-        while scroll_size > 0:
-            "Scrolling..."
-            self.counter += scroll_size
-            print("progress -> %.2f %%" % ((self.counter / self.total)*100))
-            #############################
-            for item in s_res['hits']['hits']:                
-
-                if fields :
-                   item2={}
-                   item2['id']=item['_id']
-                   for kf in  fields :
-                       #print(kf)
-                       if kf in item['_source'] :
-                          # print(item['_source'][kf])
-                          item2[kf] = item['_source'][kf]
-                       #exit()
-                else :
-                    item2=item        
-
-                out_json.append(item2)
-
-
-            s_res = self.es.scroll(scroll_id=sid, scroll='2m', request_timeout=100000)
-            sid = s_res['_scroll_id']
-            scroll_size = len(s_res['hits']['hits'])
-
-        sid = None
-        text = json.dumps(out_json, ensure_ascii=False) 
-        fout.write(text)
-
-        ##############################
-
-    def backupIndexToZipfile(self, path_back, index_name, out_name= '', body={}, byzip = True, fields=[], noFields=[]) :
-        print('*' * 50, ' start backup -->', index_name)
-        self.counter = 0
-        sid = None 
-
-        out = out_name
-        if out_name == '' :
-           out = index_name 
-
-
-        if body == {} :
-            s_res = self.es.search(
-                index=index_name,
-                scroll='5m',
-                size=1000
-            )
-        else:
-            s_res = self.es.search(
-                index=index_name,
-                scroll='5m',
-                size=1000,
-                body=body
-            )
-    
-        self.total = s_res["hits"]["total"]['value']
-        if self.total == 0 :
-           print('total index_name by query = %d' % self.total)
-           return False
-
-        if byzip:
-            fout = zipfile.ZipFile(path_back + "/"+ out + '.zip', 'w')
-        else:    
-            fout = open( path_back + "/"+ out + '.json', 'a+' , encoding='utf-8')
-
-
-        print('start index = %s' % index_name)
-        print('total = %d' % self.total)
-
-        sid = s_res['_scroll_id']
-        scroll_size = len(s_res['hits']['hits'])
-        file_count = 1
-        while scroll_size > 0:
-            "Scrolling..."
-            self.counter += scroll_size
-            print("progress -> %.2f %%" % ((self.counter / self.total)*100))
-            #############################
-            out_json = []
-            for item in s_res['hits']['hits']:                
-                if fields :
-                   item2={}
-                   item2['id']=item['_id']
-                   item2['_source']={}
-                   for kf in  fields :
-                       if kf in item['_source'] :
-                          item2['_source'][kf] = item['_source'][kf]
-                else :
-                    item2=item   
-
-                if noFields :
-                    for kf in  noFields :
-                       if kf in item2['_source']:
-                          del item2['_source'][kf]
-                          
-
-                out_json.append(item2)
-
-
-            text = json.dumps(out_json, ensure_ascii=False) 
-            out_json = []
-            if byzip:
-                filename = out + str(file_count) + '.json'    
-                file_count +=1
-                fout.writestr(filename, text.encode('utf-8'), zipfile.ZIP_DEFLATED )
-            else:    
-                fout.write(text)
-
-            ##############################
-            s_res = self.es.scroll(scroll_id=sid, scroll='2m', request_timeout=100000)
-            sid = s_res['_scroll_id']
-            scroll_size = len(s_res['hits']['hits'])
-        sid = None    
-        fout.close()
-
-
-    def restorFileToElastic(self, path_back, index_name, app_key = '', queryDelete = True, map_name='') :
-        if not os.path.exists(path_back) : 
-            print(' **** error *** path not exist: ', path_back)  
-            return False
-
-        file_path = path_back + '/' + index_name + '.zip'
-        if not os.path.exists(file_path ) :
-            return False
-
-        if queryDelete :
-            # اگر وجود داشته باشد، از کاربر برای حذفش سوال میکند
-            if  self.deleteIndex(index_name) :
-                self.createIndex(index_name, app_key, map_name)
-                self.zipFileToElastic(file_path, index_name)
-        else : # اگر وجود داشته باشد پرش می کند و کاری نمیکند
-            self.createIndex(index_name, app_key, map_name)
-            self.zipFileToElastic(file_path, index_name) 
-
-    def restorFileToElastic2(self, path_file, index_name, app_key = '', queryDelete = True, map_name='') :
-        if not os.path.exists(path_file) : 
-            print(' **** error *** path not exist: ', path_file)  
-            return False
-
-        file_path = path_file
-        if not os.path.exists(file_path ) :
-            return False
-
-        if queryDelete :
-            # اگر وجود داشته باشد، از کاربر برای حذفش سوال میکند
-            if  self.deleteIndex(index_name) :
-                self.createIndex(index_name, app_key, map_name)
-                self.zipFileToElastic(file_path, index_name)
-        else : # اگر وجود داشته باشد پرش می کند و کاری نمیکند
-            self.createIndex(index_name, app_key, map_name)
-            self.zipFileToElastic(file_path, index_name) 
-
-
-    def renameElasticIndex(self, index_name_i, index_name_o, app_key = '', map_name='') :
-
-        if self.createIndex(index_name_o, app_key, map_name) :
-           res = self.es.reindex(
-                    body={
-                        "source": {"index": index_name_i},
-                        "dest": {"index": index_name_o}
-                    },
-                    wait_for_completion=False)
-           
-           print(type(res))
-           print(res)
-
-           taskid = res["task"] if res["task"] else ""
-           #tasks = client.TasksClient(self.es)
-           tasks = self.es.tasks
-           while True :               
-               res = tasks.get(task_id = taskid)
-               if res["completed"] :
-                  break
-
-               # print( res["task"])
-               print( '----', index_name_o, '  imported : ', res["task"]["status"]["total"] , ' / ', res["task"]["status"]["created"])
-               sleep(1)
-           print( '----', index_name_o, '  complated')
-
-
-    def deleteIndex(self, index_name) :
-        if not self.es.indices.exists(index=index_name) :
-            print(' ' * 10, " for delete NOT exist index :", index_name )
-            return True
-
-        question = 'Is DELETE elastic index (' + index_name +') ? ' 
-        if self.query_yes_no(question) :
-           self.es.indices.delete(index = index_name)  
-           print('%' * 10 , "  Finish DELETE  index :", index_name )
-           return True
-        else :
-            return False   
-
-    def query_yes_no(self, question, default="no"):
-        valid = { "yes": True, "y": True, "ye": True, "no": False, "n": False }
-        if default is None:
-            prompt = " [y/n] "
-        elif default == "yes":
-            prompt = " [Y/n] "
-        elif default == "no":
-            prompt = " [y/N] " 
-        else:
-            raise ValueError("invalid default answer: '%s'" % default)
-
-        while True:
-            print('%'*10, '  quistion ', '%'*10 , '\n')
-            sys.stdout.write(question + prompt) 
-            choice = input().lower()
-            if default is not None and choice == "":
-                return valid[default]
-            elif choice in valid:
-                return valid[choice]
-            else:
-                sys.stdout.write("لطفا یکی از موارد روبرو را وارد کنید : 'yes' or 'no' " "(or 'y' or 'n').\n") 
-
-    def createIndexIfNotExist(self, index_name_o, mapping_o=""):
-        try:
-            if not self.es.indices.exists(index=index_name_o):
-                response = self.es.indices.create(index=index_name_o, body=mapping_o)
-                # print out the response:
-                print("create index response:", response)
-        except:
-            print("....... index exist ! ... not created")
-
-
-    def createIndex(self, index_name, app_key='', map_name=''):
-
-        path_base = self.path_mappings
-        path_mapping1 =  path_base + 'general/'
-        if app_key == '' :
-           app_key = 'tavasi'
-        path_mapping2 =  path_base + app_key + '/'
-
-
-        if map_name == '':
-            map_name = index_name
-            
-        if self.es.indices.exists(index=index_name) :
-            print("============== exist index :", index_name )
-            return True  
-
-        if map_name == 'mj_rg_section' or map_name == 'semantic_search' :
-            map_name = 'mj_qa_section'
-        elif map_name[-3]=='_ai':
-            map_name=[0-len(map_name)-3]
-            print(map_name)
-    
-        mapping_file_path = path_mapping1 + map_name + '.json'
-        print("mapping_file_path : " , mapping_file_path)
-        if not os.path.isfile(mapping_file_path):
-            if not os.path.isfile(mapping_file_path):
-                mapping_file_path = path_mapping2 + map_name + '.json'
-
-        print("mapping_file_path : " , mapping_file_path)
-
-        # Create Index With Mapping
-        if os.path.isfile(mapping_file_path):
-            mapping_file = open( mapping_file_path,'r', encoding='utf-8' )
-            mapping_file_read = mapping_file.read()
-            mapping_data = json.loads(mapping_file_read)
-            mapping_file.close()   
-            if self.es.indices.exists(index=index_name) :
-                print("============== exist index :", index_name )
-            else :                   
-                self.es.indices.create(index = index_name , body = mapping_data) 
-            return True   
-        else:       
-            print('*** error not find maping file elastic : *******',  mapping_file_path)     
-            return False
-
-
-    def updateBulkList(self, listData, index_name):
-        chunk_size=1000
-        raise_on_error=False
-        raise_on_exception=False
-        stats_only=True
-        yield_ok = False
-
-        actions=[]
-        for item in listData:
-            actions.append({
-                            "_op_type": "update", 
-                            "_index": index_name,
-                            "_id"   : item['_id'],
-                            "doc": item['_source']
-                        } 
-            )                                       
-        helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok )
-
-    def importBulkList(self, listData, index_name):
-        chunk_size=100000
-        raise_on_error=False
-        raise_on_exception=False
-        stats_only=True
-        yield_ok = False
-
-        for item in listData:
-            actions = [{
-                            "_op_type": "index", 
-                            "_index": index_name,
-                            "_id"   : item['_id'],
-                            "_source": item['_source']
-                        } 
-                    ]                                             
-            helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok )
-
-
-    def importJsonDataToElastic(self, jsonData, index_name, fields=[]):
-        chunk_size=1000
-        raise_on_error=False
-        raise_on_exception=False
-        stats_only=True
-        yield_ok = False
-
-        actions=[]
-
-        for item in jsonData:
-            id = item['_id'] if item['_id'] else item['id']
-            source = item['_source']
-            if fields :
-                source = {}
-                for col in fields :
-                    if col in item['_source'] :
-                      source[col] = item['_source']
-
-
-            actions.append({
-                            "_op_type": "index", 
-                            "_index": index_name,
-                            "_id"   : id,
-                            "_source": source
-                        }) 
-        helpers.bulk(self.es, actions, chunk_size, raise_on_error, raise_on_exception, stats_only, yield_ok )
-                      
-
-    def fileToElastic(self, file_path, index_name, limit_pack = -1, fields=[]):
-            if not os.path.exists(file_path):
-                print("file zip:" , file_path , " not exist")
-                return
-            print("index:" , index_name , '=>' , file_path ) 
-            self.counter = 0
-            with open(file_path) as file:  
-                data = json.loads(file.read())
-                self.importJsonDataToElastic(data, index_name, fields)
-
-            self.es.indices.refresh(index=index_name)
-            print(self.es.cat.count(index=index_name, format="json"))
-                                    
-    def zipFileToElastic(self, file_path, index_name, limit_pack = -1, fields=[]):
-            if not os.path.exists(file_path):
-                print("file zip:" , file_path , " not exist for imort to elastic : ", index_name )
-                return
-
-            fileNo = 0
-            with zipfile.ZipFile(file_path, 'r') as zObject:
-                fileNo +=1
-                print("="*10, " zip fileNo: " , fileNo ,"  - ( ", index_name," ) | File Numbers:" ,len(zObject.namelist()) , "=" * 10)
-
-                packNo = 0   
-                self.counter = 0
-                for filename in zObject.namelist(): 
-                    packNo += 1
-                    if limit_pack != -1 :
-                        if packNo > limit_pack :
-                            print('limit_data  ', index_name, '  ', limit_pack)
-                            break
-
-                    print("index:" , index_name , '=>' , filename ) 
-                    with zObject.open(filename) as file:  
-                        data = json.loads(file.read())
-                        self.importJsonDataToElastic(data, index_name, fields)
-      
-                self.es.indices.refresh(index=index_name)
-                print(self.es.cat.count(index=index_name, format="json"))
-                print(" END Of Import to elastic ", index_name ,"\n")
-        
-
-    def iterateJsonFile(self, file_path, isZip=True, limit_pack = -1):
-            if not os.path.exists(file_path):
-                print("file zip:" , file_path , " not exist  iterateJsonFile " )
-                return
-
-            if isZip :
-                fileNo = 0
-                with zipfile.ZipFile(file_path, 'r') as zObject:
-                    fileNo +=1
-                    print("="*10, " zip fileNo: " , fileNo ,"  iterateJsonFile - | File Numbers:" ,len(zObject.namelist()) , "=" * 10)
-
-                    packNo = 0   
-                    self.counter = 0
-                    for filename in zObject.namelist(): 
-                        packNo += 1
-                        if limit_pack != -1 :
-                            if packNo > limit_pack :
-                                print('limit_data  iterateJsonFile   ', limit_pack)
-                                break
-
-                        print("index iterateJsonFile :", '=>' , filename ) 
-                        with zObject.open(filename) as file:  
-                            data = json.loads(file.read())
-                            # Yield each entry
-                            # yield data
-                            yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in data)
-            else :
-                with open(filename, 'r', encoding='utf-8') as file:  
-                    data = json.loads(file.read())
-                    # Yield each entry
-                    # yield from (hit for hit in data)
-                    #return data
-                    yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in data)
-    
-
-    def es_iterate_all_documents(self, index, body="", pagesize=250, scroll_timeout="25m", **kwargs):
-        """
-        Helper to iterate ALL values from a single index
-        Yields all the documents.
-        """
-        is_first = True
-        while True:
-            # Scroll next
-            if is_first:  # Initialize scroll
-                # result = self.es.search(index=index, scroll="2m", **kwargs, body={
-                #     "size": pagesize
-                # })
-                if body : 
-                    result = self.es.search(
-                        index=index,
-                        scroll=scroll_timeout,
-                        **kwargs,
-                        size=pagesize,
-                        body=body
-                    )
-                else :
-                    result = self.es.search(
-                        index=index,
-                        scroll=scroll_timeout,
-                        **kwargs,
-                        size=pagesize
-                    )
-
-                self.total = result["hits"]["total"]["value"]
-                if self.total > 0:
-                    print("total = %d" % self.total)
-                is_first = False
-            else:
-                # result = es.scroll(body={
-                #     "scroll_id": scroll_id,
-                #     "scroll": scroll_timeout
-                # })
-                result = self.es.scroll(scroll_id=scroll_id, scroll=scroll_timeout)
-               
-            scroll_id = result["_scroll_id"]
-            hits = result["hits"]["hits"]
-            self.counter += len(hits)
-            if self.total > 0 :
-                print("progress -> %.2f %%" % ((self.counter / self.total) * 100))
-            # Stop after no more docs
-            if not hits:
-                break
-            # Yield each entry
-            yield from ({"source": hit["_source"], "id": hit["_id"]} for hit in hits)
-
-
-    def moveCustomFileds(self, index_name_i, index_name_o, fields=[], renameFileds={}):
-        try:
-            body = {}
-            list = []
-            try:
-                list = self.es_iterate_all_documents(index_name_i)
-            except Exception as e:
-                print(e)
-
-            count = 0
-            for mentry in list:
-                count += 1
-               
-                entry = mentry["source"]
-                id = mentry["id"]
-                # print(id)
-                eid = id
-
-                if (count % 100) == 0 :   
-                    print("%s -> %.2f " % (id , (count / self.total) if self.total > 0 else 0))
-
-                data_filled = False
-                data = {}
-                for col in fields:
-
-                    if '.' in col :
-                        cols = col.split('.')
-                        subsource = entry 
-                        for sub in cols :
-                            dCol = subsource.get(sub, None)
-                            if dCol :
-                               subsource = dCol
-                            else :
-                                break
-                    else : 
-                        dCol = entry.get(col, None)
-
-                    if dCol is None:
-                        continue
-
-                    if col in renameFileds :                       
-                        data[renameFileds[col]] = dCol    
-                    else:
-                        data[col] = dCol    
-
-                    data_filled = True
-
-                if not data_filled :
-                    continue
-
-                try:
-                    resp = self.update_index_doc(True, index_name_o, eid, data)
-                except Exception as e:
-                    print(e)
-                    # save_error(id, e)
-
-        except Exception as e:
-            # print("1111")
-            print(e)
-
-            # save_error(id, e)
-
-    def mappingIndex(self, index_name_i):
-        # فقط از طریق کیبانا میشه تغییر مپ داد 
-        
-        #  با پایتون نمیشه 
-        # باید ایندکس جدیدی با مپ مطلوب ایجاد کرد و رایندکس کرد
-        pass
-
-    def updateByQueryIndex(self, index_name_i, body):
-        ## sample
-        # body = {
-        #         "script": {
-        #             "inline": "ctx._source.Device='Test'",
-        #             "lang": "painless"
-        #         },
-        #         "query": {
-        #             "match": {
-        #                 "Device": "Boiler"
-        #             }
-        #         }
-        #     }
-        try:
-            self.es.update_by_query(body=body, index=index_name_i)
-        
-        except Exception as e:
-            print(e)
-            # save_error(id, e)    
-
-
-    def deleteByQueryIndex(self, index_name_i, body):
-        ## sample
-        # body = {
-        #         "query": {
-        #             "match": {
-        #                 "Device": "Boiler"
-        #             }
-        #         }
-        #     }
-        try:
-            self.es.delete_by_query(index=index_name_i, body=body )
-        
-        except Exception as e:
-            print(e)
-            # save_error(id, e)   
-
-    def delete_by_ids(self, index_name_i, ids):
-        try:
-            # ids = ['test1', 'test2', 'test3'] 
-
-            query = {"query": {"terms": {"_id": ids}}}
-            res = self.es.delete_by_query(index=index_name_i, body=query)
-            print(res)                    
-            
-        except Exception as e:
-            print(e)
-            # save_error(id, e)   
-
--- a/ner_dataset/find_law.py
+++ b/ner_dataset/find_law.py
@ -1,334 +0,0 @@
-# بسم الله 
-
-
-
-from elastic_helper import ElasticHelper
-from thefuzz import fuzz
-import json
-
-
-
-
-
-Read = open ('.\data\DATASET140402_no_arefـoutput.json',"r",encoding='utf8')
-RefList = json.loads(Read.read())
-path = ".\\data\\mj_qa_section-v02.zip"
-eh_obj = ElasticHelper()
-sections = eh_obj.iterateJsonFile(path, True)
-
-
-no_found_id = []
-txt_file = open(".\\no_find_txt.txt" , "r" , encoding="utf8")
-n = 0
-for line in txt_file:
-    if n != 0:
-        no_found_id.append(int(line.strip()))
-        n=0
-        continue
-    n = 1
-
-
-all_law_dict = []
-for index, item in enumerate(sections):
-    ref_id = item['id']
-    source = item['source']
-    content = source['content'].strip()
-    all_law_dict.append({"id":ref_id , "caption":content, "approve_date":source['ts_date']})
-
-
-
-
-
-def law_dict_saver(law_id,start_token_index,end_token_index,found_law_list,law_captions,matched_string,original_string,multi_flag):
-    
-        dict = {
-                                "law_id"        : law_id,
-                                "start_token_index": start_token_index,
-                                "end_token_index"  : end_token_index,
-                                "found_law_list": found_law_list,
-                                "law_captions"  : law_captions,
-                                "matched_string": matched_string,
-                                "original_string": original_string,
-                                "multi_flag": multi_flag
-                                }
-        return dict
-    
-def remove_latest_added_token(text):
-        temp = text.strip().split(' ')
-        temp.pop()
-        text = ''
-        for token in temp:
-            text = text + ' ' + token
-        
-        return text.strip()
-
-def law_recognizer(text, law_dict):
-
-        i = 0
-
-        normalized_content = text
-        text_token_list = normalized_content.strip().split()
-        matched_token_index_list = []
-        
-        # جمع آوری عناوین احتمالی قانون در یک متن بر اساس کلیدواژه قانون
-        for index,token in enumerate(text_token_list):
-            if 'قانون' in token:
-                matched_token_index_list.append(index)
-            
-        content_token_list = []
-        law_token_list     = []
-        for index, item in enumerate(matched_token_index_list):
-            # اگر آیتم، آخرین عنصر موجود در آرایه نبود ...
-            
-            end = 12  # در اینجا مشخص میکنیم چند کلمه را بررسی کند و حلقه بررسی چندبار تکرار شود
-
-            if item < len(text_token_list):
-                # نُه توکن بعدی را به عنوان عبارات تکمیلی احتمالی عنوان قانون ذخیره می کنیم
-                if item + end < len(text_token_list):
-                    for i in range(end):
-                        if item + (i+1) >= len(text_token_list):
-                            break
-                        content_token_list.append(text_token_list[item + (i+1)])
-                    i = 0
-                # توکن های باقیمانده(که کمتر از نُه توکن است) تا پایان آرایه را ذخیره کن
-                else:
-                    j = 0
-                    while j < len(text_token_list)-index:
-                        if item + (j+1) >= len(text_token_list)-index:
-                            break
-                        content_token_list.append(text_token_list[item + (j+1)])
-                        j += 1
-                    j = 0
-            law_token_list.append({
-                'start_token_index': item,
-                'law_token'        : content_token_list
-                })
-
-            if len(content_token_list) < end :   # اگر مقدار کلمات انتخابی برای بررسی از طول کلمات جمله بیشتر بود
-                end = len(content_token_list)  # کلمات انتخابی برای بررسی را به اندازه کل کلمات جمله قرار بده
-
-            content_token_list = []
-        matched_law_list = [] 
-        c = 0 
-
-
-        for key, law_value in enumerate(law_token_list):
-            c += 1
-            law_token         = law_value['law_token']
-            start_token_index = law_value['start_token_index']
-            end_token_index   = 0
-            found_law_list_1     = []
-            found_law_list_2     = []
-            found_law_list_3     = []
-            # اگر تعداد توکن های متنی که احتمالا عنوان یک قانون است، صفر بود،
-            # از حلقه خارج می شویم و به سراغ بررسی عنوان قانون بعدی می رویم
-            if len(law_token) < 1:
-                break
-
-            # در ابتدا اولین توکن عبارتی که احتمالا عنوان یک قانون است را در عنوان قانون موجود در بانک بررسی می کنیم
-            # در مراحل بعدی تا به نُه گام برسیم، یکی یکی توکن ها را به توکن اول اضافه و سپس با عناوین قانون ها مقایسه می کنیم
-            law_section = law_token[0]
-            for index, value in enumerate(law_dict):
-                # عنوان قانونی که در حال مقایسه متن مورد نظر با آن هستیم
-                id = value['id']
-                current_caption = value['caption']
-                current_approve_date = value['approve_date']
-                # بررسی وجود عبارت مورد نظر در عنوان قانون
-                if current_caption.__contains__(law_section):
-                    # به دست آوردن اولین توکن از عنوان قانون
-                    current_law_first_token = current_caption.strip().split(' ')[0]
-                    # اگر اولین توکن از عنوان قانون برابر با کلمه "قانون" بود، این کلمه را نادیده میگیریم
-                    # زیرا در لیست مربوط به لیست توکن های احتمالی مربوط به قوانین، کلمه قانون را در نظر نگرفته ایم 
-                    if current_law_first_token == 'قانون':
-                        current_law_first_token = current_caption.strip().split(' ')[1]
-                    if law_section == current_law_first_token:
-                        # اگر زیر رشته موردنظر ما در عنوان قانون وجود داشت، نام قانون را در یک لیست ذخیره می کنیم. 
-                        # در مرحله بعد متن احتمالی قانون که در حال بررسی آن هستیم را با این لیست مقایسه می کنیم تا مقایسه محدود تری داشته باشیم
-                        found_law_list_1.append({"id": id ,"caption": current_caption, "approve_date":current_approve_date})
-                    else:
-                        continue
-        
-
-            X = 0
-            FoundLawList=[]
-            OldFoundLawList=[]
-            NewFoundLawList=[]
-            while X < end-1 :
-            # for x in range(end):
-
-
-                X+=1
-                if X == 1:  # در بررسی توکن اول وارد این شرط میشود
-                    if len(found_law_list_1) == 0:
-                        # X= X+1
-                        continue
-                    else:
-                        # X=X+1
-                        if len(found_law_list_1) == 1:
-                            found_law = []
-                            found_law.append(found_law_list_1.pop())
-                            k = 0
-                            matched_string = ''
-                            found_law_caption = found_law[0]['caption'].strip()
-                            if found_law_caption.startswith('قانون'):
-                                found_law_caption = found_law_caption[5:]
-                            found_law_caption_tokens = found_law_caption.strip().split()
-                            for k in range(len(law_token)):
-                                if k >= len(found_law_caption_tokens):
-                                    break
-                                if law_token[k] == found_law_caption_tokens[k]:
-                                    matched_string += law_token[k] + ' '
-                                else:
-                                    end_token_index = start_token_index + len(matched_string.strip().split())
-                                    found_law_dict = law_dict_saver(found_law[0]['id'],start_token_index,end_token_index,found_law,found_law[0]['caption'],matched_string.strip(),law_token,False)
-                                    matched_law_list.append(found_law_dict)
-
-                                    break
-                                    
-                                end_token_index = start_token_index + len(matched_string.strip().split())
-                                found_law_dict = law_dict_saver(found_law[0]['id'],start_token_index,end_token_index,found_law,found_law[0]['caption'],matched_string.strip(),law_token,False)
-                                matched_law_list.append(found_law_dict)
-
-                            continue
-
-                        if len(law_token) < 2:
-                            continue
-                            
-                        law_section = law_token[0]+' '+law_token[1]
-                        for value in found_law_list_1:
-                            id = value['id']
-                            current_caption = value['caption']
-                            current_approve_date = value['approve_date']
-                            rate = fuzz.token_set_ratio(current_caption,law_section)
-                            if rate == 100:
-                                found_law_list_2.append({"id": id ,"caption": current_caption, "approve_date":current_approve_date})
-                    
-                    FoundLawList = found_law_list_1
-                    NewFoundLawList = found_law_list_2
-                    continue
-
-                OldFoundLawList = FoundLawList
-                FoundLawList = NewFoundLawList
-                NewFoundLawList = []
-                
-                if X == int(end-1): # در بررسی آخرین کلمه وارد این شرط میشود
-                    if len(FoundLawList) == 0:
-                        # اگر در مرحله قبل بیش از یک مورد پیدا کرده اما در این مرحله تعداد موارد مشابه به صفر رسیده
-                        if len(OldFoundLawList) > 1 and len(OldFoundLawList) < 6:
-                            # به دقت کنترل شود
-                            # مرتب سازی بر اساس قدیم به جدیدترین شناسه
-                            sorted_found_law_list   = sorted(OldFoundLawList, key=lambda x: x['approve_date'])
-                            found_law               = sorted_found_law_list.pop()
-                            end_token_index = start_token_index + len(law_section.strip().split())
-                            # آخرین توکنی که اخیرا به عنوان قانون اضافه شده را باید برگردانیم
-                            # زیرا متناظر با این توکن اضافه شده، عنوان قانونی پیدا نشده
-                            law_section = remove_latest_added_token(law_section)
-                            found_law_dict = law_dict_saver(found_law['id'],start_token_index,end_token_index,sorted_found_law_list,found_law['caption'],law_section,law_token,True)
-                            matched_law_list.append(found_law_dict)
-                        continue
-                    else:
-                        if len(FoundLawList) == 1:
-                            sorted_found_law_list   = sorted(FoundLawList, key=lambda x: x['approve_date'])
-                            found_law = []
-                            found_law.append(FoundLawList.pop())
-                            end_token_index = start_token_index + len(law_section.strip().split())
-                            found_law_dict = law_dict_saver(found_law[0]['id'],start_token_index,end_token_index,found_law,found_law[0]['caption'],law_section,law_token,False)
-                            matched_law_list.append(found_law_dict)
-
-                        elif len(FoundLawList) > 1 and len(FoundLawList) < 6:
-                            sorted_found_law_list   = sorted(OldFoundLawList, key=lambda x: x['approve_date'] )
-                            found_law               = sorted_found_law_list.pop()
-                            end_token_index = start_token_index + len(law_section.strip().split())
-                            found_law_dict = law_dict_saver(found_law['id'],start_token_index,end_token_index,FoundLawList,found_law['caption'],law_section,law_token,True)
-                            matched_law_list.append(found_law_dict)
-                    break
-                
-                if len(FoundLawList) == 0:
-                    # اگر در مرحله قبل بیش از یک مورد پیدا کرده اما در این مرحله تعداد موارد مشابه به صفر رسیده
-                    if len(OldFoundLawList) > 1 and len(OldFoundLawList) < 6:
-                        # به دقت کنترل شود
-                        # مرتب سازی بر اساس قدیم به جدیدترین شناسه
-                        sorted_found_law_list   = sorted(OldFoundLawList, key=lambda x: x['approve_date'])
-                        found_law               = sorted_found_law_list.pop()
-                        end_token_index = start_token_index + len(law_section.strip().split())
-                        # آخرین توکنی که اخیرا به عنوان قانون اضافه شده را باید برگردانیم
-                        # زیرا متناظر با این توکن اضافه شده، عنوان قانونی پیدا نشده
-                        law_section = remove_latest_added_token(law_section)
-                        found_law_dict = law_dict_saver(found_law['id'],start_token_index,end_token_index,sorted_found_law_list,found_law['caption'],law_section,law_token,True)
-                        matched_law_list.append(found_law_dict)
-                    continue
-                else:
-                    if len(FoundLawList) == 1:
-                        found_law = []
-                        found_law.append(FoundLawList.pop()) #             = found_law_list_2.pop()
-                        end_token_index = start_token_index + len(law_section.strip().split())
-                        found_law_dict = law_dict_saver(found_law[0]['id'],start_token_index,end_token_index,found_law,found_law[0]['caption'],law_section,law_token,False)
-                        matched_law_list.append(found_law_dict)
-                        # اگر در جستجوی عنوان قانون، به یک مورد منحصر به فرد رسیده بودیم، فقط همین یک عنوان را ذخیره کند
-                        continue
-
-                    if len(law_token) < X+1 :
-                        continue
-                    law_section += ' ' + law_token[X] # X = new token
-                    # law_section     =  List_Law_tokens
-                    for value in FoundLawList:
-                        id = value['id']
-                        current_caption = value['caption']
-                        current_approve_date = value['approve_date']
-                        rate = fuzz.token_set_ratio(current_caption,law_section)
-
-                        if rate == 100:
-
-                            NewFoundLawList.append({"id": id ,"caption": current_caption, "approve_date":current_approve_date})
-
-                # OldFoundLawList=FoundLawList
-                # FoundLawList=NewFoundLawList
-                # NewFoundLawList=[]
-        
-        if matched_law_list:
-            for law_item in matched_law_list:
-                temp_list = []
-                found_list = law_item['found_law_list']
-                for item in found_list:
-                    temp_list.append(item['caption'] + '#' + str(item['id']) + '#' + item['approve_date'])
-                        
-                law_item['found_law_list'] = temp_list
-        return matched_law_list, law_token_list
-    
-n=0
-not_found_ids = []
-all_laws_founded = []
-for section in RefList :
-    
-    refID , Content , ner_list = section['id'],section['content'].strip(),section['ner']
-    print(f"ID {refID} is searching... ")
-    if refID in no_found_id:
-        matched_law_list, law_token_list = law_recognizer(Content, all_law_dict )
-        matched_law_list_ids = []
-        matched_law_list_content = []
-        for law in matched_law_list:
-            matched_law_list_ids.append(law['law_id'])
-            matched_law_list_content.append(law['law_captions'])
-        if len(matched_law_list) != 0 :
-            n+=1
-            all_laws_founded.append({"dataset-REF":{"id":refID,"content":Content},
-                                 "All-REF":{"id":matched_law_list_ids,"content":matched_law_list_content}})
-        
-        else: 
-            not_found_ids.append(refID)
-
-
-
-txt=''
-for id_ in not_found_ids:
-    txt+=f"{id_}\n"
-
-with open("not_found_idsX.txt", "w",encoding="utf8") as file:
-        # نوشتن داده‌ها در فایل
-        file.write(txt)
-
-with open("founded_lawsX.json", "w" , encoding="utf8") as f:
-    json.dump(all_laws_founded, f, indent=4, ensure_ascii=False )
-
-print(f"{n} Law Founded ! ")
-print(f"{len(not_found_ids)} Law Not Founded ! ")
--- a/ner_dataset/normalizer.py
+++ b/ner_dataset/normalizer.py