# ai_dataset/ner_dataset/find_law.py

# بسم الله


from elastic_helper import ElasticHelper
from thefuzz import fuzz
import json


# Load the reference records (a JSON document) that will be searched for law titles.
# The path is written with escaped backslashes: the former '.\data\DATASET...' literal
# relied on the invalid escape sequences "\d" and "\D". The resulting path is unchanged.
# The `with` block closes the file handle once the JSON has been parsed.
with open(".\\data\\DATASET140402_no_arefـoutput.json", "r", encoding="utf8") as Read:
    RefList = json.load(Read)

# Archive holding the law sections; it is iterated through the project's ElasticHelper.
path = ".\\data\\mj_qa_section-v02.zip"
eh_obj = ElasticHelper()
# NOTE(review): presumably yields dicts with 'id' and 'source' keys (that is how the
# result is consumed further down) - confirm against ElasticHelper.iterateJsonFile.
sections = eh_obj.iterateJsonFile(path, True)


# Collect the record IDs listed in no_find_txt.txt.
# Only every second line (the 2nd, 4th, ... line of the file) is parsed as an integer ID;
# the lines in between are skipped, exactly as the previous 0/1 toggle did.
no_found_id = []
with open(".\\no_find_txt.txt", "r", encoding="utf8") as txt_file:
    for line_number, line in enumerate(txt_file):
        if line_number % 2 == 1:
            no_found_id.append(int(line.strip()))


# Flatten every section into the compact record used for title matching:
# its id, the stripped content (used as the law caption) and its 'ts_date'.
all_law_dict = [
    {
        "id": entry['id'],
        "caption": entry['source']['content'].strip(),
        "approve_date": entry['source']['ts_date'],
    }
    for entry in sections
]


def law_dict_saver(law_id, start_token_index, end_token_index, found_law_list,
                   law_captions, matched_string, original_string, multi_flag):
    """Pack the details of one recognised law reference into a dictionary.

    The returned dictionary has one entry per argument, keyed by the argument's
    own name, in the same order as the parameters.
    """
    field_names = (
        "law_id",
        "start_token_index",
        "end_token_index",
        "found_law_list",
        "law_captions",
        "matched_string",
        "original_string",
        "multi_flag",
    )
    field_values = (
        law_id,
        start_token_index,
        end_token_index,
        found_law_list,
        law_captions,
        matched_string,
        original_string,
        multi_flag,
    )
    return {name: value for name, value in zip(field_names, field_values)}

def remove_latest_added_token(text):
    """Return *text* without its last space-separated token.

    The text is stripped, split on single spaces, the final piece is dropped and
    the remaining pieces are joined with single spaces and stripped again.
    """
    kept_pieces = text.strip().split(' ')[:-1]
    return ' '.join(kept_pieces).strip()

def _laws_starting_with(first_token, law_dict):
    """Return the laws whose caption begins with *first_token*.

    A leading 'قانون' token in the caption is skipped, because the candidate
    token windows never include the keyword itself.
    """
    selected = []
    for law in law_dict:
        caption = law['caption']
        # Cheap substring pre-filter before tokenising the caption.
        if first_token not in caption:
            continue
        caption_tokens = caption.strip().split(' ')
        caption_first_token = caption_tokens[0]
        if caption_first_token == 'قانون':
            # A caption made of the bare keyword has no second token to compare with.
            if len(caption_tokens) < 2:
                continue
            caption_first_token = caption_tokens[1]
        if first_token == caption_first_token:
            selected.append({"id": law['id'], "caption": caption, "approve_date": law['approve_date']})
    return selected


def _laws_matching_section(law_section, laws):
    """Keep the laws whose caption scores 100 against *law_section* with token_set_ratio."""
    return [
        {"id": law['id'], "caption": law['caption'], "approve_date": law['approve_date']}
        for law in laws
        if fuzz.token_set_ratio(law['caption'], law_section) == 100
    ]


def _split_newest(laws):
    """Sort *laws* by 'approve_date' and return (last law, remaining laws in ascending order)."""
    ordered = sorted(laws, key=lambda law: law['approve_date'])
    newest = ordered.pop()
    return newest, ordered


def _record_unique_first_token_law(law, start_token_index, law_token):
    """Build the records for a candidate whose first token selects exactly one law.

    The candidate tokens are compared position by position with the caption tokens
    (caption taken without a leading 'قانون'), stopping at the first mismatch or
    when either token list is exhausted.
    """
    caption = law['caption'].strip()
    if caption.startswith('قانون'):
        caption = caption[5:]
    caption_tokens = caption.strip().split()

    found_law = [law]
    matched_tokens = []
    records = []
    # NOTE(review): a record is appended after EVERY compared position (one per matched
    # prefix, plus one at the first mismatch), so the same law can appear several times.
    # Kept as-is; confirm whether a single record per candidate was intended.
    for text_token, caption_token in zip(law_token, caption_tokens):
        tokens_agree = text_token == caption_token
        if tokens_agree:
            matched_tokens.append(text_token)
        records.append(law_dict_saver(
            law['id'],
            start_token_index,
            start_token_index + len(matched_tokens),
            found_law,
            law['caption'],
            ' '.join(matched_tokens),
            law_token,
            False,
        ))
        if not tokens_agree:
            break
    return records


def _match_candidate(start_token_index, law_token, law_dict):
    """Match one candidate token window against the law bank and return its records."""
    window = len(law_token)
    # The narrowing below needs at least two tokens; shorter windows never produce a match.
    if window < 2:
        return []

    previous_laws = _laws_starting_with(law_token[0], law_dict)
    if not previous_laws:
        return []
    if len(previous_laws) == 1:
        return _record_unique_first_token_law(previous_laws[0], start_token_index, law_token)

    # Several laws share the first token: narrow them down by growing the compared
    # section one token at a time.
    law_section = law_token[0] + ' ' + law_token[1]
    current_laws = _laws_matching_section(law_section, previous_laws)

    for step in range(2, window):
        section_end_index = start_token_index + len(law_section.strip().split())

        if not current_laws:
            # The token added last matched nothing. If the previous step left a small
            # ambiguous set (2 to 5 laws), report the one with the greatest approve_date
            # and flag the result as ambiguous.
            if 1 < len(previous_laws) < 6:
                newest, remaining = _split_newest(previous_laws)
                # NOTE(review): the end index is computed before the unmatched token is
                # removed from the section, so it still counts that token - confirm.
                return [law_dict_saver(
                    newest['id'],
                    start_token_index,
                    section_end_index,
                    remaining,
                    newest['caption'],
                    remove_latest_added_token(law_section),
                    law_token,
                    True,
                )]
            return []

        if len(current_laws) == 1:
            # Narrowed down to exactly one law: report it and stop.
            law = current_laws[0]
            return [law_dict_saver(
                law['id'],
                start_token_index,
                section_end_index,
                [law],
                law['caption'],
                law_section,
                law_token,
                False,
            )]

        if step == window - 1:
            # Last step with several laws still matching: report an ambiguous match
            # only when the set is small (fewer than 6).
            if len(current_laws) < 6:
                # NOTE(review): the reported law is the newest of the PREVIOUS step's
                # list, while the attached list is the current one - confirm intent.
                newest, _ = _split_newest(previous_laws)
                return [law_dict_saver(
                    newest['id'],
                    start_token_index,
                    section_end_index,
                    current_laws,
                    newest['caption'],
                    law_section,
                    law_token,
                    True,
                )]
            return []

        law_section += ' ' + law_token[step]
        previous_laws, current_laws = current_laws, _laws_matching_section(law_section, current_laws)

    return []


def law_recognizer(text, law_dict):
    """Find law titles mentioned in *text* by matching against the captions in *law_dict*.

    Every token containing 'قانون' starts a candidate; up to 12 tokens following it
    form the candidate's token window. Each window is compared with the law captions,
    first by its first token and then by progressively longer token sections.

    Args:
        text: The text to search.
        law_dict: Iterable of dicts with the keys 'id', 'caption' and 'approve_date'.

    Returns:
        A tuple ``(matched_law_list, law_token_list)``:

        * ``matched_law_list`` - records built by ``law_dict_saver``; the
          'found_law_list' entry of each record is a list of
          ``"caption#id#approve_date"`` strings.
        * ``law_token_list`` - one dict per candidate with 'start_token_index'
          (position of the keyword token) and 'law_token' (the window tokens).
    """
    max_window = 12  # how many tokens after the keyword are examined
    text_token_list = text.strip().split()

    # Each window is sliced per candidate, so a keyword close to the end of the text
    # simply gets all remaining tokens, and one candidate's window size never limits
    # the matching of another candidate.
    law_token_list = [
        {
            'start_token_index': position,
            'law_token': text_token_list[position + 1:position + 1 + max_window],
        }
        for position, token in enumerate(text_token_list)
        if 'قانون' in token
    ]

    matched_law_list = []
    for candidate in law_token_list:
        law_token = candidate['law_token']
        # Nothing follows the keyword: skip this candidate and go on to the next one.
        if not law_token:
            continue
        matched_law_list.extend(
            _match_candidate(candidate['start_token_index'], law_token, law_dict)
        )

    # Replace the candidate-law dicts by compact strings; formatting through an
    # f-string also works when id or approve_date is not a string.
    for law_item in matched_law_list:
        law_item['found_law_list'] = [
            f"{item['caption']}#{item['id']}#{item['approve_date']}"
            for item in law_item['found_law_list']
        ]
    return matched_law_list, law_token_list

# Run the recogniser over every reference record whose ID is listed in no_found_id,
# then write the matched and unmatched results to disk.
n = 0                  # number of records with at least one recognised law
not_found_ids = []     # IDs of processed records in which no law was recognised
all_laws_founded = []  # one entry per record with at least one recognised law

# A set gives constant-time membership tests instead of scanning the list per record.
pending_ids = set(no_found_id)

for section in RefList:
    refID, Content = section['id'], section['content'].strip()
    print(f"ID {refID} is searching... ")
    if refID not in pending_ids:
        continue

    matched_law_list, law_token_list = law_recognizer(Content, all_law_dict)
    if matched_law_list:
        n += 1
        all_laws_founded.append({
            "dataset-REF": {"id": refID, "content": Content},
            "All-REF": {
                "id": [law['law_id'] for law in matched_law_list],
                "content": [law['law_captions'] for law in matched_law_list],
            },
        })
    else:
        not_found_ids.append(refID)

# One ID per line.
txt = ''.join(f"{id_}\n" for id_ in not_found_ids)

with open("not_found_idsX.txt", "w", encoding="utf8") as file:
    file.write(txt)

with open("founded_lawsX.json", "w", encoding="utf8") as f:
    json.dump(all_laws_founded, f, indent=4, ensure_ascii=False)

print(f"{n} Law Founded ! ")
print(f"{len(not_found_ids)} Law Not Founded ! ")