import json
from sentence_transformers import SentenceTransformer, util
# from normalizer import cleaning
import os
import torch
# from inference import inference_main
from funcs import read_from_json, save_to_file_by_address, add_columndata_to_excel, read_from_excel
from datetime import datetime
# from general_functions import normalize_content

# Alternative encoder checkpoints tried earlier:
# model_path = './paraphrase-multilingual-mpnet-base-v2-1401-07-30'
# model_path = '/home/gpu/NLP/MLM/MODELS/training_stsbenchmark-HooshvareLab-bert-fa-base-uncased-finetuned-2-pt-2024-02-20_16-55-15'


def find_similarity(value_1, value_2):
    """Return the cosine similarity of two embedding vectors as a 1x1 tensor."""
    value_1 = [value_1]
    value_2 = [value_2]
    # If a GPU device is available, the vectors could be moved there first;
    # otherwise the CPU is used:
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # value_1 = torch.tensor(value_1, device=device)
    # value_2 = torch.tensor(value_2, device=device)

    # Compute cosine similarity
    cosine_scores = util.cos_sim(value_1, value_2)
    return cosine_scores


def get_embedding(text):
    """Encode `text` with the globally loaded SentenceTransformer encoder."""
    # text = cleaning(text)
    embedded_value = encoder.encode(text, convert_to_tensor=True)
    return embedded_value


def method():
    """Read NER results from the Excel file, link each REFERENCE span to the
    most similar law title, and write the enriched results back as a new
    column."""
    # model = "orgcatorg/xlm-v-base-ner"
    file_address = current_path + '/data/eee.xlsx'  # archive.tgz
    column_name = "ai_nlp_results"
    sections_ner = read_from_excel(file_address, column_name)

    # Sample of the NER output format this function expects (kept as a
    # disabled test fixture):
    '''vals = Span[0:2]: "ماده 9"/HALFREFERENCE / 1.0
    Span[7:10]: "ماده (5"/HALFREFERENCE / 1.0
    Span[11:15]: "قانون مدیریت خدمات کشوری"/REFERENCE / 1.0
    Span[16:21]: "8 /7 /1386"/DATE2 / 1.0
    Span[27:31]: "ماده (5)"/HALFREFERENCE / 0.86
    Span[31:35]: "قانون محاسبات عمومی کشور"/REFERENCE / 1.0
    Span[36:41]: "1 /6 /1366"/DATE2 / 1.0
    Span[53:56]: "ماده (2"/HALFREFERENCE / 1.0
    Span[56:64]: ") قانون استخدام نیروی انتظامی جمهوری اسلامی ایران"/REFERENCE / 1.0
    Span[65:70]: "20 /12 /1382"/DATE2 / 1.0
    Span[86:88]: "این قانون"/HALFREFERENCE / 1.0
    Span[8:13]: "1401/5/3"/DATE2 / 1.0
    Span[16:21]: "قانون خانواده و جوانی جمعیت"/REFERENCE / 0.96
    Span[1:2]: "روز"/DATE / 0.99
    Span[2:12]: "نوزدهم دی ماه سال یکهزار وسیصد و نود و سه"/DATE / 1.0
    Span[5:8]: "سازمان محیط زیست"/ORG / 1.0
    Span[10:14]: "سازمان جوانان هلال احمر"/ORG / 1.0
    Span[16:23]: "قانون صیانت از کاربران در فضای مجازی"/REFERENCE / 1.0
    Span[26:29]: "قانون هوای پاک"/REFERENCE / 1.0
    Span[35:38]: "سازمان محیط زیست"/ORG / 1.0'''
    # sections_ner.append(vals)

    print(f'number of sections: {len(sections_ner)}')
    new_nlp_list = []
    for index, detected_ner in enumerate(sections_ner):
        print(f'section number: {index + 1}')
        print('*' * 80)
        ner_items = detected_ner.split('\n')
        new_nlp = ''
        for ner in ner_items:
            try:
                # Each item looks like: Span[a:b]: "<value>"/<TYPE> / <score>
                ner_parts = ner.split('"')
                detected_value = ner_parts[1].strip()
                detected_type = ner_parts[2].lstrip("/")
                detected_type = detected_type.split("/")[0].strip()
            except IndexError:
                # Line does not match the expected Span format; skip it
                continue
            if detected_type == 'REFERENCE':
                try:
                    found_law = find_related_law(detected_value)
                    found_item = f"""########################################################################
{detected_value} | {detected_type}
found_law: {found_law['caption']} | score: {found_law['similarity']}
url: https://qavanin.ir/Law/TreeText/{found_law['law_id']}
########################################################################\n"""
                    # Save the matched record to a text file as well
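                    # (`save_to_file_by_address` comes from the local `funcs`
                    # module and is assumed here to append `found_item` to the
                    # given path; a hedged sketch of these helpers appears at
                    # the end of this file.)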
                    save_to_file_by_address(current_path + '/founded_law_3.txt', found_item)
                    new_nlp += found_item
                except Exception:
                    # Law lookup failed; keep the raw NER line and flag it
                    new_nlp += ner + ' ??????????????? \n'
            else:
                new_nlp += ner + '\n'
        new_nlp_list.append(new_nlp)

    new_column_name = 'ai_nlp_values_6'
    add_columndata_to_excel(file_address, new_column_name, new_nlp_list)
    print('*** excel file updated! ***')
    return True


def find_related_law(detected_value):
    """Return the law whose title embedding is most similar to `detected_value`."""
    similarity_arr = []
    detected_value = pre_process(detected_value)
    # Remove the leading word "قانون" ("law") from the token so it matches the
    # captions whose embeddings are stored in the JSON file. Note that
    # str.lstrip() strips a *set of characters*, not a prefix, so
    # str.removeprefix() (Python 3.9+) is the correct call here.
    detected_value = detected_value.removeprefix('قانون').strip()
    detected_value_embedding = get_embedding(detected_value)
    for law in law_dict:
        caption_embedding = law['caption_embedding']
        # .item() turns the 1x1 similarity tensor into a plain float so the
        # scores sort and print cleanly
        similarity_value = find_similarity(detected_value_embedding.tolist(), caption_embedding).item()
        similarity_arr.append({'law_id': law['id'],
                               'similarity': similarity_value,
                               'caption': law['caption']})
    sorted_similarity_arr = sorted(similarity_arr, key=lambda x: x['similarity'], reverse=True)
    found_law = sorted_similarity_arr[0]
    return found_law


def pre_process(text):
    # text = normalize_content(text)
    return text


if __name__ == "__main__":
    model_path = '/home/gpu/tnlp/jokar/Models/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt'
    encoder = SentenceTransformer(model_path)
    current_path = os.getcwd()
    if 'jokar' not in current_path:
        current_path = os.getcwd() + '/jokar/Flair_NER'
    law_dict = read_from_json(current_path + '/data/law_title.json')

    # find_related_law('d')
    method()
    print('operation finished!')
    print(datetime.now())
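

# ---------------------------------------------------------------------------
# The helpers imported from the local `funcs` module are not shown in this
# file. The sketch below is a minimal assumption of their behavior, inferred
# only from how they are called above; it is NOT the actual implementation.
# Each entry of data/law_title.json is likewise assumed to look like:
#   {"id": "...", "caption": "...", "caption_embedding": [0.12, -0.03, ...]}
# with `caption_embedding` precomputed by the same SentenceTransformer model.
#
# def read_from_json(file_address):
#     """Load and return parsed JSON (here: the list of law dicts)."""
#     with open(file_address, encoding='utf-8') as f:
#         return json.load(f)
#
# def save_to_file_by_address(file_address, content):
#     """Append `content` to the text file at `file_address`."""
#     with open(file_address, 'a', encoding='utf-8') as f:
#         f.write(content)
#
# A hypothetical one-off script (not part of this project) could build the
# JSON file by encoding each caption, mirroring the removeprefix() cleanup
# used in find_related_law():
#
# def build_law_title_json(laws, out_path):
#     """laws: iterable of {'id': ..., 'caption': ...} dicts."""
#     enriched = []
#     for law in laws:
#         emb = encoder.encode(law['caption'].removeprefix('قانون').strip(),
#                              convert_to_tensor=True)
#         enriched.append({**law, 'caption_embedding': emb.tolist()})
#     with open(out_path, 'w', encoding='utf-8') as f:
#         json.dump(enriched, f, ensure_ascii=False)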