"""Compute sentence embeddings for law titles and store them in a JSON file.

Running this module loads the sentence-transformer model at ``model_path``,
embeds the ``caption`` of every record in the law-title JSON file, and writes
the enriched records to a new JSON file.
"""
import json
from sentence_transformers import SentenceTransformer, util
# from normalizer import cleaning
import os
from general_functions import normalize_content
from datetime import datetime

# Checkpoints used previously, kept for reference:
#model_path = './paraphrase-multilingual-mpnet-base-v2-1401-07-30'
#model_path = '/home/gpu/NLP/MLM/MODELS/training_stsbenchmark-HooshvareLab-bert-fa-base-uncased-finetuned-2-pt-2024-02-20_16-55-15'
model_path = '/home/gpu/tnlp/jokar/Models/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt'
encoder = SentenceTransformer(model_path)


def find_similarity():
    """Print the cosine similarity between a reference law title and variants.

    Every candidate phrase is embedded with the module-level ``encoder`` and
    compared against the single reference title; one line per candidate is
    printed with its score.
    """
    # NOTE(review): the original called `cleaning`, whose import is commented
    # out at the top of the file, so this function raised NameError. It now
    # uses `normalize_content`, the normalizer applied before the stored
    # caption embeddings are computed -- confirm the two are interchangeable.
    reference_sentences = [
        normalize_content("قانون حمایت از خانواده و جوانی جمعیت"),
    ]
    candidate_sentences = [
        normalize_content("قانون خانواده"),
        normalize_content("قانون جمعیت"),
        normalize_content("قانون جوانی جمعیت"),
        normalize_content("قانون خانواده و جوانی جمعیت"),
        normalize_content("جمعیت ایران"),
        normalize_content("جوانی جمعیت"),
        normalize_content("حمایت از خانواده"),
    ]

    candidate_embeddings = encoder.encode(candidate_sentences, convert_to_tensor=True)
    reference_embeddings = encoder.encode(reference_sentences, convert_to_tensor=True)
    print(reference_embeddings)

    # One row per candidate, one column per reference sentence.
    cosine_scores = util.cos_sim(candidate_embeddings, reference_embeddings)

    reference = reference_sentences[0]
    for candidate, score_row in zip(candidate_sentences, cosine_scores):
        print("{} \t\t {} \t\t Score: {:.4f}".format(candidate, reference, score_row[0]))


def get_embedding(text):
    """Return the embedding of ``text`` produced by the module-level encoder.

    The encoder is called with ``convert_to_tensor=True``, so the result is a
    tensor rather than a plain list.
    """
    return encoder.encode(text, convert_to_tensor=True)


def _strip_law_prefix(caption):
    """Remove one leading 'قانون' from ``caption`` and trim whitespace.

    ``str.lstrip('قانون')`` treats its argument as a set of characters, so it
    also ate the first letters of titles that merely start with one of those
    letters. An explicit prefix check removes only the whole word.
    """
    law_prefix = 'قانون'
    if caption.startswith(law_prefix):
        caption = caption[len(law_prefix):]
    return caption.strip()


def save_lawtitle_embedding():
    """Embed every law title's caption and save the enriched records.

    Reads the records from ``law_title.json``, strips a leading 'قانون' from
    each ``caption``, normalizes it, stores the embedding as a list under
    ``caption_embedding``, and writes all records to ``law_title_new222.json``.
    """
    law_dict = read_from_json('/home/gpu/tnlp/jokar/Flair_NER/data/law_title.json')

    for row_number, section in enumerate(law_dict, start=1):
        print(f"law row: {row_number}")
        caption = normalize_content(_strip_law_prefix(section['caption']))
        # Tensors are not JSON-serializable; store the values as nested lists.
        section['caption_embedding'] = get_embedding(caption).tolist()

    write_to_json(law_dict, '/home/gpu/tnlp/jokar/Flair_NER/data/law_title_new222.json')


def write_to_json(dict, file_address):
    """Serialize ``dict`` as pretty-printed UTF-8 JSON and write it to a file.

    Args:
        dict: JSON-serializable object to store. The name shadows the builtin
            but is kept so existing keyword callers keep working.
        file_address: Path of the file to create or overwrite.
    """
    # Serialize before opening the file so a serialization error does not
    # leave a truncated output file behind.
    json_data = json.dumps(dict, indent=2, ensure_ascii=False)

    with open(file_address, 'w', encoding='utf-8') as file:
        file.write(json_data)


def read_from_json(file_address):
    """Load a JSON file and return its top-level items as a new list.

    Args:
        file_address: Path of the UTF-8 encoded JSON file to read.

    Returns:
        A list holding each item obtained by iterating the loaded JSON value.
    """
    with open(file_address, 'r', encoding='utf-8') as file:
        return list(json.load(file))


#find_similarity()
save_lawtitle_embedding()
print(datetime.now())
print(' finished ')