# Law-title embedding script: encodes Persian law captions with a
# sentence-transformer model and stores the vectors alongside the records.
import json
|
|
from sentence_transformers import SentenceTransformer, util
|
|
# from normalizer import cleaning
|
|
import os
|
|
from general_functions import normalize_content
|
|
from datetime import datetime
|
|
|
|
|
|
# Previously used checkpoints, kept for reference:
#model_path = './paraphrase-multilingual-mpnet-base-v2-1401-07-30'
#model_path = '/home/gpu/NLP/MLM/MODELS/training_stsbenchmark-HooshvareLab-bert-fa-base-uncased-finetuned-2-pt-2024-02-20_16-55-15'

# Local path to a Persian BERT model fine-tuned for sentence similarity
# (HooshvareLab base, fine-tuned — see the alternative paths above).
model_path = '/home/gpu/tnlp/jokar/Models/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt'

# Shared sentence encoder used by every function in this module.
# NOTE(review): loaded eagerly at import time; importing this module pulls
# the full model into memory.
encoder = SentenceTransformer(model_path)
|
|
|
|
|
|
def find_similarity():
    """Print cosine similarities between candidate titles and a reference title.

    Demo/debug helper: encodes one reference law title and several candidate
    phrases with the module-level ``encoder``, then prints every candidate
    together with its cosine score against the reference. Returns nothing.
    """
    # BUG FIX: `cleaning` was referenced here, but its module-level import
    # was commented out, so calling this function raised NameError. Import
    # it locally so the rest of the module does not depend on `normalizer`.
    from normalizer import cleaning

    # Reference sentence: the full official law title.
    sentences2 = [
        cleaning("قانون حمایت از خانواده و جوانی جمعیت")
    ]

    # Candidate titles/phrases to score against the reference.
    sentences1 = [
        cleaning("قانون خانواده"),
        cleaning("قانون جمعیت"),
        cleaning("قانون جوانی جمعیت"),
        cleaning("قانون خانواده و جوانی جمعیت"),
        cleaning("جمعیت ایران"),
        cleaning("جوانی جمعیت"),
        cleaning("حمایت از خانواده"),
    ]

    embeddings1 = encoder.encode(sentences1, convert_to_tensor=True)
    embeddings2 = encoder.encode(sentences2, convert_to_tensor=True)
    print(embeddings2)

    # Cosine-similarity matrix, shape (len(sentences1), len(sentences2)).
    cosine_scores = util.cos_sim(embeddings1, embeddings2)

    # One line per candidate: candidate, reference, score.
    for sentence, scores_row in zip(sentences1, cosine_scores):
        print("{} \t\t {} \t\t Score: {:.4f}".format(
            sentence, sentences2[0], scores_row[0]
        ))
|
|
def get_embedding(text):
    """Encode *text* with the module-level sentence encoder.

    Returns the embedding as a tensor (``convert_to_tensor=True``).
    """
    return encoder.encode(text, convert_to_tensor=True)
|
|
|
|
def save_lawtitle_embedding():
    """Embed every law caption and persist the enriched records as JSON.

    Reads law records from ``law_title.json``, strips the leading word
    'قانون' ("law") from each caption, normalizes the remainder, computes
    its sentence embedding, stores it on the record under
    ``'caption_embedding'``, and writes all records to a new JSON file.
    """
    law_dict = read_from_json('/home/gpu/tnlp/jokar/Flair_NER/data/law_title.json')

    for section_index, section in enumerate(law_dict):
        # Progress indicator (the input file is large and encoding is slow).
        print(f"law row: {section_index + 1}")
        caption = section['caption']
        # BUG FIX: the original used caption.lstrip('قانون'), but str.lstrip
        # strips any leading characters from the SET {ق, ا, ن, و} and could
        # therefore eat the beginning of the actual title. Remove exactly the
        # prefix word instead.
        if caption.startswith('قانون'):
            caption = caption[len('قانون'):]
        caption = caption.strip()
        caption = normalize_content(caption)
        caption_embedding = get_embedding(caption)
        # Tensor -> plain list so the record is JSON-serializable.
        section['caption_embedding'] = caption_embedding.tolist()

    write_to_json(law_dict, '/home/gpu/tnlp/jokar/Flair_NER/data/law_title_new222.json')
|
|
|
|
def write_to_json(data, file_address):
    """Serialize *data* to *file_address* as pretty-printed UTF-8 JSON.

    ``ensure_ascii=False`` keeps the Persian text human-readable in the
    output file instead of escaping it to \\uXXXX sequences.
    """
    # NOTE: the parameter was originally named `dict`, shadowing the builtin;
    # renamed to `data` (all in-file callers pass it positionally).
    # Stream directly to the file instead of building the whole JSON string
    # in memory first.
    with open(file_address, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=2, ensure_ascii=False)
|
|
|
|
def read_from_json(file_address):
    """Load a JSON file and return its top-level content as a list.

    The original copied the loaded value item-by-item into a new list;
    ``list(...)`` is equivalent (including the edge case where the file
    holds an object: iterating a dict yields its keys, as before).
    """
    with open(file_address, 'r', encoding='utf-8') as file:
        return list(json.load(file))
|
|
|
|
#find_similarity()

if __name__ == '__main__':
    # Guard the pipeline behind __main__ so that importing this module
    # (e.g. to reuse get_embedding) does not trigger the full embedding run.
    save_lawtitle_embedding()
    print(datetime.now())
    print(' finished ')