# Flair_NER/similarity.py
#
# 88 lines
# 2.9 KiB
# Python

import json
from sentence_transformers import SentenceTransformer, util
# from normalizer import cleaning
import os
from general_functions import normalize_content
from datetime import datetime
# Alternative model locations, currently disabled:
#model_path = './paraphrase-multilingual-mpnet-base-v2-1401-07-30'
#model_path = '/home/gpu/NLP/MLM/MODELS/training_stsbenchmark-HooshvareLab-bert-fa-base-uncased-finetuned-2-pt-2024-02-20_16-55-15'
# Filesystem path of the SentenceTransformer model that is loaded below.
model_path = '/home/gpu/tnlp/jokar/Models/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt'
# Shared encoder instance. It is created once, when this module is imported,
# and is used by find_similarity() and get_embedding().
encoder = SentenceTransformer(model_path)
def find_similarity():
    """Print cosine similarities between candidate titles and one reference title.

    A fixed list of short Persian phrases is normalized, encoded with the
    module-level ``encoder`` and compared against a single reference law
    title. The reference embedding tensor is printed first, followed by one
    line per candidate containing the candidate, the reference and the
    cosine score formatted with four decimals.

    Returns:
        None. All output goes to stdout.
    """
    # NOTE(review): the original called `cleaning`, whose import from
    # `normalizer` is commented out at the top of the file, so every call
    # raised NameError. `normalize_content` is the normalizer this module
    # actually imports and uses before encoding captions — confirm it is an
    # acceptable replacement for `cleaning`.
    reference_sentences = [
        normalize_content("قانون حمایت از خانواده و جوانی جمعیت"),
    ]
    candidate_sentences = [
        normalize_content("قانون خانواده"),
        normalize_content("قانون جمعیت"),
        normalize_content("قانون جوانی جمعیت"),
        normalize_content("قانون خانواده و جوانی جمعیت"),
        normalize_content("جمعیت ایران"),
        normalize_content("جوانی جمعیت"),
        normalize_content("حمایت از خانواده"),
    ]

    candidate_embeddings = encoder.encode(candidate_sentences, convert_to_tensor=True)
    reference_embeddings = encoder.encode(reference_sentences, convert_to_tensor=True)
    print(reference_embeddings)

    # One row per candidate, one column per reference sentence.
    cosine_scores = util.cos_sim(candidate_embeddings, reference_embeddings)

    # Report every candidate against the single reference (column 0).
    for row, candidate in enumerate(candidate_sentences):
        print("{} \t\t {} \t\t Score: {:.4f}".format(
            candidate, reference_sentences[0], cosine_scores[row][0]
        ))
def get_embedding(text):
    """Encode ``text`` with the module-level ``encoder``.

    Args:
        text: The input handed unchanged to ``encoder.encode``.

    Returns:
        Whatever ``encoder.encode(text, convert_to_tensor=True)`` produces.
    """
    return encoder.encode(text, convert_to_tensor=True)
def save_lawtitle_embedding(
    source_path='/home/gpu/tnlp/jokar/Flair_NER/data/law_title.json',
    target_path='/home/gpu/tnlp/jokar/Flair_NER/data/law_title_new222.json',
):
    """Attach an embedding of each law caption and save the enriched records.

    Every record read from ``source_path`` must have a ``'caption'`` key.
    The caption loses a leading ``'قانون'`` prefix (if present) and any
    surrounding whitespace, is normalized with ``normalize_content`` and
    encoded with ``get_embedding``. The embedding is stored on the record
    under ``'caption_embedding'`` as a plain list so it can be serialized
    to JSON, and the full record list is written to ``target_path``.

    Args:
        source_path: JSON file holding the law records to process.
        target_path: JSON file that receives the records with embeddings.

    Raises:
        KeyError: If a record has no ``'caption'`` key.
    """
    law_prefix = 'قانون'
    law_dict = read_from_json(source_path)

    for row_number, section in enumerate(law_dict, start=1):
        print(f"law row: {row_number}")
        caption = section['caption']
        # str.lstrip(chars) treats its argument as a *set of characters*, not
        # as a prefix, so lstrip('قانون') also ate the leading letters of
        # captions that merely begin with ق/ا/ن/و (e.g. 'اساسنامه' became
        # 'ساسنامه'). Remove the exact prefix only.
        if caption.startswith(law_prefix):
            caption = caption[len(law_prefix):]
        caption = normalize_content(caption.strip())
        # Stored as a list because tensors are not JSON serializable.
        section['caption_embedding'] = get_embedding(caption).tolist()

    write_to_json(law_dict, target_path)
def write_to_json(dict, file_address):
    """Serialize ``dict`` as pretty-printed JSON and write it to ``file_address``.

    Args:
        dict: JSON-serializable object to store.
        file_address: Destination path; an existing file is overwritten.
    """
    # Serialize before opening the file so a serialization error does not
    # leave behind a truncated output file.
    serialized = json.dumps(dict, ensure_ascii=False, indent=2)
    # Store the text as UTF-8 (non-ASCII characters are kept unescaped).
    with open(file_address, mode='w', encoding='utf-8') as out_file:
        out_file.write(serialized)
def read_from_json(file_address):
    """Load a JSON file and return its top-level items as a new list.

    Args:
        file_address: Path of a UTF-8 encoded JSON file.

    Returns:
        A list built by iterating over the decoded JSON value: the elements
        of a JSON array, or the keys of a JSON object.
    """
    with open(file_address, mode='r', encoding='utf-8') as in_file:
        parsed = json.load(in_file)
    # Copy the iterated items into a fresh list.
    return list(parsed)
# Script body: these statements run whenever the module is executed or imported.
# find_similarity()  # similarity demo, currently disabled
save_lawtitle_embedding()
# Print the completion timestamp and the end marker on two separate lines.
print(datetime.now(), ' finished ', sep='\n')