# RD_relation/relation/sections_embedding.py
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer, util
# from normalizer import cleaning  # cleaning() is used by find_similarity() and save_lawtitle_embedding(); re-enable this import before calling them
from funcs import write_to_json, read_from_json, read_file_by_address
from transformers import AutoTokenizer, AutoModel
import torch
import json
import datetime
import pandas as pd
# MODEL_NAME = '/home/gpu/tnlp/jokar/Models/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt'
MODEL_NAME = "/home/gpu/NLP/MLM/CODES/BERT/finetune/MODELS/roberta-fa-zwnj-base-law-2-pt"
# Persian BERT model
#MODEL_NAME = "HooshvareLab/bert-fa-base-uncased"
# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
encoder = SentenceTransformer(MODEL_NAME)
sections = read_from_json('./data/joint_qanon_170k_newface.json')
stop_words = read_file_by_address("./data/stop_words_big.txt").splitlines()

def find_similarity():
    # Two lists of sentences
    sentences2 = [cleaning("قانون حمایت از خانواده و جوانی جمعیت")]
    sentences1 = [
        cleaning("قانون خانواده"),
        cleaning("قانون جمعیت"),
        cleaning("قانون جوانی جمعیت"),
        cleaning("قانون خانواده و جوانی جمعیت"),
        cleaning("جمعیت ایران"),
        cleaning("جوانی جمعیت"),
        cleaning("حمایت از خانواده"),
    ]
    embeddings1 = encoder.encode(sentences1, convert_to_tensor=True)
    embeddings2 = encoder.encode(sentences2, convert_to_tensor=True)
    print(embeddings2)

    # Compute cosine similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    # print(cosine_scores)

    # Output the pairs with their score
    for i in range(len(sentences1)):
        print("{} \t\t {} \t\t Score: {:.4f}".format(
            sentences1[i], sentences2[0], cosine_scores[i][0]
        ))

def get_embedding(text):
    embedded_text = encoder.encode(text, convert_to_tensor=True)
    return embedded_text

def save_lawtitle_embedding():
    law_dict = read_from_json('./jokar/llm_services/law_title.json')
    for section_index, section in enumerate(law_dict):
        print(f"law row: {section_index + 1}")
        caption = section['caption']
        caption = cleaning(caption)
        caption_embedding = get_embedding(caption)
        section['caption_embedding'] = str(caption_embedding)
    write_to_json(law_dict, './jokar/llm_services/law_title_new222.json')

def get_word_embeddings(base_sentence):
    """
    Convert the words of a sentence into vectors using the BERT model.
    """
    sentence_tokens = base_sentence.split()
    # Preprocess tokens: drop stop words and purely numeric tokens.
    # (Filtering into a new list avoids the skipped items caused by removing
    # from a list while iterating over it.)
    sentence_tokens = [
        tkn for tkn in sentence_tokens
        if tkn not in stop_words and not tkn.isdigit()
    ]
    sentence = " ".join(sentence_tokens)

    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    # Get the word-piece tokens (special tokens such as [CLS] are filtered out below)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    try:
        # Run the BERT model
        with torch.no_grad():
            outputs = model(**inputs)
    except Exception:
        print("fault")
        return {}

    # Output vectors for each token
    token_embeddings = outputs.last_hidden_state[0]  # (seq_length, hidden_size)

    # Map tokens to their embeddings, dropping special tokens
    token_to_embedding = {
        token: embedding for token, embedding in zip(tokens, token_embeddings)
        if token not in tokenizer.all_special_tokens
    }
    return token_to_embedding

def compute_distance(embedding1, embedding2):
    """
    Compute the cosine distance between two vectors.
    """
    return cosine(embedding1.numpy(), embedding2.numpy())
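
# Usage sketch (not part of the original pipeline, never called): it shows how
# get_word_embeddings() and compute_distance() are meant to be combined to
# compare the embedding of the same word in two different contexts. The two
# sentences and the probe token "قانون" are illustrative assumptions.
def example_token_distance():
    embs_a = get_word_embeddings("قانون حمایت از خانواده")
    embs_b = get_word_embeddings("قانون جوانی جمعیت")
    # Only compare if the probe token survived tokenization in both sentences
    if "قانون" in embs_a and "قانون" in embs_b:
        distance = compute_distance(embs_a["قانون"], embs_b["قانون"])
        print("cosine distance for 'قانون': {:.4f}".format(distance))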

print(datetime.datetime.now())
# find_similarity()

sections_embeddings = []
fault_counter = 0
for i, section in enumerate(sections):
    if section['is-long']:
        continue
    if i == 10:  # only the first 10 entries are processed
        break
    print('section: ' + str(i + 1))
    content = section['content']
    tokens_embedding = get_word_embeddings(content)
    new_token_embds = {}
    for key, embd_value in tokens_embedding.items():
        # Convert the tensor to a list
        tensor_list = embd_value.tolist()
        # Serialize the list as JSON
        json_data = json.dumps(tensor_list)
        new_token_embds[key] = json_data
    section['tokens_embs'] = new_token_embds

write_to_json(sections, './data/sections_170k_embds.json')
print("operation_finished")
print(datetime.datetime.now())
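
# Sketch of the reverse step (not part of the original script, never called):
# the loop above stores each token embedding as a JSON string, so loading it
# back means json.loads() followed by torch.tensor(). The default path is the
# output file written above; the function itself is illustrative.
def load_section_token_embeddings(path='./data/sections_170k_embds.json'):
    saved_sections = read_from_json(path)
    for section in saved_sections:
        token_embs = section.get('tokens_embs', {})
        # Rebuild a token -> tensor mapping from the serialized JSON strings
        section['tokens_embs'] = {
            token: torch.tensor(json.loads(json_str))
            for token, json_str in token_embs.items()
        }
    return saved_sections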