148 lines
4.8 KiB
148 lines
4.8 KiB
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer, util
# from normalizer import cleaning
from funcs import write_to_json, read_from_json, read_file_by_address
from transformers import AutoTokenizer, AutoModel
import torch
import json
import datetime
import pandas as pd
# Checkpoint selection: exactly one MODEL_NAME assignment is active; the
# commented-out assignments are alternative checkpoints kept for reference.
# MODEL_NAME = '/home/gpu/tnlp/jokar/Models/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt'
MODEL_NAME = "/home/gpu/NLP/MLM/CODES/BERT/finetune/MODELS/roberta-fa-zwnj-base-law-2-pt"
# Persian BERT model (public checkpoint)
#MODEL_NAME = "HooshvareLab/bert-fa-base-uncased"
# Load the tokenizer and the model; get_word_embeddings uses both to produce
# per-token vectors.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# Sentence-level encoder built from the same checkpoint; used by
# find_similarity and get_embedding.
encoder = SentenceTransformer(MODEL_NAME)
# Section records iterated by the embedding loop at the end of this file.
sections = read_from_json('./data/joint_qanon_170k_newface.json')
# Stop-word list: one entry per line of the stop-words file.
stop_words = read_file_by_address("./data/stop_words_big.txt").splitlines()
def find_similarity():
    """Print the cosine similarity between a reference law title and candidates.

    Every phrase is passed through ``cleaning`` and encoded with the
    module-level ``encoder``. One line is printed per candidate, showing the
    candidate, the reference title and their similarity score.

    NOTE(review): ``cleaning`` is not imported in the visible part of this
    file (the ``normalizer`` import is commented out) -- confirm it is in
    scope before calling this function.
    """
    # Reference title that every candidate is compared against.
    sentences2 = [cleaning("قانون حمایت از خانواده و جوانی جمعیت")]
    # Candidate phrases.
    sentences1 = [
        cleaning("قانون خانواده"),
        cleaning("قانون جمعیت"),
        cleaning("قانون جوانی جمعیت"),
        cleaning("قانون خانواده و جوانی جمعیت"),
        cleaning("جمعیت ایران"),
        cleaning("جوانی جمعیت"),
        cleaning("حمایت از خانواده"),
    ]

    embeddings1 = encoder.encode(sentences1, convert_to_tensor=True)
    embeddings2 = encoder.encode(sentences2, convert_to_tensor=True)

    # Score matrix: row i holds the similarity of candidate i to each
    # reference sentence (only one reference here, hence column 0 below).
    cosine_scores = util.cos_sim(embeddings1, embeddings2)

    # Output each candidate with its score against the reference title.
    for candidate, scores in zip(sentences1, cosine_scores):
        print("{} \t\t {} \t\t Score: {:.4f}".format(
            candidate, sentences2[0], scores[0]
        ))
def get_embedding(text):
    """Encode ``text`` with the module-level sentence encoder.

    Args:
        text: Input handed unchanged to ``encoder.encode``.

    Returns:
        The result of ``encoder.encode(text, convert_to_tensor=True)``.
    """
    return encoder.encode(text, convert_to_tensor=True)
def save_lawtitle_embedding():
    """Attach a caption embedding to every law-title record and return them.

    Records are read from ``./jokar/llm_services/law_title.json``. For each
    record the ``caption`` value is passed through ``cleaning``, embedded with
    ``get_embedding`` and stored under ``caption_embedding`` as ``str(...)`` of
    the embedding.

    Returns:
        The loaded records, each updated in place with ``caption_embedding``.

    NOTE(review): ``str()`` stores the tensor's printed representation rather
    than its raw values -- confirm downstream readers expect that format.
    NOTE(review): nothing is written to disk here despite the function name;
    the records are returned so the caller can persist them -- confirm.
    NOTE(review): ``cleaning`` is not imported in the visible part of this
    file (the ``normalizer`` import is commented out) -- confirm it is in
    scope.
    """
    law_dict = read_from_json('./jokar/llm_services/law_title.json')
    for row_number, section in enumerate(law_dict, start=1):
        # Progress output, 1-based.
        print(f"law row: {row_number}")
        caption = cleaning(section['caption'])
        caption_embedding = get_embedding(caption)
        section['caption_embedding'] = str(caption_embedding)
    return law_dict
def get_word_embeddings(base_sentence):
    """Convert the tokens of a sentence to vectors using the BERT model.

    The sentence is split on whitespace, words found in ``stop_words`` and
    purely numeric words are dropped, and the remaining text is run through
    the module-level ``tokenizer`` and ``model``.

    Args:
        base_sentence: Text to embed.

    Returns:
        A dict mapping each token string to its row of
        ``outputs.last_hidden_state[0]``, with the tokenizer's special tokens
        left out. The dict is keyed by token text, so a token that occurs
        more than once keeps only the vector of its last occurrence. An empty
        dict is returned when no words survive filtering or when the model
        call fails.
    """
    # Pre-process the words: build a filtered list instead of mutating the
    # list being iterated; a set makes each stop-word lookup O(1).
    excluded = set(stop_words)
    kept_words = [
        word for word in base_sentence.split()
        if word not in excluded and not word.isdigit()
    ]
    if not kept_words:
        return {}
    sentence = " ".join(kept_words)

    # Tokenize the sentence.
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    # Token strings for the first (only) sequence, special tokens included.
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Run the model without tracking gradients.
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    except (RuntimeError, IndexError):
        # Best-effort failure path: report "no embeddings" rather than abort
        # the caller's loop.
        # NOTE(review): the exact failure this path is meant to cover is not
        # visible in the reviewed source -- confirm the exception types.
        return {}

    # Output vectors for every token.
    token_embeddings = outputs.last_hidden_state[0]  # (seq_length, hidden_size)

    # Pair tokens with their vectors, dropping special tokens such as [CLS].
    special_tokens = set(tokenizer.all_special_tokens)
    return {
        token: embedding
        for token, embedding in zip(tokens, token_embeddings)
        if token not in special_tokens
    }
def compute_distance(embedding1, embedding2):
    """Compute the cosine distance between two embedding tensors.

    Args:
        embedding1: 1-D torch tensor.
        embedding2: 1-D torch tensor of the same length as ``embedding1``.

    Returns:
        The value of ``scipy.spatial.distance.cosine`` for the two vectors
        (0 for vectors pointing the same way, 1 for orthogonal vectors).
    """
    # detach() and cpu() make the NumPy conversion work for tensors that
    # track gradients or live on a GPU; plain CPU tensors are unaffected.
    vector1 = embedding1.detach().cpu().numpy()
    vector2 = embedding2.detach().cpu().numpy()
    return cosine(vector1, vector2)
# Script section: compute per-token embeddings for the loaded sections and
# write the enriched records to a JSON file.
sections_embeddings = []
# Number of processed sections for which no token embeddings were produced.
fault_counter = 0

for i, section in enumerate(sections):
    # NOTE(review): the body of this check is missing from the reviewed
    # source; skipping long sections is assumed -- confirm.
    if section['is-long']:
        continue
    # Stop at index 10, so only entries 0-9 are considered.
    if i == 10:
        break
    print('section: ' + str(i+1))
    content = section['content']
    tokens_embedding = get_word_embeddings(content)
    if not tokens_embedding:
        # An empty result has nothing to serialise; count it and store an
        # empty mapping so every processed section carries the same key.
        fault_counter += 1
        tokens_embedding = {}
    # Convert each tensor to a list and store it as a JSON-encoded string.
    section['tokens_embs'] = {
        token: json.dumps(vector.tolist())
        for token, vector in tokens_embedding.items()
    }

write_to_json(sections, './data/sections_170k_embds.json')