from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer, util
from normalizer import cleaning  # cleaning() is called below, so this import must stay active
from funcs import write_to_json, read_from_json, read_file_by_address
from transformers import AutoTokenizer, AutoModel
import torch
import json
import datetime
import pandas as pd

# MODEL_NAME = '/home/gpu/tnlp/jokar/Models/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt'

MODEL_NAME = "/home/gpu/NLP/MLM/CODES/BERT/finetune/MODELS/roberta-fa-zwnj-base-law-2-pt"
# Persian BERT model
# MODEL_NAME = "HooshvareLab/bert-fa-base-uncased"

# Load the model and the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
encoder = SentenceTransformer(MODEL_NAME)

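# Optional inference setup, a minimal sketch rather than part of the original pipeline:
# from_pretrained() already returns the model in eval mode, so the call below is only an
# explicit reminder. Moving the model to GPU is left commented out because this script
# keeps its tokenized inputs on the CPU; if you enable it, the inputs produced by
# tokenizer(...) must be moved to the same device before calling model(**inputs).
model.eval()
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
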
sections = read_from_json('./data/joint_qanon_170k_newface.json')

stop_words = read_file_by_address("./data/stop_words_big.txt").splitlines()

def find_similarity():
    # Two lists of sentences: the query is the title of the Persian "Family Protection
    # and Youthful Population" law, the candidates are shorter variants of it
    sentences2 = [cleaning("قانون حمایت از خانواده و جوانی جمعیت")]

    sentences1 = [
        cleaning("قانون خانواده"),
        cleaning("قانون جمعیت"),
        cleaning("قانون جوانی جمعیت"),
        cleaning("قانون خانواده و جوانی جمعیت"),
        cleaning("جمعیت ایران"),
        cleaning("جوانی جمعیت"),
        cleaning("حمایت از خانواده"),
    ]

    embeddings1 = encoder.encode(sentences1, convert_to_tensor=True)
    embeddings2 = encoder.encode(sentences2, convert_to_tensor=True)
    # print(embeddings2)  # debug

    # Compute cosine similarities between every candidate and the query
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    # print(cosine_scores)

    # Output the pairs with their score
    for i in range(len(sentences1)):
        print("{} \t\t {} \t\t Score: {:.4f}".format(
            sentences1[i], sentences2[0], cosine_scores[i][0]
        ))


def get_embedding(text):
    embedded_text = encoder.encode(text, convert_to_tensor=True)
    return embedded_text


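# Illustrative usage sketch (hypothetical helper, not called by the pipeline below):
# get_embedding() returns a 1-D tensor for a single string, so two phrases can be
# compared directly with util.cos_sim, which yields a 1x1 score tensor here.
def demo_sentence_similarity():
    emb_a = get_embedding(cleaning("قانون خانواده"))
    emb_b = get_embedding(cleaning("حمایت از خانواده"))
    print("cosine similarity: {:.4f}".format(util.cos_sim(emb_a, emb_b).item()))

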
def save_lawtitle_embedding():

    law_dict = read_from_json('./jokar/llm_services/law_title.json')
    for section_index, section in enumerate(law_dict):
        print(f"law row: {section_index + 1}")
        caption = section['caption']
        caption = cleaning(caption)
        caption_embedding = get_embedding(caption)
        section['caption_embedding'] = str(caption_embedding)

    write_to_json(law_dict, './jokar/llm_services/law_title_new222.json')


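# Note on save_lawtitle_embedding: str(caption_embedding) stores torch's printable
# repr, which rounds the values and is awkward to parse back into a tensor. If the
# caption embeddings need to be reloaded later, serializing the raw values is safer,
# e.g. (suggested alternative, not what the original script does):
#     section['caption_embedding'] = json.dumps(caption_embedding.tolist())
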
def get_word_embeddings(base_sentence):
    """
    Convert the words of a sentence into vectors using the BERT model.
    """
    # Preprocess the tokens: drop stop words and purely numeric tokens
    sentence_tokens = [
        tkn for tkn in base_sentence.split()
        if tkn not in stop_words and not tkn.isdigit()
    ]
    sentence = ' '.join(sentence_tokens)

    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)

    # Get the word-piece tokens (special tokens such as [CLS] are filtered out below)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    try:
        # Run the BERT model
        with torch.no_grad():
            outputs = model(**inputs)
    except Exception as exc:
        print(f"fault while encoding sentence: {exc}")
        return {}

    # Output vectors for each token
    token_embeddings = outputs.last_hidden_state[0]  # (seq_length, hidden_size)

    # Map tokens to their vectors, dropping special tokens
    token_to_embedding = {
        token: embedding for token, embedding in zip(tokens, token_embeddings)
        if token not in tokenizer.all_special_tokens
    }

    return token_to_embedding


def compute_distance(embedding1, embedding2):
    """
    Compute the cosine distance between two vectors.
    """
    return cosine(embedding1.numpy(), embedding2.numpy())


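# Illustrative sketch (hypothetical helper, not called by the pipeline below): compare
# the contextual embedding of the same word piece as it appears in two different
# phrases, using get_word_embeddings() and compute_distance() from above. The example
# phrases are taken from find_similarity(); note that the dictionary keys are tokenizer
# word pieces, so a word split into sub-words will not appear as a single shared key.
def demo_token_distance():
    embs_a = get_word_embeddings("قانون جوانی جمعیت")
    embs_b = get_word_embeddings("جوانی جمعیت")
    for token in set(embs_a) & set(embs_b):
        print(token, compute_distance(embs_a[token], embs_b[token]))

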
print(datetime.datetime.now())

# find_similarity()

fault_counter = 0
for i, section in enumerate(sections):
    if section['is-long']:
        continue
    if i == 10:  # debug limit: stop after the first 10 entries
        break
    print('section: ' + str(i + 1))
    content = section['content']
    tokens_embedding = get_word_embeddings(content)
    if not tokens_embedding:
        fault_counter += 1

    new_token_embds = {}
    for key, embd_value in tokens_embedding.items():
        # Convert the tensor to a list
        tensor_list = embd_value.tolist()
        # Convert the list to JSON
        json_data = json.dumps(tensor_list)
        new_token_embds[key] = json_data

    section['tokens_embs'] = new_token_embds

write_to_json(sections, './data/sections_170k_embds.json')
print("operation_finished")
print(f"fault sections: {fault_counter}")
print(datetime.datetime.now())
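
# Round-trip sketch (hypothetical helper, not used above): the pipeline stores each
# token embedding as a JSON string, so reading one back takes json.loads followed by
# torch.tensor. The 'tokens_embs' field name matches what this script writes.
def load_token_embeddings(section):
    return {
        token: torch.tensor(json.loads(vec))
        for token, vec in section.get('tokens_embs', {}).items()
    }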