from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer, util
# from normalizer import cleaning
from funcs import write_to_json, read_from_json, read_file_by_address
from transformers import AutoTokenizer, AutoModel
import torch
import json
import datetime
import pandas as pd

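# NOTE: cleaning() is still called in find_similarity() and save_lawtitle_embedding()
# even though the `normalizer` import above is commented out. The guarded import below
# keeps the script importable without that module; the fallback is only a sketch that
# collapses whitespace, not the project's real Persian text normalizer.
try:
    from normalizer import cleaning
except ImportError:
    def cleaning(text):
        # Fallback assumption: trim and collapse whitespace only.
        return " ".join(str(text).split())
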
# MODEL_NAME = '/home/gpu/tnlp/jokar/Models/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt'

MODEL_NAME = "/home/gpu/NLP/MLM/CODES/BERT/finetune/MODELS/roberta-fa-zwnj-base-law-2-pt"
# Persian BERT model
#MODEL_NAME = "HooshvareLab/bert-fa-base-uncased"

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
encoder = SentenceTransformer(MODEL_NAME)


sections = read_from_json('./data/joint_qanon_170k_newface.json')

stop_words = read_file_by_address("./data/stop_words_big.txt").splitlines()
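
# Assumed data layout, inferred from how these objects are used below: `sections` is a
# list of dicts, each holding at least a 'content' string and an 'is-long' flag, and
# stop_words_big.txt contains one stop word per line.
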

def find_similarity():
    # Two lists of sentences (Persian law-title phrases)
    sentences2 = [cleaning("قانون حمایت از خانواده و جوانی جمعیت")]

    sentences1 = [
        cleaning("قانون خانواده"),
        cleaning("قانون جمعیت"),
        cleaning("قانون جوانی جمعیت"),
        cleaning("قانون خانواده و جوانی جمعیت"),
        cleaning("جمعیت ایران"),
        cleaning("جوانی جمعیت"),
        cleaning("حمایت از خانواده"),
    ]

    embeddings1 = encoder.encode(sentences1, convert_to_tensor=True)
    embeddings2 = encoder.encode(sentences2, convert_to_tensor=True)
    print(embeddings2)

    # Compute cosine similarities between each candidate and the reference title
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    # print(cosine_scores)

    # Output the pairs with their score
    for i in range(len(sentences1)):
        print("{} \t\t {} \t\t Score: {:.4f}".format(
            sentences1[i], sentences2[0], cosine_scores[i][0]
        ))


def get_embedding(text):
    embedded_text = encoder.encode(text, convert_to_tensor=True)
    return embedded_text


def save_lawtitle_embedding():
    law_dict = read_from_json('./jokar/llm_services/law_title.json')
    for section_index, section in enumerate(law_dict):
        print(f"law row: {section_index + 1}")
        caption = cleaning(section['caption'])
        caption_embedding = get_embedding(caption)
        # Note: this stores the tensor's string representation, not a JSON-serializable list
        section['caption_embedding'] = str(caption_embedding)

    write_to_json(law_dict, './jokar/llm_services/law_title_new222.json')


def get_word_embeddings(base_sentence):
    """
    Convert the words of a sentence into vectors using the BERT model.
    """
    sentence_tokens = base_sentence.split()
    # Pre-process the tokens: drop stop words and purely numeric tokens
    # (filtered with a comprehension; removing items while iterating the same list
    # would skip elements)
    sentence_tokens = [
        tkn for tkn in sentence_tokens
        if tkn not in stop_words and not tkn.isdigit()
    ]

    sentence = " ".join(sentence_tokens)

    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)

    # Recover the wordpiece tokens (special tokens such as [CLS] are filtered out below)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    try:
        # Run the BERT model
        with torch.no_grad():
            outputs = model(**inputs)
    except Exception as exc:
        print(f"fault: {exc}")
        # Return an empty dict so callers can still iterate over .items()
        return {}

    # Output vector for every token
    token_embeddings = outputs.last_hidden_state[0]  # (seq_length, hidden_size)

    # Map each token to its vector, dropping special tokens
    token_to_embedding = {
        token: embedding for token, embedding in zip(tokens, token_embeddings)
        if token not in tokenizer.all_special_tokens
    }

    return token_to_embedding

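# Quick shape check for get_word_embeddings() (hypothetical input; the hidden size is
# 768 for base-sized RoBERTa/BERT checkpoints, but ultimately depends on MODEL_NAME):
# token_vecs = get_word_embeddings("قانون حمایت از خانواده")
# for tok, vec in token_vecs.items():
#     print(tok, tuple(vec.shape))   # e.g. ('قانون', (768,)) for a wordpiece token
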

def compute_distance(embedding1, embedding2):
    """
    Compute the cosine distance between two vectors.
    """
    return cosine(embedding1.numpy(), embedding2.numpy())

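
# Usage sketch for compute_distance(): `distance_demo` is a hypothetical helper and is
# never called at import time (the script similarly leaves find_similarity() commented
# out below). The .cpu() calls assume the encoder may run on a GPU, since scipy's
# cosine() needs CPU tensors / NumPy arrays.
def distance_demo():
    emb_a = get_embedding(cleaning("قانون خانواده")).cpu()
    emb_b = get_embedding(cleaning("قانون جمعیت")).cpu()
    # 0.0 = identical direction, 1.0 = orthogonal, 2.0 = opposite
    print(compute_distance(emb_a, emb_b))
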

print(datetime.datetime.now())

# find_similarity()

sections_embeddings = []
fault_counter = 0
for i, section in enumerate(sections):
    if section['is-long']:
        continue
    if i >= 10:
        # Debug cap: only process the first few sections
        break
    print('section: ' + str(i + 1))
    content = section['content']
    tokens_embedding = get_word_embeddings(content)
    new_token_embds = {}
    for key, embd_value in tokens_embedding.items():
        # Convert the tensor to a list
        tensor_list = embd_value.tolist()
        # Serialize to JSON
        json_data = json.dumps(tensor_list)
        new_token_embds[key] = json_data

    section['tokens_embs'] = new_token_embds

write_to_json(sections, './data/sections_170k_embds.json')
print("operation_finished")
print(datetime.datetime.now())