from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer, util
from normalizer import cleaning  # required: cleaning() is used throughout
from funcs import write_to_json, read_from_json, read_file_by_address
from transformers import AutoTokenizer, AutoModel
import torch
import json
import datetime

# MODEL_NAME = '/home/gpu/tnlp/jokar/Models/HooshvareLab-bert-fa-base-uncased-finetuned-2-pt'
MODEL_NAME = "/home/gpu/NLP/MLM/CODES/BERT/finetune/MODELS/roberta-fa-zwnj-base-law-2-pt"  # Persian BERT model
# MODEL_NAME = "HooshvareLab/bert-fa-base-uncased"

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
encoder = SentenceTransformer(MODEL_NAME)

sections = read_from_json('./data/joint_qanon_170k_newface.json')
stop_words = read_file_by_address("./data/stop_words_big.txt").splitlines()


def find_similarity():
    # Two lists of sentences
    sentences2 = [cleaning("قانون حمایت از خانواده و جوانی جمعیت")]
    sentences1 = [
        cleaning("قانون خانواده"),
        cleaning("قانون جمعیت"),
        cleaning("قانون جوانی جمعیت"),
        cleaning("قانون خانواده و جوانی جمعیت"),
        cleaning("جمعیت ایران"),
        cleaning("جوانی جمعیت"),
        cleaning("حمایت از خانواده"),
    ]

    embeddings1 = encoder.encode(sentences1, convert_to_tensor=True)
    embeddings2 = encoder.encode(sentences2, convert_to_tensor=True)

    # Compute cosine similarities between every pair
    cosine_scores = util.cos_sim(embeddings1, embeddings2)

    # Output the pairs with their score
    for i in range(len(sentences1)):
        print("{} \t\t {} \t\t Score: {:.4f}".format(
            sentences1[i], sentences2[0], cosine_scores[i][0]
        ))


def get_embedding(text):
    return encoder.encode(text, convert_to_tensor=True)


def save_lawtitle_embedding():
    law_dict = read_from_json('./jokar/llm_services/law_title.json')
    for section_index, section in enumerate(law_dict):
        print(f"law row: {section_index + 1}")
        caption = cleaning(section['caption'])
        caption_embedding = get_embedding(caption)
        section['caption_embedding'] = str(caption_embedding)
    write_to_json(law_dict, './jokar/llm_services/law_title_new222.json')


def get_word_embeddings(base_sentence):
    """Convert the words of a sentence to vectors using the BERT model."""
    # Preprocess the tokens: drop stop words and purely numeric tokens.
    # (A comprehension avoids the original bug of calling remove() on the
    # list while iterating over it, which silently skips tokens.)
    sentence_tokens = [
        tkn for tkn in base_sentence.split()
        if tkn not in stop_words and not tkn.isdigit()
    ]
    sentence = ' '.join(sentence_tokens)

    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)

    # Recover the token strings (special tokens like [CLS] are filtered out below)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    try:
        # Run the BERT model without tracking gradients
        with torch.no_grad():
            outputs = model(**inputs)
    except Exception as exc:
        print(f"fault: {exc}")
        return {}  # return an empty dict so callers can still iterate .items()

    # Output vectors for each token: shape (seq_length, hidden_size)
    token_embeddings = outputs.last_hidden_state[0]

    # Map each token to its embedding, dropping special tokens.
    # Note: if a token occurs more than once, only its last embedding is kept.
    token_to_embedding = {
        token: embedding
        for token, embedding in zip(tokens, token_embeddings)
        if token not in tokenizer.all_special_tokens
    }
    return token_to_embedding


def compute_distance(embedding1, embedding2):
    """Compute the cosine distance between two vectors."""
    return cosine(embedding1.numpy(), embedding2.numpy())
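
# Usage sketch (not part of the pipeline below): compare the embedding of the
# same token as it appears in two different sentences, using the helpers above.
# The two sample sentences are illustrative assumptions, not from the dataset.
def demo_word_distance():
    embs_a = get_word_embeddings(cleaning("حمایت از خانواده"))
    embs_b = get_word_embeddings(cleaning("قانون حمایت از خانواده و جوانی جمعیت"))
    shared_tokens = set(embs_a) & set(embs_b)
    for token in shared_tokens:
        # Cosine distance: 0.0 means identical direction; larger is less similar.
        print(token, compute_distance(embs_a[token], embs_b[token]))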

print(datetime.datetime.now())
# find_similarity()

fault_counter = 0
for i, section in enumerate(sections):
    if section['is-long']:
        continue
    if i == 10:  # process only the first 10 sections for now
        break
    print('section: ' + str(i + 1))

    content = section['content']
    tokens_embedding = get_word_embeddings(content)
    if not tokens_embedding:
        fault_counter += 1  # the model failed on this section

    new_token_embds = {}
    for key, embd_value in tokens_embedding.items():
        # Convert the tensor to a list, then serialize it to JSON
        new_token_embds[key] = json.dumps(embd_value.tolist())
    section['tokens_embs'] = new_token_embds

write_to_json(sections, './data/sections_170k_embds.json')
print(f"sections with embedding faults: {fault_counter}")
print("operation finished")
print(datetime.datetime.now())
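
# A minimal sketch of reading the saved embeddings back, assuming the file
# layout written by the loop above ('tokens_embs' maps token -> JSON list).
# torch.tensor(json.loads(...)) reverses the tolist()/json.dumps() round trip.
def load_section_embeddings(path='./data/sections_170k_embds.json'):
    saved_sections = read_from_json(path)
    for saved in saved_sections:
        token_embs = {
            token: torch.tensor(json.loads(vec))
            for token, vec in saved.get('tokens_embs', {}).items()
        }
        yield saved['content'], token_embs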