import json
import os

import numpy as np
import torch
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer
from transformers import AutoModel  # for PyTorch
from transformers import TFAutoModelForTokenClassification  # for TensorFlow
from transformers import pipeline
from datasets import Dataset, load_from_disk

print('start')

#---
# NOTE: custom JSON encoder to work around serialization errors for NumPy floats/ints/arrays
class NumpyFloatValuesEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.float32):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
# usage: json.dumps(d, cls=NumpyFloatValuesEncoder)
#----

#--- load the oil-domain rules into a flat list of {id, rule_id, rule} records
text_arr = []
print('loading data')
content_file = open('./mj/oil_domain.json', "r", encoding='utf-8')
oil_data = json.load(content_file)
for qan in oil_data:
    for sec in oil_data[qan]:
        qs_id = sec["id"]
        if isinstance(sec["graph_models"], list):
            for sh in sec["graph_models"]:
                rule_id = sh["id"]
                rule = sh['rule']
                text_arr.append({"id": qs_id, "rule_id": rule_id, "rule": rule})
        else:  # graph_models is a dict keyed by rule id
            for sh in sec["graph_models"]:
                rule = sec["graph_models"][sh]['rule']
                text_arr.append({"id": qs_id, "rule_id": sh, "rule": rule})
content_file.close()
remained = len(text_arr)
# text_arr[0]['content']
# text_arr[0]['ner']

#--- pick the embedding model and define get_embedding() accordingly
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# model_name_or_path = "HooshvareLab/albert-fa-zwnj-base-v2"
# max_position_embeddings = 512
# model_name_or_path = "sharif-dal/dal-bert"
# model_name_or_path = "jinaai/jina-embeddings-v3"
model_name_or_path = "BAAI/bge-m3"
# model_name_or_path = "../../BERT/finetune/MODELS/roberta-fa-zwnj-base-law-2-pt"

if model_name_or_path == "../../BERT/finetune/MODELS/roberta-fa-zwnj-base-law-2-pt":
    if not os.path.exists(model_name_or_path + '/model.safetensors') or not os.path.exists(model_name_or_path + '/tokenizer.json'):
        print('model files do not exist in the model path directory.')
        exit(1)

    # Mean pooling - take the attention mask into account for correct averaging
    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    # Load model from the local fine-tuned checkpoint
    tokenizer_bert = AutoTokenizer.from_pretrained(model_name_or_path)
    model_bert = AutoModel.from_pretrained(model_name_or_path)

    def get_embedding(sentences):
        # Tokenize sentences
        encoded_input = tokenizer_bert(sentences, padding=True, truncation=True, return_tensors='pt')
        # Compute token embeddings
        with torch.no_grad():
            model_output = model_bert(**encoded_input)
        # Perform pooling. In this case, mean pooling.
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        return sentence_embeddings[0].cpu().numpy()  # 1-D vector, consistent with the other branches

elif model_name_or_path == "jinaai/jina-embeddings-v3":
    from sentence_transformers import SentenceTransformer

    embedder = SentenceTransformer(model_name_or_path, trust_remote_code=True)
    # tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    def get_embedding(text):
        embedding1 = embedder.encode(text)
        return embedding1

elif model_name_or_path == 'BAAI/bge-m3':
    from FlagEmbedding import BGEM3FlagModel

    # tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    score_model = BGEM3FlagModel(model_name_or_path, use_fp16=True)  # , devices=device

    def get_embedding(text):
        output_1 = score_model.encode(text, return_dense=True, return_sparse=False, return_colbert_vecs=False)
        return output_1['dense_vecs']

elif model_name_or_path == "sharif-dal/dal-bert":
    max_position_embeddings = 258
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model = AutoModel.from_pretrained(model_name_or_path)  # PyTorch
    # model = TFAutoModelForTokenClassification.from_pretrained(model_name_or_path)  # TensorFlow
    model.to(device)

    def get_embedding(text: str, max_length: int = max_position_embeddings):
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        # print(inputs.tokens())
        inputs = inputs.to(device)
        with torch.no_grad():
            model_output = model(**inputs, return_dict=True)
        # Perform pooling: take the output of the [CLS] token
        embeddings = model_output.last_hidden_state[0][0]
        # embeddings = embeddings.squeeze(0)
        return embeddings.detach().cpu().numpy()

###### embed every rule and index the corpus with FAISS
corpus_embeddings = []
for item in tqdm(text_arr):
    id = item['id']
    rule_id = item['rule_id']
    rule = item['rule']
    embedding = get_embedding(rule)
    corpus_embeddings.append({'embedding': embedding, 'rule': rule, 'id': id, 'rule_id': rule_id})

data = Dataset.from_list(corpus_embeddings)
udata = data.add_faiss_index('embedding')

###### for each rule, retrieve the k nearest rules from the same corpus
k = 20
related_data = []
for item in tqdm(corpus_embeddings):
    id = item['id']
    rule_id = item['rule_id']
    rule = item['rule']
    embedding = item['embedding']
    scores, retrieved_rules = udata.get_nearest_examples(  # retrieve results
        'embedding', embedding,  # compare our embedded query with the dataset embeddings
        k=k  # get only the top k results
    )
    related_data.append({'rule': rule, 'id': id, 'rule_id': rule_id,
                         'retrieved_ids': retrieved_rules['id'],
                         'retrieved_rule_ids': retrieved_rules['rule_id'],
                         'retrieved_rules': retrieved_rules['rule'],
                         'retrieved_scores': scores})

#####
# for item in tqdm(text_arr):
#     for rr in item['retrieved_rules']:
#         rr['embedding'] = []
#####

#--- dump the retrieval results to JSON
filename = "similar_{}_oil_{}.json".format(k, model_name_or_path.replace("/", "_"))
similarity_file = open(filename, "w", encoding='utf-8')
similarity_file.write(json.dumps(related_data, ensure_ascii=False, cls=NumpyFloatValuesEncoder))  # , indent=4
similarity_file.close()
print('end')
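
#---
# Illustrative sanity check (an addition, not part of the original pipeline): reload the file
# written above and inspect one record. It reuses `filename` and `k` from this script. With
# datasets' default flat FAISS index the scores are L2 distances, so smaller means closer, and
# the nearest neighbour of each rule is usually the rule itself, since the queries come from
# the same corpus that was indexed.
with open(filename, "r", encoding='utf-8') as check_file:
    check_data = json.load(check_file)
print('records written:', len(check_data))
if check_data:
    first = check_data[0]
    print('neighbours per record:', len(first['retrieved_rules']))  # expected to equal k
    print('closest score:', first['retrieved_scores'][0])  # distance to the nearest neighbour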