import json

import numpy as np
import torch
from tqdm import tqdm
from sklearn.cluster import KMeans
from transformers import AutoTokenizer
from transformers import AutoModel  # for PyTorch
# from transformers import TFAutoModelForTokenClassification  # for TensorFlow

print('start')

# ---
# NOTE: json cannot serialize numpy scalar/array types; this encoder
# converts them to plain Python values before dumping.
class NumpyFloatValuesEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
# usage: json.dumps(d, cls=NumpyFloatValuesEncoder)
# ---

# --- collect the rules to be clustered
text_arr = []
print('loading data')
with open('./mj/oil_domain.json', "r", encoding='utf-8') as content_file:
    oil_data = json.load(content_file)

for qan in oil_data:
    for sec in oil_data[qan]:
        qs_id = sec["id"]
        if isinstance(sec["graph_models"], list):
            for sh in sec["graph_models"]:
                rule_id = sh["id"]
                rule = sh['rule']
                text_arr.append({"id": qs_id, "rule_id": rule_id, "rule": rule})
        else:  # graph_models is a dict keyed by rule id
            for sh in sec["graph_models"]:
                rule = sec["graph_models"][sh]['rule']
                text_arr.append({"id": qs_id, "rule_id": sh, "rule": rule})

remained = len(text_arr)  # number of rules collected

# --- set up the embedding model
device = "cuda" if torch.cuda.is_available() else "cpu"

# model_name_or_path = "HooshvareLab/albert-fa-zwnj-base-v2"
# max_position_embeddings = 512
model_name_or_path = "sharif-dal/dal-bert"
max_position_embeddings = 258

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModel.from_pretrained(model_name_or_path)  # PyTorch
# model = TFAutoModelForTokenClassification.from_pretrained(model_name_or_path)  # TensorFlow
model.to(device)
model.eval()


def encode(text: str, max_length: int = max_position_embeddings) -> np.ndarray:
    """Return the [CLS] embedding of `text` as a numpy vector."""
    inputs = tokenizer(text, return_tensors="pt", padding=True,
                       truncation=True, max_length=max_length)
    inputs = inputs.to(device)
    with torch.no_grad():
        model_output = model(**inputs, return_dict=True)
    # pooling: take the last hidden state of the [CLS] token
    embeddings = model_output.last_hidden_state[0][0]
    return embeddings.detach().cpu().numpy()


X = []
for item in tqdm(text_arr):
    X.append(encode(item['rule']))

# --- cluster the embeddings
clusterer = KMeans(n_clusters=100, random_state=0)  # algorithm: {"lloyd", "elkan"}, default="lloyd"
cluster_labels = clusterer.fit_predict(np.vstack(X))

for indx, item in enumerate(text_arr):
    item['cluster'] = cluster_labels[indx]

# --- dump the clustered rules
filename = "cluster_km_oil_{}.json".format(model_name_or_path.replace("/", "_"))
with open(filename, "w", encoding='utf-8') as similarity_file:
    similarity_file.write(json.dumps(text_arr, ensure_ascii=False,
                                     cls=NumpyFloatValuesEncoder))  # , indent=4

print('end')
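
# --- a minimal sketch of consuming the output: reload the dumped JSON and
# group rules by their assigned cluster as a quick sanity check. The field
# names ("cluster", "rule") match what this script writes above; no other
# structure is assumed.
from collections import defaultdict

with open(filename, "r", encoding='utf-8') as f:
    clustered = json.load(f)

by_cluster = defaultdict(list)
for row in clustered:
    by_cluster[row["cluster"]].append(row["rule"])

print(len(by_cluster), "clusters;",
      max(len(v) for v in by_cluster.values()), "rules in the largest cluster")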