98 lines
3.2 KiB
Python
98 lines
3.2 KiB
Python
import json
|
|
from tqdm import tqdm
|
|
import numpy as np
|
|
import torch
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
from transformers import AutoTokenizer
|
|
from transformers import AutoModel # for pytorch
|
|
from transformers import TFAutoModelForTokenClassification # for tensorflow
|
|
from transformers import pipeline
|
|
|
|
# Progress marker: printed once the imports above have finished.
print('start')
|
|
#---
|
|
# NOTE: works around json failing to serialize NumPy values (e.g. np.float32).
|
|
class NumpyFloatValuesEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to plain Python equivalents.

    Use it as ``json.dumps(data, cls=NumpyFloatValuesEncoder)``.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: A value the standard encoder could not serialize.

        Returns:
            ``bool`` for NumPy booleans, ``int`` for NumPy integers,
            ``float`` for NumPy floating-point scalars (``np.float32``
            included, since it is an ``np.floating``), and a nested list
            for ``np.ndarray``.

        Raises:
            TypeError: Raised by the base class for any other type.
        """
        # np.bool_ is not an np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
|
|
#json.dumps(d, cls=NumpyFloatValuesEncoder)
|
|
#----
|
|
|
|
#---
|
|
# Flatten the oil-domain JSON into one record per rule:
#   {"id": <section id>, "rule_id": <rule id>, "rule": <rule text>}
text_arr = []
print('loading data')

# The context manager closes the file even if json.load raises.
with open('./mj/oil_domain.json', "r", encoding='utf-8') as content_file:
    oil_data = json.load(content_file)

for qan in oil_data:
    for sec in oil_data[qan]:
        qs_id = sec["id"]
        graph_models = sec["graph_models"]
        if isinstance(graph_models, list):
            # List form: each entry carries its own "id" and "rule".
            for sh in graph_models:
                rule_id = sh["id"]
                rule = sh['rule']
                # Store the entry's id, not the whole entry dict, so the
                # record has the same shape as in the mapping form below.
                text_arr.append({"id": qs_id, "rule_id": rule_id, "rule": rule})
        else:
            # Mapping form: the key is the rule id, the value holds "rule".
            for sh in graph_models:
                rule = graph_models[sh]['rule']
                text_arr.append({"id": qs_id, "rule_id": sh, "rule": rule})

remained = len(text_arr)
|
|
#text_arr[0]['content']
|
|
#text_arr[0]['ner']
|
|
#---
|
|
|
|
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
#model_name_or_path = "HooshvareLab/albert-fa-zwnj-base-v2"
|
|
#max_position_embeddings = 512
|
|
|
|
# Checkpoint used for the embeddings, and the token limit that encode()
# takes as its default max_length.
model_name_or_path = "sharif-dal/dal-bert"
max_position_embeddings = 258

# Load the PyTorch model and move it to the selected device in one step.
# model = TFAutoModelForTokenClassification.from_pretrained(model_name_or_path) # Tensorflow
model = AutoModel.from_pretrained(model_name_or_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
|
|
|
|
def encode(text:str, max_length: int = max_position_embeddings):
    """Embed ``text`` with the module-level tokenizer and model.

    Args:
        text: Text to embed.
        max_length: Token limit passed to the tokenizer; longer input is
            truncated.

    Returns:
        A NumPy array holding the last-layer hidden state of the first token
        of the first sequence (the CLS output, per the original comment).
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    ).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**encoded, return_dict=True)

    # Pooling: take index [0][0] of the last hidden state.
    cls_vector = outputs.last_hidden_state[0][0]
    return cls_vector.detach().cpu().numpy()
|
|
# Embed every rule text in text_arr order; tqdm shows progress.
X = [encode(record['rule']) for record in tqdm(text_arr)]
|
|
|
|
######
|
|
from sklearn.cluster import KMeans

# Cluster the rule embeddings and tag each record with its cluster label.
# KMeans rejects n_clusters larger than the number of samples, so the target
# of 100 is capped at the number of embeddings available.
n_clusters = min(100, len(X))
if n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=0) # algorithm : {"lloyd", "elkan"}, default="lloyd"
    cluster_labels = clusterer.fit_predict(X)
else:
    # Nothing to cluster; leave the (empty) record list untouched.
    cluster_labels = []

for item, cluster in zip(text_arr, cluster_labels):
    # Store a plain int so the record is JSON-serializable as-is.
    item['cluster'] = int(cluster)
|
|
#####
|
|
|
|
|
|
# The output name embeds the model id, with "/" replaced by "_".
filename = "cluster_km_oil_{}.json".format(model_name_or_path.replace("/","_"))

# Serialize before opening the file, so a serialization error cannot leave
# a truncated output file behind.
serialized = json.dumps(text_arr, ensure_ascii=False, cls=NumpyFloatValuesEncoder)#, indent=4

# The context manager closes the file even if the write fails.
with open(filename, "w", encoding='utf-8') as similarity_file:
    similarity_file.write(serialized)

print('end')