add to git
commit 4b517ce00e
.gitignore (vendored, new file)
@@ -0,0 +1,7 @@
*.jsonl
*.json
__pycache__/*
*.pkl
*.log
caches/*
mj/*
normalizer.py (new file)
@@ -0,0 +1,89 @@
import hazm
from cleantext import clean
import re


def cleanhtml(raw_html):
    # strip HTML tags with a simple regex
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext


normalizer = hazm.Normalizer()

# pattern matching emojis and other unwanted Unicode symbols / control characters
weird_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\U0001f926-\U0001f937"
                           u'\U00010000-\U0010ffff'
                           u"\u200d"
                           u"\u2640-\u2642"
                           u"\u2600-\u2B55"
                           u"\u23cf"
                           u"\u23e9"
                           u"\u231a"
                           u"\u3030"
                           u"\ufe0f"
                           u"\u2069"
                           u"\u2066"
                           u"\u200c"
                           u"\u2068"
                           u"\u2067"
                           "]+", flags=re.UNICODE)


def cleaning(text):
    text = text.strip()

    # regular cleaning
    # text = clean(text,
    #              fix_unicode=True,
    #              to_ascii=False,
    #              lower=True,
    #              no_line_breaks=True,
    #              no_urls=True,
    #              no_emails=True,
    #              no_phone_numbers=True,
    #              no_numbers=False,
    #              no_digits=False,
    #              no_currency_symbols=True,
    #              no_punct=False,
    #              replace_with_url="",
    #              replace_with_email="",
    #              replace_with_phone_number="",
    #              replace_with_number="",
    #              replace_with_digit="0",
    #              replace_with_currency_symbol="",
    #              )
    text = clean(text,
                 extra_spaces=True,
                 lowercase=True
                 )

    # cleaning HTML
    text = cleanhtml(text)

    # normalizing
    text = normalizer.normalize(text)

    # removing weird patterns (emojis etc.)
    text = weird_pattern.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub(r"\s+", " ", text)

    return text


# with open('./ghavanins.txt', encoding="utf-8") as fp:
#     current_content = fp.read()

# current_content = cleaning(current_content)

# with open('./ghavanins2.txt', 'wb') as f:
#     f.write(current_content.encode('utf-8', 'ignore'))
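A minimal usage sketch for cleaning(), assuming hazm and the cleantext package are installed; the sample string is hypothetical:

# hypothetical input: Persian text with HTML tags, a hashtag, an emoji and extra spaces
sample = "<p>این   یک متنِ #آزمایشی است 😀</p>"
print(cleaning(sample))
# expected: tags, the hash sign and the emoji are removed, whitespace is collapsed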
oil_domain_clustering.py (new file)
@@ -0,0 +1,98 @@
import json
from tqdm import tqdm
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer
from transformers import AutoModel  # for pytorch
from transformers import TFAutoModelForTokenClassification  # for tensorflow
from transformers import pipeline

print('start')
#---
# NOTE: for bug in dumping float in json
class NumpyFloatValuesEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.float32):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
#json.dumps(d, cls=NumpyFloatValuesEncoder)
#----

#---
text_arr = []
print('loading data')
content_file = open('./mj/oil_domain.json', "r", encoding='utf-8')
oil_data = json.load(content_file)
for qan in oil_data:
    for sec in oil_data[qan]:
        qs_id = sec["id"]
        if isinstance(sec["graph_models"], list):
            for sh in sec["graph_models"]:
                rule_id = sh["id"]
                rule = sh['rule']
                text_arr.append({"id": qs_id, "rule_id": rule_id, "rule": rule})
        else:
            for sh in sec["graph_models"]:
                rule = sec["graph_models"][sh]['rule']
                text_arr.append({"id": qs_id, "rule_id": sh, "rule": rule})
content_file.close()
remained = len(text_arr)
#text_arr[0]['content']
#text_arr[0]['ner']
#---

device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

#model_name_or_path = "HooshvareLab/albert-fa-zwnj-base-v2"
#max_position_embeddings = 512

model_name_or_path = "sharif-dal/dal-bert"
max_position_embeddings = 258

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModel.from_pretrained(model_name_or_path)  # Pytorch
# model = TFAutoModelForTokenClassification.from_pretrained(model_name_or_path)  # Tensorflow
model.to(device)


def encode(text: str, max_length: int = max_position_embeddings):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    #print(inputs.tokens())
    inputs = inputs.to(device)
    with torch.no_grad():
        model_output = model(**inputs, return_dict=True)
    # Perform pooling
    embeddings = model_output.last_hidden_state[0][0]  # output of CLS
    #embeddings = embeddings.squeeze(0)
    return embeddings.detach().cpu().numpy()


X = []
for item in tqdm(text_arr):
    content = item['rule']
    embedding = encode(content)
    #item['embedding'] = embedding.reshape(1, -1)
    X.append(embedding)

######
from sklearn.cluster import KMeans

clusterer = KMeans(n_clusters=100, random_state=0)  # algorithm : {"lloyd", "elkan"}, default="lloyd"
cluster_labels = clusterer.fit_predict(X)
for indx, item in enumerate(text_arr):
    cluster = cluster_labels[indx]
    item['cluster'] = cluster
#####


filename = "cluster_km_oil_{}.json".format(model_name_or_path.replace("/", "_"))
similarity_file = open(filename, "w", encoding='utf-8')
similarity_file.write(json.dumps(text_arr, ensure_ascii=False, cls=NumpyFloatValuesEncoder))  #, indent=4
similarity_file.close()
print('end')
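A brief inspection sketch, assuming the script above has been run with its default model so the output file name resolves to cluster_km_oil_sharif-dal_dal-bert.json; it groups rule ids by assigned cluster and prints cluster sizes:

import json
from collections import defaultdict

with open("cluster_km_oil_sharif-dal_dal-bert.json", encoding="utf-8") as f:
    clustered = json.load(f)

groups = defaultdict(list)
for row in clustered:
    groups[row["cluster"]].append(row["rule_id"])

# largest clusters first
for cluster_id, members in sorted(groups.items(), key=lambda kv: -len(kv[1])):
    print(cluster_id, len(members))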
oil_domain_nearest_02.py (new file)
@@ -0,0 +1,168 @@
import json
from tqdm import tqdm
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer
from transformers import AutoModel  # for pytorch
from transformers import TFAutoModelForTokenClassification  # for tensorflow
from transformers import pipeline
import os
from datasets import Dataset, load_from_disk


print('start')
#---
# NOTE: for bug in dumping float in json
class NumpyFloatValuesEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.float32):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
#json.dumps(d, cls=NumpyFloatValuesEncoder)
#----

#---
text_arr = []
print('loading data')
content_file = open('./mj/oil_domain.json', "r", encoding='utf-8')
oil_data = json.load(content_file)
for qan in oil_data:
    for sec in oil_data[qan]:
        qs_id = sec["id"]
        if isinstance(sec["graph_models"], list):
            for sh in sec["graph_models"]:
                rule_id = sh["id"]
                rule = sh['rule']
                text_arr.append({"id": qs_id, "rule_id": rule_id, "rule": rule})
        else:
            for sh in sec["graph_models"]:
                rule = sec["graph_models"][sh]['rule']
                text_arr.append({"id": qs_id, "rule_id": sh, "rule": rule})

content_file.close()
remained = len(text_arr)
#text_arr[0]['content']
#text_arr[0]['ner']
#---

device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

#model_name_or_path = "HooshvareLab/albert-fa-zwnj-base-v2"
#max_position_embeddings = 512

#model_name_or_path = "sharif-dal/dal-bert"
#model_name_or_path = "jinaai/jina-embeddings-v3"
model_name_or_path = "BAAI/bge-m3"
#model_name_or_path = "../../BERT/finetune/MODELS/roberta-fa-zwnj-base-law-2-pt"

if model_name_or_path == "../../BERT/finetune/MODELS/roberta-fa-zwnj-base-law-2-pt":

    if not os.path.exists(model_name_or_path + '/model.safetensors') or not os.path.exists(model_name_or_path + '/tokenizer.json'):
        print('model files do not exist in the model path directory.')
        exit(0)

    # Mean Pooling - Take attention mask into account for correct averaging
    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    # Load model from HuggingFace Hub
    tokenizer_bert = AutoTokenizer.from_pretrained(model_name_or_path)
    model_bert = AutoModel.from_pretrained(model_name_or_path)

    def get_embedding(sentences):
        # Tokenize sentences
        encoded_input = tokenizer_bert(sentences, padding=True, truncation=True, return_tensors='pt')
        # Compute token embeddings
        with torch.no_grad():
            model_output = model_bert(**encoded_input)
        # Perform pooling. In this case, mean pooling.
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

        return sentence_embeddings

elif model_name_or_path == "jinaai/jina-embeddings-v3":
    from sentence_transformers import SentenceTransformer
    embedder = SentenceTransformer(model_name_or_path, trust_remote_code=True)
    #tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    def get_embedding(text):
        embedding1 = embedder.encode(text)
        return embedding1

elif model_name_or_path == 'BAAI/bge-m3':
    from FlagEmbedding import BGEM3FlagModel
    #tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    score_model = BGEM3FlagModel(model_name_or_path, use_fp16=True)  #, devices=device

    def get_embedding(text):
        output_1 = score_model.encode(text, return_dense=True, return_sparse=False, return_colbert_vecs=False)
        return output_1['dense_vecs']

elif model_name_or_path == "sharif-dal/dal-bert":
    max_position_embeddings = 258
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model = AutoModel.from_pretrained(model_name_or_path)  # Pytorch
    # model = TFAutoModelForTokenClassification.from_pretrained(model_name_or_path)  # Tensorflow
    model.to(device)

    def get_embedding(text: str, max_length: int = max_position_embeddings):
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        #print(inputs.tokens())
        inputs = inputs.to(device)
        with torch.no_grad():
            model_output = model(**inputs, return_dict=True)
        # Perform pooling
        embeddings = model_output.last_hidden_state[0][0]  # output of CLS
        #embeddings = embeddings.squeeze(0)
        return embeddings.detach().cpu().numpy()


######
corpus_embeddings = []
for item in tqdm(text_arr):
    id = item['id']
    rule_id = item['rule_id']
    rule = item['rule']
    embedding = get_embedding(rule)
    corpus_embeddings.append({'embedding': embedding, 'rule': rule, 'id': id, 'rule_id': rule_id})

data = Dataset.from_list(corpus_embeddings)
udata = data.add_faiss_index('embedding')


k = 20
related_data = []
for item in tqdm(corpus_embeddings):
    id = item['id']
    rule_id = item['rule_id']
    rule = item['rule']
    embedding = item['embedding']
    scores, retrieved_rules = udata.get_nearest_examples(  # retrieve results
        'embedding', embedding,  # compare our embedded query with the dataset embeddings
        k=k  # get only top k results
    )

    related_data.append({'rule': rule, 'id': id, 'rule_id': rule_id,
                         'retrieved_ids': retrieved_rules['id'], 'retrieved_rule_ids': retrieved_rules['rule_id'],
                         'retrieved_rules': retrieved_rules['rule'], 'retrieved_scores': scores})


#####
# for item in tqdm(text_arr):
#     for rr in item['retrieved_rules']:
#         rr['embedding'] = []
#####

filename = "similar_{}_oil_{}.json".format(k, model_name_or_path.replace("/", "_"))
similarity_file = open(filename, "w", encoding='utf-8')
similarity_file.write(json.dumps(related_data, ensure_ascii=False, cls=NumpyFloatValuesEncoder))  #, indent=4
similarity_file.close()
print('end')
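add_faiss_index builds a flat (L2) index when no custom index or string factory is given, so the retrieved_scores written above should be distances where smaller means more similar, and each rule's nearest neighbour is normally itself (the reranking script below filters it out). A short inspection sketch, assuming the output file has been moved under ./mj/ where the next script expects it:

import json

# assumes similar_20_oil_BAAI_bge-m3.json was moved into ./mj/
with open("./mj/similar_20_oil_BAAI_bge-m3.json", encoding="utf-8") as f:
    similar = json.load(f)

entry = similar[0]
for score, rid, rule in zip(entry["retrieved_scores"], entry["retrieved_rule_ids"], entry["retrieved_rules"]):
    print(round(score, 3), rid, rule[:60])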
oil_domain_reranking_03.py (new file)
@@ -0,0 +1,83 @@
import json
from tqdm import tqdm
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer
from transformers import AutoModel  # for pytorch
from transformers import TFAutoModelForTokenClassification  # for tensorflow
from transformers import pipeline
import os
from datasets import Dataset, load_from_disk


print('start')
#---
# NOTE: for bug in dumping float in json
class NumpyFloatValuesEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.float32):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
#json.dumps(d, cls=NumpyFloatValuesEncoder)
#----


from FlagEmbedding import FlagReranker

#os.environ['HUGGING_FACE_HUB_TOKEN'] = "hf_VeCSxLxSCVlt..."
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)  # Setting use_fp16 to True speeds up computation with a slight performance degradation


def search_rerank(rule, sim_rules, rule_ids, rerank_k: int = 4):
    # pair the query rule with each candidate rule
    z_results = [[rule, i] for i in sim_rules]
    # the scores are mapped into [0, 1] by setting normalize=True, which applies a sigmoid to the raw score
    scores = reranker.compute_score(z_results, normalize=True)
    s_results = sorted(zip(scores, z_results, rule_ids), key=lambda x: x[0], reverse=True)
    s_results2 = s_results[:rerank_k]
    results = [[i[0], i[1][1], i[2]] for i in s_results2]
    return results


#---
k = 10
related_data = []
print('loading data')
model_name_or_path = "similar_20_oil_BAAI_bge-m3"
#model_name_or_path = "similar_20_oil_jinaai_jina-embeddings-v3"
content_file = open(f'./mj/{model_name_or_path}.json', "r", encoding='utf-8')
oil_data = json.load(content_file)
for qan in tqdm(oil_data):
    id = qan["id"]
    rule_id = qan["rule_id"]
    rule = qan["rule"]
    retrieved_ids = []
    retrieved_rule_ids = []
    retrieved_rules = []
    for relateds in zip(qan["retrieved_ids"], qan["retrieved_rule_ids"], qan["retrieved_rules"]):
        if relateds[0] != id:  # skip the rule itself among its neighbours
            retrieved_ids.append(relateds[0])
            retrieved_rule_ids.append(relateds[1])
            retrieved_rules.append(relateds[2])
    reranked = search_rerank(rule, retrieved_rules, retrieved_rule_ids, 10)
    related_data.append({"rule_id": rule_id, "rule": rule, "retrieved_rules": reranked})

content_file.close()
remained = len(related_data)
print(remained)
##########
filename = "./mj/reranked_{}_oil_{}.json".format(k, model_name_or_path)
similarity_file = open(filename, "w", encoding='utf-8')
similarity_file.write(json.dumps(related_data, ensure_ascii=False, cls=NumpyFloatValuesEncoder, indent=4))
similarity_file.close()
print('end')
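A short sketch for reading the reranked output; with the settings above the file name resolves to ./mj/reranked_10_oil_similar_20_oil_BAAI_bge-m3.json, and each retrieved_rules entry is a [score, rule text, rule_id] triple sorted by reranker score:

import json

with open("./mj/reranked_10_oil_similar_20_oil_BAAI_bge-m3.json", encoding="utf-8") as f:
    reranked = json.load(f)

entry = reranked[0]
print("query rule:", entry["rule"][:60])
for score, sim_rule, sim_rule_id in entry["retrieved_rules"]:
    print(round(score, 3), sim_rule_id, sim_rule[:60])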