From 4b517ce00e93d35fefe2ad049aca9e12a858131f Mon Sep 17 00:00:00 2001
From: hdeldar
Date: Thu, 3 Jul 2025 09:59:12 +0330
Subject: [PATCH] add to git

---
 .gitignore                 |   7 ++
 README.md                  |   0
 normalizer.py              |  89 ++++++++++++++++++++
 oil_domain_clustering.py   |  98 ++++++++++++++++++++++
 oil_domain_nearest_02.py   | 168 +++++++++++++++++++++++++++++++++++++
 oil_domain_reranking_03.py |  83 ++++++++++++++++++
 6 files changed, 445 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 normalizer.py
 create mode 100644 oil_domain_clustering.py
 create mode 100644 oil_domain_nearest_02.py
 create mode 100644 oil_domain_reranking_03.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..32d29a5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.jsonl
+*.json
+__pycache__/*
+*.pkl
+*.log
+caches/*
+mj/*
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/normalizer.py b/normalizer.py
new file mode 100644
index 0000000..825e02f
--- /dev/null
+++ b/normalizer.py
@@ -0,0 +1,89 @@
+import hazm
+from cleantext import clean
+import re
+
+def cleanhtml(raw_html):
+    cleanr = re.compile('<.*?>')
+    cleantext = re.sub(cleanr, '', raw_html)
+    return cleantext
+
+normalizer = hazm.Normalizer()
+weird_pattern = re.compile("["
+    u"\U0001F600-\U0001F64F"  # emoticons
+    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+    u"\U0001F680-\U0001F6FF"  # transport & map symbols
+    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+    u"\U00002702-\U000027B0"
+    u"\U000024C2-\U0001F251"
+    u"\U0001f926-\U0001f937"
+    u'\U00010000-\U0010ffff'
+    u"\u200d"
+    u"\u2640-\u2642"
+    u"\u2600-\u2B55"
+    u"\u23cf"
+    u"\u23e9"
+    u"\u231a"
+    u"\u3030"
+    u"\ufe0f"
+    u"\u2069"
+    u"\u2066"
+    u"\u200c"
+    u"\u2068"
+    u"\u2067"
+    "]+", flags=re.UNICODE)
+
+def cleaning(text):
+    text = text.strip()
+
+    # regular cleaning
+    # text = clean(text,
+    #     fix_unicode=True,
+    #     to_ascii=False,
+    #     lower=True,
+    #     no_line_breaks=True,
+    #     no_urls=True,
+    #     no_emails=True,
+    #     no_phone_numbers=True,
+    #     no_numbers=False,
+    #     no_digits=False,
+    #     no_currency_symbols=True,
+    #     no_punct=False,
+    #     replace_with_url="",
+    #     replace_with_email="",
+    #     replace_with_phone_number="",
+    #     replace_with_number="",
+    #     replace_with_digit="0",
+    #     replace_with_currency_symbol="",
+    # )
+    text = clean(text,
+        extra_spaces=True,
+        lowercase=True
+    )
+
+    # cleaning HTML tags
+    text = cleanhtml(text)
+
+    # normalizing (hazm)
+    text = normalizer.normalize(text)
+
+    # removing weird patterns (emoji, control and directional characters)
+    text = weird_pattern.sub(r'', text)
+
+    # removing hashtags and extra spaces
+    text = re.sub("#", "", text)
+    text = re.sub(r"\s+", " ", text)
+
+    return text
+
+
+
+
+
+# with open('./ghavanins.txt', encoding="utf-8") as fp:
+#     current_content = fp.read()
+
+# current_content = cleaning(current_content)
+
+
+# with open('./ghavanins2.txt', 'wb') as f:
+#     f.write(current_content.encode('utf-8', 'ignore'))
\ No newline at end of file
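# Usage sketch for the cleaning() helper above; a minimal, hedged example, not part of
# the patch hunks. It assumes hazm and the cleantext package used in normalizer.py are
# installed, and the sample sentence is purely illustrative.
from normalizer import cleaning

raw = "<p>این   یک   متن  #نمونه است 😀</p>"
print(cleaning(raw))  # HTML tags, emoji, hashtags and repeated spaces are stripped; text is normalized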
diff --git a/oil_domain_clustering.py b/oil_domain_clustering.py
new file mode 100644
index 0000000..02b8c00
--- /dev/null
+++ b/oil_domain_clustering.py
@@ -0,0 +1,98 @@
+import json
+from tqdm import tqdm
+import numpy as np
+import torch
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import AutoTokenizer
+from transformers import AutoModel  # for PyTorch
+from transformers import TFAutoModelForTokenClassification  # for TensorFlow
+from transformers import pipeline
+
+print('start')
+#---
+# NOTE: custom encoder so json.dumps can serialize numpy float/int/array values
+class NumpyFloatValuesEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.float32):
+            return float(obj)
+        if isinstance(obj, np.integer):
+            return int(obj)
+        if isinstance(obj, np.floating):
+            return float(obj)
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        return json.JSONEncoder.default(self, obj)
+#json.dumps(d, cls=NumpyFloatValuesEncoder)
+#----
+
+#---
+text_arr = []
+print('loading data')
+content_file = open('./mj/oil_domain.json', "r", encoding='utf-8')
+oil_data = json.load(content_file)
+for qan in oil_data:
+    for sec in oil_data[qan]:
+        qs_id = sec["id"]
+        if type(sec["graph_models"]) is list:
+            for sh in sec["graph_models"]:
+                rule_id = sh["id"]
+                rule = sh['rule']
+                text_arr.append({"id": qs_id, "rule_id": rule_id, "rule": rule})
+        else:
+            for sh in sec["graph_models"]:
+                rule = sec["graph_models"][sh]['rule']
+                text_arr.append({"id": qs_id, "rule_id": sh, "rule": rule})
+content_file.close()
+remained = len(text_arr)
+#text_arr[0]['content']
+#text_arr[0]['ner']
+#---
+
+device = "cpu"
+if torch.cuda.is_available():
+    device = "cuda"
+
+#model_name_or_path = "HooshvareLab/albert-fa-zwnj-base-v2"
+#max_position_embeddings = 512
+
+model_name_or_path = "sharif-dal/dal-bert"
+max_position_embeddings = 258
+
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+model = AutoModel.from_pretrained(model_name_or_path)  # PyTorch
+# model = TFAutoModelForTokenClassification.from_pretrained(model_name_or_path)  # TensorFlow
+model.to(device)
+
+def encode(text: str, max_length: int = max_position_embeddings):
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
+    #print(inputs.tokens())
+    inputs = inputs.to(device)
+    with torch.no_grad():
+        model_output = model(**inputs, return_dict=True)
+    # Perform pooling: use the [CLS] token embedding as the sentence representation
+    embeddings = model_output.last_hidden_state[0][0]
+    #embeddings = embeddings.squeeze(0)
+    return embeddings.detach().cpu().numpy()
+X = []
+for item in tqdm(text_arr):
+    content = item['rule']
+    embedding = encode(content)
+    #item['embedding'] = embedding.reshape(1, -1)
+    X.append(embedding)
+
+######
+from sklearn.cluster import KMeans
+
+clusterer = KMeans(n_clusters=100, random_state=0)  # algorithm: {"lloyd", "elkan"}, default="lloyd"
+cluster_labels = clusterer.fit_predict(X)
+for indx, item in enumerate(text_arr):
+    cluster = cluster_labels[indx]
+    item['cluster'] = cluster
+#####
+
+
+filename = "cluster_km_oil_{}.json".format(model_name_or_path.replace("/", "_"))
+similarity_file = open(filename, "w", encoding='utf-8')
+similarity_file.write(json.dumps(text_arr, ensure_ascii=False, cls=NumpyFloatValuesEncoder))  # , indent=4
+similarity_file.close()
+print('end')
\ No newline at end of file
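# Optional, hedged sketch for the KMeans step above: n_clusters=100 is hard-coded in
# oil_domain_clustering.py, so a quick silhouette sweep can sanity-check that choice.
# X is the list of dal-bert embeddings built in that script; the candidate counts
# below are illustrative, not tuned values.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

for n in (50, 100, 150):
    labels = KMeans(n_clusters=n, random_state=0).fit_predict(X)
    print(n, silhouette_score(X, labels, metric="cosine"))  # higher is better-separated clustering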
diff --git a/oil_domain_nearest_02.py b/oil_domain_nearest_02.py
new file mode 100644
index 0000000..7454d64
--- /dev/null
+++ b/oil_domain_nearest_02.py
@@ -0,0 +1,168 @@
+import json
+from tqdm import tqdm
+import numpy as np
+import torch
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import AutoTokenizer
+from transformers import AutoModel  # for PyTorch
+from transformers import TFAutoModelForTokenClassification  # for TensorFlow
+from transformers import pipeline
+import os
+from datasets import Dataset, load_from_disk
+
+
+print('start')
+#---
+# NOTE: custom encoder so json.dumps can serialize numpy float/int/array values
+class NumpyFloatValuesEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.float32):
+            return float(obj)
+        if isinstance(obj, np.integer):
+            return int(obj)
+        if isinstance(obj, np.floating):
+            return float(obj)
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        return json.JSONEncoder.default(self, obj)
+#json.dumps(d, cls=NumpyFloatValuesEncoder)
+#----
+
+#---
+text_arr = []
+print('loading data')
+content_file = open('./mj/oil_domain.json', "r", encoding='utf-8')
+oil_data = json.load(content_file)
+for qan in oil_data:
+    for sec in oil_data[qan]:
+        qs_id = sec["id"]
+        if type(sec["graph_models"]) is list:
+            for sh in sec["graph_models"]:
+                rule_id = sh["id"]
+                rule = sh['rule']
+                text_arr.append({"id": qs_id, "rule_id": rule_id, "rule": rule})
+        else:
+            for sh in sec["graph_models"]:
+                rule = sec["graph_models"][sh]['rule']
+                text_arr.append({"id": qs_id, "rule_id": sh, "rule": rule})
+
+content_file.close()
+remained = len(text_arr)
+#text_arr[0]['content']
+#text_arr[0]['ner']
+#---
+
+device = "cpu"
+if torch.cuda.is_available():
+    device = "cuda"
+
+#model_name_or_path = "HooshvareLab/albert-fa-zwnj-base-v2"
+#max_position_embeddings = 512
+
+#model_name_or_path = "sharif-dal/dal-bert"
+#model_name_or_path = "jinaai/jina-embeddings-v3"
+model_name_or_path = "BAAI/bge-m3"
+#model_name_or_path = "../../BERT/finetune/MODELS/roberta-fa-zwnj-base-law-2-pt"
+
+if model_name_or_path == "../../BERT/finetune/MODELS/roberta-fa-zwnj-base-law-2-pt":
+
+    if not os.path.exists(model_name_or_path + '/model.safetensors') or not os.path.exists(model_name_or_path + '/tokenizer.json'):
+        print('model files do not exist in the model path directory.')
+        exit(0)
+
+    # Mean Pooling - take the attention mask into account for correct averaging
+    def mean_pooling(model_output, attention_mask):
+        token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+    # Load model from HuggingFace Hub
+    tokenizer_bert = AutoTokenizer.from_pretrained(model_name_or_path)
+    model_bert = AutoModel.from_pretrained(model_name_or_path)
+
+    def get_embedding(sentences):
+        # Tokenize sentences
+        encoded_input = tokenizer_bert(sentences, padding=True, truncation=True, return_tensors='pt')
+        # Compute token embeddings
+        with torch.no_grad():
+            model_output = model_bert(**encoded_input)
+        # Perform pooling. In this case, mean pooling.
+        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+
+        return sentence_embeddings
+elif model_name_or_path == "jinaai/jina-embeddings-v3":
+    from sentence_transformers import SentenceTransformer
+    embedder = SentenceTransformer(model_name_or_path, trust_remote_code=True)
+    #tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+    def get_embedding(text):
+        embedding1 = embedder.encode(text)
+        return embedding1
+
+elif model_name_or_path == 'BAAI/bge-m3':
+    from FlagEmbedding import BGEM3FlagModel
+    #tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+    score_model = BGEM3FlagModel(model_name_or_path, use_fp16=True)  # , devices=device
+
+    def get_embedding(text):
+        output_1 = score_model.encode(text, return_dense=True, return_sparse=False, return_colbert_vecs=False)
+        return output_1['dense_vecs']
+
+elif model_name_or_path == "sharif-dal/dal-bert":
+    max_position_embeddings = 258
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+    model = AutoModel.from_pretrained(model_name_or_path)  # PyTorch
+    # model = TFAutoModelForTokenClassification.from_pretrained(model_name_or_path)  # TensorFlow
+    model.to(device)
+
+    def get_embedding(text: str, max_length: int = max_position_embeddings):
+        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
+        #print(inputs.tokens())
+        inputs = inputs.to(device)
+        with torch.no_grad():
+            model_output = model(**inputs, return_dict=True)
+        # Perform pooling: use the [CLS] token embedding as the sentence representation
+        embeddings = model_output.last_hidden_state[0][0]
+        #embeddings = embeddings.squeeze(0)
+        return embeddings.detach().cpu().numpy()
+
+######
+corpus_embeddings = []
+for item in tqdm(text_arr):
+    id = item['id']
+    rule_id = item['rule_id']
+    rule = item['rule']
+    embedding = get_embedding(rule)
+    corpus_embeddings.append({'embedding': embedding, 'rule': rule, 'id': id, 'rule_id': rule_id})
+data = Dataset.from_list(corpus_embeddings)
+udata = data.add_faiss_index('embedding')
+
+
+k = 20
+related_data = []
+for item in tqdm(corpus_embeddings):
+    id = item['id']
+    rule_id = item['rule_id']
+    rule = item['rule']
+    embedding = item['embedding']
+    scores, retrieved_rules = udata.get_nearest_examples(  # retrieve results
+        'embedding', embedding,  # compare our embedded query with the dataset embeddings
+        k=k                      # get only the top k results
+    )
+
+
+    related_data.append({'rule': rule, 'id': id, 'rule_id': rule_id,
+                         'retrieved_ids': retrieved_rules['id'], 'retrieved_rule_ids': retrieved_rules['rule_id'],
+                         'retrieved_rules': retrieved_rules['rule'], 'retrieved_scores': scores})
+
+
+#####
+# for item in tqdm(text_arr):
+#     for rr in item['retrieved_rules']:
+#         rr['embedding'] = []
+#####
+
+filename = "similar_{}_oil_{}.json".format(k, model_name_or_path.replace("/", "_"))
+similarity_file = open(filename, "w", encoding='utf-8')
+similarity_file.write(json.dumps(related_data, ensure_ascii=False, cls=NumpyFloatValuesEncoder))  # , indent=4
+similarity_file.close()
+print('end')
\ No newline at end of file
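# Hedged sketch: the FAISS index built above (udata) can also answer ad-hoc queries
# that are not part of the corpus. The query string is hypothetical; get_embedding()
# is whichever encoder oil_domain_nearest_02.py selected (BAAI/bge-m3 by default).
query = "قرارداد فروش نفت خام"
query_embedding = get_embedding(query)
scores, hits = udata.get_nearest_examples('embedding', query_embedding, k=5)
for score, rid, text in zip(scores, hits['rule_id'], hits['rule']):
    print(score, rid, text[:80])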
diff --git a/oil_domain_reranking_03.py b/oil_domain_reranking_03.py
new file mode 100644
index 0000000..c5d7133
--- /dev/null
+++ b/oil_domain_reranking_03.py
@@ -0,0 +1,83 @@
+import json
+from tqdm import tqdm
+import numpy as np
+import torch
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import AutoTokenizer
+from transformers import AutoModel  # for PyTorch
+from transformers import TFAutoModelForTokenClassification  # for TensorFlow
+from transformers import pipeline
+import os
+from datasets import Dataset, load_from_disk
+
+
+print('start')
+#---
+# NOTE: custom encoder so json.dumps can serialize numpy float/int/array values
+class NumpyFloatValuesEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.float32):
+            return float(obj)
+        if isinstance(obj, np.integer):
+            return int(obj)
+        if isinstance(obj, np.floating):
+            return float(obj)
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        return json.JSONEncoder.default(self, obj)
+#json.dumps(d, cls=NumpyFloatValuesEncoder)
+#----
+
+
+from FlagEmbedding import FlagReranker
+import os
+#os.environ['HUGGING_FACE_HUB_TOKEN'] = "hf_VeCSxLxSCVlt..."
+os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
+
+reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)  # setting use_fp16=True speeds up computation with a slight quality degradation
+
+
+
+
+def search_rerank(rule, sim_rules, rule_ids, rerank_k: int = 4):
+    z_results = [[rule, i] for i in sim_rules]
+    # with normalize=True the raw scores are mapped into 0-1 by applying a sigmoid
+    scores = reranker.compute_score(z_results, normalize=True)
+    s_results = sorted(zip(scores, z_results, rule_ids), key=lambda x: x[0], reverse=True)
+    s_results2 = s_results[:rerank_k]
+    results = [[i[0], i[1][1], i[2]] for i in s_results2]
+    return results
+
+
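# Hedged illustration of search_rerank(): the cross-encoder scores (rule, candidate)
# pairs and only the rerank_k best are kept. The Persian strings and ids are made up.
candidates = ["ماده نمونه اول", "ماده نمونه دوم", "ماده نمونه سوم"]
candidate_ids = ["r1", "r2", "r3"]
top = search_rerank("ماده مورد جستجو", candidates, candidate_ids, rerank_k=2)
for score, text, rid in top:
    print(round(score, 3), rid, text)  # each entry is [normalized score, candidate rule text, rule id]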
+#---
+k = 10
+related_data = []
+print('loading data')
+model_name_or_path = "similar_20_oil_BAAI_bge-m3"
+#model_name_or_path = "similar_20_oil_jinaai_jina-embeddings-v3"
+content_file = open(f'./mj/{model_name_or_path}.json', "r", encoding='utf-8')
+oil_data = json.load(content_file)
+for qan in tqdm(oil_data):
+    id = qan["id"]
+    rule_id = qan["rule_id"]
+    rule = qan["rule"]
+    retrieved_ids = []
+    retrieved_rule_ids = []
+    retrieved_rules = []
+    for relateds in zip(qan["retrieved_ids"], qan["retrieved_rule_ids"], qan["retrieved_rules"]):
+        if relateds[0] != id:  # drop candidates that come from the same section as the query rule
+            retrieved_ids.append(relateds[0])
+            retrieved_rule_ids.append(relateds[1])
+            retrieved_rules.append(relateds[2])
+    reranked = search_rerank(rule, retrieved_rules, retrieved_rule_ids, k)
+    related_data.append({"rule_id": rule_id, "rule": rule, "retrieved_rules": reranked})
+
+content_file.close()
+remained = len(related_data)
+print(remained)
+##########
+filename = "./mj/reranked_{}_oil_{}.json".format(k, model_name_or_path)
+similarity_file = open(filename, "w", encoding='utf-8')
+similarity_file.write(json.dumps(related_data, ensure_ascii=False, cls=NumpyFloatValuesEncoder, indent=4))
+similarity_file.close()
+print('end')
\ No newline at end of file
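# Hedged sketch for inspecting the final pipeline output; the path simply instantiates
# the filename pattern used above with the script's defaults (k=10, bge-m3 retrieval file).
import json

with open('./mj/reranked_10_oil_similar_20_oil_BAAI_bge-m3.json', encoding='utf-8') as f:
    reranked = json.load(f)
print(len(reranked))                       # number of rules that were reranked
print(reranked[0]['rule_id'])              # query rule id
print(reranked[0]['retrieved_rules'][:2])  # top 2 [score, rule text, rule id] entries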