add to git

This commit is contained in:
hdeldar 2025-07-03 09:59:12 +03:30
commit 4b517ce00e
6 changed files with 445 additions and 0 deletions

7
.gitignore vendored Normal file

@@ -0,0 +1,7 @@
*.jsonl
*.json
__pycache__/*
*.pkl
*.log
caches/*
mj/*

0
README.md Normal file

89
normalizer.py Normal file

@@ -0,0 +1,89 @@
import re

import hazm
from cleantext import clean


def cleanhtml(raw_html):
    # naive regex tag stripper; adequate for removing simple HTML markup
    cleanr = re.compile('<.*?>')
    return re.sub(cleanr, '', raw_html)


normalizer = hazm.Normalizer()
# Pattern matching emoji, pictographs, and invisible formatting characters
weird_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # enclosed characters
    u"\U0001f926-\U0001f937"  # gesture emoji
    u"\U00010000-\U0010ffff"  # remaining supplementary-plane characters
    u"\u200d"                 # zero-width joiner
    u"\u2640-\u2642"          # gender symbols
    u"\u2600-\u2B55"          # miscellaneous symbols
    u"\u23cf"                 # eject button
    u"\u23e9"                 # fast-forward button
    u"\u231a"                 # watch
    u"\u3030"                 # wavy dash
    u"\ufe0f"                 # variation selector-16
    u"\u2066-\u2069"          # directional isolate controls (LRI/RLI/FSI/PDI)
    u"\u200c"                 # zero-width non-joiner (note: ZWNJ is meaningful in Persian orthography)
    "]+", flags=re.UNICODE)
def cleaning(text):
    text = text.strip()
    # regular cleaning
    # text = clean(text,
    #     fix_unicode=True,
    #     to_ascii=False,
    #     lower=True,
    #     no_line_breaks=True,
    #     no_urls=True,
    #     no_emails=True,
    #     no_phone_numbers=True,
    #     no_numbers=False,
    #     no_digits=False,
    #     no_currency_symbols=True,
    #     no_punct=False,
    #     replace_with_url="",
    #     replace_with_email="",
    #     replace_with_phone_number="",
    #     replace_with_number="",
    #     replace_with_digit="0",
    #     replace_with_currency_symbol="",
    # )
    text = clean(text,
                 extra_spaces=True,
                 lowercase=True)
    # cleaning HTML tags
    text = cleanhtml(text)
    # normalizing (hazm)
    text = normalizer.normalize(text)
    # removing weird patterns (emoji, invisible control characters)
    text = weird_pattern.sub(r'', text)
    # removing hashtag signs and extra spaces
    text = re.sub("#", "", text)
    text = re.sub(r"\s+", " ", text)
    return text
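# Quick sanity check (hypothetical sample; the exact output depends on the
# installed cleantext/hazm versions):
# >>> cleaning("<p>سلام    😀 دنیا</p>")
# 'سلام دنیا'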
# with open('./ghavanins.txt', encoding="utf-8") as fp:
#     current_content = fp.read()
# current_content = cleaning(current_content)
# with open('./ghavanins2.txt', 'wb') as f:
#     f.write(current_content.encode('utf-8', 'ignore'))

98
oil_domain_clustering.py Normal file

@@ -0,0 +1,98 @@
import json

import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import AutoModel  # for PyTorch
from transformers import TFAutoModelForTokenClassification  # for TensorFlow (only used by the commented-out TF model below)

print('start')
#---
# NOTE: json cannot serialize numpy scalar/array types; this encoder converts them
class NumpyFloatValuesEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.float32):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
# usage: json.dumps(d, cls=NumpyFloatValuesEncoder)
#----
#---
text_arr = []
print('loading data')
content_file = open('./mj/oil_domain.json', "r", encoding='utf-8')
oil_data = json.load(content_file)
for qan in oil_data:
    for sec in oil_data[qan]:
        qs_id = sec["id"]
        if isinstance(sec["graph_models"], list):
            for sh in sec["graph_models"]:
                rule_id = sh["id"]
                rule = sh['rule']
                text_arr.append({"id": qs_id, "rule_id": rule_id, "rule": rule})
        else:
            # graph_models is a dict keyed by rule id
            for sh in sec["graph_models"]:
                rule = sec["graph_models"][sh]['rule']
                text_arr.append({"id": qs_id, "rule_id": sh, "rule": rule})
content_file.close()
remained = len(text_arr)
#text_arr[0]['content']
#text_arr[0]['ner']
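# Assumed shape of ./mj/oil_domain.json (inferred from the loop above, not verified):
# {"<qanon>": [{"id": ..., "graph_models": [{"id": ..., "rule": ...}, ...]
#               or {"<rule_id>": {"rule": ...}, ...}}, ...]}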
#---
device = "cpu"
if torch.cuda.is_available():
device = "cuda"
#model_name_or_path = "HooshvareLab/albert-fa-zwnj-base-v2"
#max_position_embeddings = 512
model_name_or_path = "sharif-dal/dal-bert"
max_position_embeddings = 258
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModel.from_pretrained(model_name_or_path) # Pytorch
# model = TFAutoModelForTokenClassification.from_pretrained(model_name_or_path) # Tensorflow
model.to(device)
def encode(text: str, max_length: int = max_position_embeddings):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    inputs = inputs.to(device)
    with torch.no_grad():
        model_output = model(**inputs, return_dict=True)
    # Pooling: take the [CLS] token embedding of the (single) input sequence
    embeddings = model_output.last_hidden_state[0][0]
    return embeddings.detach().cpu().numpy()
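# e.g. encode("متن نمونه").shape == (hidden_size,), one vector per rule
# (hidden_size is 768 for BERT-base-sized models; an assumption, check the model config)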
X = []
for item in tqdm(text_arr):
    content = item['rule']
    embedding = encode(content)
    X.append(embedding)
######
from sklearn.cluster import KMeans

clusterer = KMeans(n_clusters=100, random_state=0)  # algorithm: {"lloyd", "elkan"}, default="lloyd"
cluster_labels = clusterer.fit_predict(np.array(X))
for indx, item in enumerate(text_arr):
    item['cluster'] = cluster_labels[indx]
#####
filename = "cluster_km_oil_{}.json".format(model_name_or_path.replace("/", "_"))
similarity_file = open(filename, "w", encoding='utf-8')
similarity_file.write(json.dumps(text_arr, ensure_ascii=False, cls=NumpyFloatValuesEncoder))  # pass indent=4 to pretty-print
similarity_file.close()
print('end')
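# Possible downstream step (a sketch, not part of this commit's pipeline):
# group the dumped records by their assigned cluster.
# from collections import defaultdict
# with open(filename, encoding='utf-8') as f:
#     records = json.load(f)
# clusters = defaultdict(list)
# for rec in records:
#     clusters[rec['cluster']].append(rec['rule_id'])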

168
oil_domain_nearest_02.py Normal file

@@ -0,0 +1,168 @@
import json
import os

import numpy as np
import torch
from datasets import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import AutoModel  # for PyTorch
from transformers import TFAutoModelForTokenClassification  # for TensorFlow (only used by the commented-out TF model below)

print('start')
#---
# NOTE: json cannot serialize numpy scalar/array types; this encoder converts them
class NumpyFloatValuesEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.float32):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
# usage: json.dumps(d, cls=NumpyFloatValuesEncoder)
#----
#---
text_arr = []
print('loading data')
content_file = open('./mj/oil_domain.json', "r", encoding='utf-8')
oil_data = json.load(content_file)
for qan in oil_data:
    for sec in oil_data[qan]:
        qs_id = sec["id"]
        if isinstance(sec["graph_models"], list):
            for sh in sec["graph_models"]:
                rule_id = sh["id"]
                rule = sh['rule']
                text_arr.append({"id": qs_id, "rule_id": rule_id, "rule": rule})
        else:
            # graph_models is a dict keyed by rule id
            for sh in sec["graph_models"]:
                rule = sec["graph_models"][sh]['rule']
                text_arr.append({"id": qs_id, "rule_id": sh, "rule": rule})
content_file.close()
remained = len(text_arr)
#text_arr[0]['content']
#text_arr[0]['ner']
#---
device = "cpu"
if torch.cuda.is_available():
device = "cuda"
#model_name_or_path = "HooshvareLab/albert-fa-zwnj-base-v2"
#max_position_embeddings = 512
#model_name_or_path = "sharif-dal/dal-bert"
#model_name_or_path = "jinaai/jina-embeddings-v3"
model_name_or_path = "BAAI/bge-m3"
#model_name_or_path = "../../BERT/finetune/MODELS/roberta-fa-zwnj-base-law-2-pt"
if model_name_or_path == "../../BERT/finetune/MODELS/roberta-fa-zwnj-base-law-2-pt":
    if not os.path.exists(model_name_or_path + '/model.safetensors') or not os.path.exists(model_name_or_path + '/tokenizer.json'):
        print('model files do not exist in the model path directory.')
        exit(1)

    # Mean pooling - take the attention mask into account for correct averaging
    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    # Load the local fine-tuned checkpoint
    tokenizer_bert = AutoTokenizer.from_pretrained(model_name_or_path)
    model_bert = AutoModel.from_pretrained(model_name_or_path)

    def get_embedding(sentences):
        # Tokenize sentences
        encoded_input = tokenizer_bert(sentences, padding=True, truncation=True, return_tensors='pt')
        # Compute token embeddings
        with torch.no_grad():
            model_output = model_bert(**encoded_input)
        # Perform pooling. In this case, mean pooling.
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        return sentence_embeddings
elif model_name_or_path == "jinaai/jina-embeddings-v3":
    from sentence_transformers import SentenceTransformer
    embedder = SentenceTransformer(model_name_or_path, trust_remote_code=True)
    #tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    def get_embedding(text):
        return embedder.encode(text)
elif model_name_or_path == 'BAAI/bge-m3':
    from FlagEmbedding import BGEM3FlagModel
    #tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    score_model = BGEM3FlagModel(model_name_or_path, use_fp16=True)  # , devices=device

    def get_embedding(text):
        output_1 = score_model.encode(text, return_dense=True, return_sparse=False, return_colbert_vecs=False)
        return output_1['dense_vecs']
elif model_name_or_path == "sharif-dal/dal-bert":
    max_position_embeddings = 258
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model = AutoModel.from_pretrained(model_name_or_path)  # PyTorch
    # model = TFAutoModelForTokenClassification.from_pretrained(model_name_or_path)  # TensorFlow
    model.to(device)

    def get_embedding(text: str, max_length: int = max_position_embeddings):
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = inputs.to(device)
        with torch.no_grad():
            model_output = model(**inputs, return_dict=True)
        # Pooling: take the [CLS] token embedding
        embeddings = model_output.last_hidden_state[0][0]
        return embeddings.detach().cpu().numpy()
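# Whichever branch is active, get_embedding(text) yields one dense vector per rule
# (1024 dims for BAAI/bge-m3; other models differ, check the model config).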
######
corpus_embeddings = []
for item in tqdm(text_arr):
    rule = item['rule']
    embedding = get_embedding(rule)
    corpus_embeddings.append({'embedding': embedding, 'rule': rule, 'id': item['id'], 'rule_id': item['rule_id']})
data = Dataset.from_list(corpus_embeddings)
udata = data.add_faiss_index('embedding')  # in-memory FAISS index over the 'embedding' column (exact flat index by default)
k = 20
related_data = []
for item in tqdm(corpus_embeddings):
    scores, retrieved_rules = udata.get_nearest_examples(  # retrieve results
        'embedding', item['embedding'],  # compare this rule's embedding with the indexed embeddings
        k=k  # get only the top-k results (the query rule itself is among them)
    )
    # with the default flat index these scores are L2 distances, so smaller means more similar
    related_data.append({'rule': item['rule'], 'id': item['id'], 'rule_id': item['rule_id'],
                         'retrieved_ids': retrieved_rules['id'], 'retrieved_rule_ids': retrieved_rules['rule_id'],
                         'retrieved_rules': retrieved_rules['rule'], 'retrieved_scores': scores})
#####
# for item in tqdm(text_arr):
# for rr in item['retrieved_rules']:
# rr['embedding'] = []
#####
filename = "similar_{}_oil_{}.json".format(k, model_name_or_path.replace("/", "_"))
# note: the rerank script reads this file from ./mj/, so move it there (or write it there directly)
similarity_file = open(filename, "w", encoding='utf-8')
similarity_file.write(json.dumps(related_data, ensure_ascii=False, cls=NumpyFloatValuesEncoder))  # pass indent=4 to pretty-print
similarity_file.close()
print('end')


@@ -0,0 +1,83 @@
import json
import os

import numpy as np
from tqdm import tqdm

print('start')
#---
# NOTE: json cannot serialize numpy scalar/array types; this encoder converts them
class NumpyFloatValuesEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.float32):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
# usage: json.dumps(d, cls=NumpyFloatValuesEncoder)
#----
from FlagEmbedding import FlagReranker

#os.environ['HUGGING_FACE_HUB_TOKEN'] = "hf_VeCSxLxSCVlt..."
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)  # use_fp16=True speeds up computation with a slight quality degradation

def search_rerank(rule, sim_rules, rule_ids, rerank_k: int = 4):
    # score every (query rule, candidate rule) pair with the cross-encoder
    z_results = [[rule, i] for i in sim_rules]
    # normalize=True maps the scores into 0-1 by applying a sigmoid
    scores = reranker.compute_score(z_results, normalize=True)
    # sort candidates by score and keep the top rerank_k
    s_results = sorted(zip(scores, z_results, rule_ids), key=lambda x: x[0], reverse=True)
    s_results2 = s_results[:rerank_k]
    # each result: [score, candidate rule text, candidate rule id]
    results = [[i[0], i[1][1], i[2]] for i in s_results2]
    return results
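# Usage sketch (hypothetical strings and scores, for illustration only):
# top = search_rerank("ماده نمونه", ["ماده مشابه", "ماده نامرتبط"], ["r2", "r3"], rerank_k=2)
# -> [[0.91, "ماده مشابه", "r2"], [0.07, "ماده نامرتبط", "r3"]]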
#---
k = 10
related_data = []
print('loading data')
model_name_or_path = "similar_20_oil_BAAI_bge-m3"
#model_name_or_path = "similar_20_oil_jinaai_jina-embeddings-v3"
content_file = open(f'./mj/{model_name_or_path}.json', "r", encoding='utf-8')
oil_data = json.load(content_file)
for qan in tqdm(oil_data):
    id = qan["id"]
    rule_id = qan["rule_id"]
    rule = qan["rule"]
    retrieved_ids = []
    retrieved_rule_ids = []
    retrieved_rules = []
    # drop retrieved sections that share the query's section id (the query itself among them)
    for relateds in zip(qan["retrieved_ids"], qan["retrieved_rule_ids"], qan["retrieved_rules"]):
        if relateds[0] != id:
            retrieved_ids.append(relateds[0])
            retrieved_rule_ids.append(relateds[1])
            retrieved_rules.append(relateds[2])
    reranked = search_rerank(rule, retrieved_rules, retrieved_rule_ids, k)
    related_data.append({"rule_id": rule_id, "rule": rule, "retrieved_rules": reranked})
content_file.close()
remained = len(related_data)
print(remained)
##########
filename = "./mj/reranked_{}_oil_{}.json".format(k, model_name_or_path)
similarity_file = open(filename, "w", encoding='utf-8')
similarity_file.write(json.dumps(related_data, ensure_ascii=False, cls=NumpyFloatValuesEncoder, indent=4))
similarity_file.close()
print('end')
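# Pipeline recap (as implied by the filenames in this commit):
# 1. oil_domain_nearest_02.py embeds every rule and dumps the 20 nearest
#    neighbours per rule (similar_20_oil_*.json).
# 2. This script drops self-matches and reranks the remaining neighbours with
#    BAAI/bge-reranker-v2-m3, keeping the top 10 per rule
#    (./mj/reranked_10_oil_*.json).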