complete embedder and kw

2025-08-10 18:14:42 +03:30 · 2025-08-10 18:14:42 +03:30 · afe4f0b87e
commit afe4f0b87e
parent b2b1be0a21
5 changed files with 270 additions and 131 deletions
--- a/do_nlp_processes.py
+++ b/do_nlp_processes.py
@ -1,12 +1,12 @@
 """
 سورس اجرای پردازش های مختلف روی اجزای قانونی
-شامل: کلاسیفیکیشن، تشخیص موجودیت های نامدار، استخراج بردار کلمات، استخراج کلیدواژه ها و ساده‌سازی متن
+شامل: کلاسیفیکیشن، تشخیص موجودیت های نامدار، استخراج بردار کلمات، استخراج کلیدواژه ها و ساده‌سازی(بازنمایی) متن
 """
 from p1_classifier import do_classify
 from p2_ner_recognizer import do_ner_recognize
-from p3_words_embedder import do_word_embedder
-from p4_keyword_extractor import do_keyword_extract
-from p5_simplifier import do_simplify
+# from p3_words_embedder import do_word_embedder
+# from p4_keyword_extractor import do_keyword_extract
+# from p5_simplifier import do_simplify

 from elastic_helper import ElasticHelper

@ -21,6 +21,14 @@ def main():
    # get sections to do nlp processes
    sections = get_sections()
    
+    # dictsections = {}
+    # for item in sections:
+        # if not item['id'] == 'qs2180272':
+        #     continue
+        # dictsections[item['id']] = item['source']
+        # break
+    # sections = dictsections
+    
    # 1. classify
    sections = do_classify(sections)
    
@ -28,15 +36,15 @@ def main():
    sections = do_ner_recognize(sections)
    
    # 3. word embedder
-    sections = do_word_embedder(sections)
+    # sections = do_word_embedder(sections)
    
    # 4. keyword extract
-    sections = do_keyword_extract(sections)
+    # result_kw = do_keyword_extract(sections)
    
    # 5. simpify
-    sections = do_simplify(sections)
+    # result_simp = do_simplify(sections)
    
    print('all nlp processes finished successfully!')


-
+# Run the pipeline only when this file is executed as a script, so that
+# importing do_nlp_processes from another module does not start processing.
+if __name__ == '__main__':
+    main()
--- a/p1_classifier.py
+++ b/p1_classifier.py
@ -120,9 +120,9 @@ def do_classify(sections):

    for index, item in enumerate(sections):

-        id = item['id']
+        id = item

-        source = item['source']
+        source = sections[id]
        # اگر نوع سکشن، عنوان یا موخره یا امضاء باشد، نیازی به فرایند کلاسبندی نیست و لیست کلاس های مربوط به این سکشن را خالی قرار می دهیم
        if source['other_info']['full_path'] == 'عنوان' or source['other_info']['full_path'] == 'موخره' or source['other_info']['full_path'] == 'امضاء':
            new_sections_dict[id] ={
--- a/p2_ner_recognizer.py
+++ b/p2_ner_recognizer.py
@ -1,10 +1,15 @@
 """
 سورس تشخیص موجودیت های نامدار برای هر جزء قانون
+این فرایند برای 285 هزار ماده کمتر از دو ساعت زمان می برد
 """

 from flair.data import Sentence
 from flair.models import SequenceTagger
 import re
+import json
+import datetime
+from elastic_helper import ElasticHelper
+from normalizer import cleaning

 model = "./models/ner/2025-07-22--20-44-37--HooshvareLab--bert-fa-base-uncased-ner-peyma/final-model.pt"
 tagger = SequenceTagger.load(model)
@ -175,14 +180,56 @@ def split_sentence(input_sentence):
    
    return parts

+def get_sections():
+    """Load the section dump and return it keyed by section id.
+
+    Reads the archive at the hard-coded path through
+    ``ElasticHelper.iterateJsonFile`` and reshapes the resulting items with
+    ``convert_to_dict``.
+    """
+    helper = ElasticHelper()
+    raw_items = helper.iterateJsonFile(
+        "/home/gpu/data_11/14040423/mj_qa_section.zip", True
+    )
+    return convert_to_dict(raw_items)
+    
 def do_ner_recognize(sections):
+    """Run named-entity recognition on every section and attach the result.
+
+    Args:
+        sections: mapping of section id to a dict that has a ``'content'``
+            key; each entry is updated in place.
+
+    Returns:
+        The same ``sections`` object, with ``'ners_v2'`` set on every entry
+        to the output of ``prepare_data``.
+    """
    len_sections = len(sections)
-    for index, section in enumerate(sections):
-        content = section['content']
+    for index, item in enumerate(sections):
+        content = sections[item]['content']
+        # Only non-empty content is passed through cleaning().
+        if not content == '':
+            content = cleaning(content)
+        # Of the three values returned by inference_main only ner_obj_list is used.
        ner_obj_list, content_ai, ner_result = inference_main(content)
        ner_data_list = prepare_data(ner_obj_list)
-        section['ners_v2'] = ner_data_list
-        print(f'ner process: {section}/{len_sections}')
+        sections[item]['ners_v2'] = ner_data_list
+        print(f'ner process: {index+1}/{len_sections}')
+    # NOTE(review): this f-string has no placeholder, so the literal text
+    # "len_sections" is printed; presumably {len_sections} was intended.
    print(f'len_sections ner recognization finished!')
    
-    return sections
+    return sections
+
+def convert_to_dict(sections):
+    """Re-key an iterable of ``{'id': ..., 'source': ...}`` items by id.
+
+    Returns a dict mapping each item's ``'id'`` to its ``'source'``; when an
+    id occurs more than once the last occurrence wins.
+    """
+    return {entry['id']: entry['source'] for entry in sections}
+
+# Stand-alone entry point: load the sections, run NER over all of them, dump
+# the annotated sections to ./data/ner/sections_ner.json and print start/end
+# timestamps.
+if __name__ == '__main__':
+    
+    print(f'start: {datetime.datetime.now()}')
+    sections = get_sections()
+    
+    # Disabled debugging snippet that narrowed the run to one section id.
+    # dictsections = {}
+    # for item in sections:
+    #     if not item['id'] == 'qs2180272':
+    #         continue
+    #     dictsections[item['id']] = item['source']
+    #     break
+    # sections = dictsections
+    
+    sections = do_ner_recognize(sections)
+    
+    # ensure_ascii=False keeps non-ASCII text readable in the output file.
+    with open('./data/ner/sections_ner.json', 'w', encoding='utf-8') as output:
+        data = json.dumps(sections, ensure_ascii=False, indent=4)
+        output.write(data)
+   
+    print(f'end: {datetime.datetime.now()}')
+        
+    print('finished ner recognization!')
+    
--- a/p3_words_embedder.py
+++ b/p3_words_embedder.py
@ -1,4 +1,30 @@
+"""
+ایجاد بردار جملات - امبدینگ
+"""
+from sentence_transformers import SentenceTransformer
+import json

+def load_embedder_model(model_name):
+    """Instantiate and return a ``SentenceTransformer`` for ``model_name``."""
+    return SentenceTransformer(model_name)

 def do_word_embedder(sections):
-    pass
+    """Compute a sentence embedding for every section and persist the result.
+
+    The embedding of ``sections[id]['content']`` is stored under
+    ``sections[id]['embeddings']`` as a plain list, and the whole mapping is
+    written to ``./data/embeddings/sections_embeddings.json``.
+
+    Args:
+        sections: mapping of section id to a dict with a ``'content'`` key;
+            it is updated in place.
+
+    Returns:
+        The same ``sections`` mapping, so the call can be chained like the
+        other pipeline steps.
+    """
+    model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
+    # Load the model once and reuse it for every section instead of
+    # re-instantiating it per sentence.
+    model = load_embedder_model(model_name)
+    for item in sections:
+        embeddings = get_sentence_embeddings(sections[item]['content'], model)
+        # encode() returns a numpy array by default, which json cannot
+        # serialise; store a plain list instead.
+        if hasattr(embeddings, 'tolist'):
+            embeddings = embeddings.tolist()
+        sections[item]['embeddings'] = embeddings
+
+    with open('./data/embeddings/sections_embeddings.json', 'w', encoding='utf-8') as output_file:
+        output_file.write(json.dumps(sections, ensure_ascii=False))
+
+    return sections
+
+
+def get_sentence_embeddings(sentence, model=None):
+    """Return the embedding of ``sentence`` as produced by ``model.encode``.
+
+    Args:
+        sentence: text to embed.
+        model: an already loaded model exposing ``encode``. When omitted the
+            default multilingual model is loaded for this single call, which
+            is slow; pass a shared instance when embedding many sentences.
+
+    Returns:
+        Whatever ``model.encode(sentence)`` returns.
+    """
+    if model is None:
+        model = load_embedder_model('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
+    return model.encode(sentence)
+
+if __name__ == '__main__':
+    pass
+    
--- a/p4_keyword_extractor.py
+++ b/p4_keyword_extractor.py
@ -1,135 +1,193 @@
-
-
 """
 استخراج واژگان کلیدی از اجزاء قانونی
 """

-from datetime import datetime
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
+import json
+import datetime
 import torch
-import time
-
 import os
+
+# huggingface_hub (pulled in by transformers) resolves its cache location from
+# HF_HOME when it is first imported, so the variable has to be set before the
+# transformers import below; assigning it afterwards does not change the
+# already-computed path.
+os.environ['HF_HOME'] = "/home/admin/HFHOME"
+
+from elastic_helper import ElasticHelper
+from transformers import AutoTokenizer, AutoModel
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

-from normalizer import Normalizer
-from tokenizer import *

-address = os.getcwd()
-
-not_have_two_token_kw_list = []
-
-if torch.cuda.is_available():
-    model_id = "PartAI/Dorna-Llama3-8B-Instruct"
-    #model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-index_name_i = 'semantic_search-v10'
+model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+#model_id = "meta-llama/Llama-3.1-70B-Instruct"


-counter = 0
-total = 0
-remained = 0
-id = ''
-keywords_count = 15
-   
-def generateKeywords(text):
-    global remained
-    try:
-        keywords_count = (len(text) / 1000) * 15
-        keywords_count = int(keywords_count)
-        if keywords_count == 0: 
-            keywords_count = 1
-       
-        
-        messages =  [{"role": "system", "content": "تو یک وکیل حقوق دان هستی و باید بتوانی متن های قانونی و حقوقی را بدون تغییر اصطلاحات فنی، به صورتی توضیح دهی که افراد غیر حقوق دان، معنای متن را درک کنند. " },
-                     {"role": "user", "content": 
-                    '''از "متن" حداقل {} عبارت های کلیدی مهم و پراهمیت را استخراج کن و عبارت های کلیدی را در قالب لیست به زبان فارسی چاپ کن و هر کلید عبارت کلیدی را در یک خط جدید قرار بده و هیچ گونه توضیحی در ابتدا یا انتهای پاسخ، اضافه نکن.
-                    هر عبارت کلیدی دارای یک شماره ترتیبی در ابتدای آن باشد. عبارت های کلیدی، دقیقا در متن موجود باشد. بسیار مهم و ضروری است که طول هر عبارت کلیدی حداقل دو توکن داشته باشد و عبارت کلیدی یک توکنی قابل قبول نیست. تاکید می کنم که هیچ عبارت کلیدی نباید فقط یک توکن داشته باشد. نام سازمان ها و نهادها و اشخاص حقوقی، حتما به عنوان عبارت کلیدی درنظر گرفته شود. هیچ عبارت کلیدی، فعل یا حرف اضافه نباشد و فقط شامل اسم هایی باشد که به هم اضافه شده اند. هیچ عبارت کلیدی نباید با حرف اضافه یا حرف «و» تمام شود. ضروری است که عبارت های کلیدی شامل ماده، بند، تبصره یا تاریخ ها نباشند.
-                    "متن": {}
-                    '''.format(keywords_count, text)
-        }]
-       
-        
-        input_ids = tokenizer.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            return_tensors="pt"
-        ).to(model.device)
+# 8-bit quantisation settings for loading the model.
+# The former `bnb_8bit_use_double_quant`, `bnb_8bit_quant_type` and
+# `bnb_8bit_compute_dtype` arguments were dropped: BitsAndBytesConfig only
+# defines those knobs for 4-bit loading (`bnb_4bit_*`), so the 8-bit spellings
+# were unused extra keyword arguments.
+bnb_config = BitsAndBytesConfig(load_in_8bit=True)

-        terminators = [
-                    tokenizer.eos_token_id,
-                    tokenizer.convert_tokens_to_ids("<|eot_id|>")
-                ]
-        model.generation_config.pad_token_id = tokenizer.pad_token_id
+# Load the tokenizer and the causal LM for `model_id` at import time; the model
+# is loaded with `bnb_config` as its quantization config.
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=bnb_config
+)
+# Token ids passed to generate() as eos_token_id: the tokenizer's EOS token and
+# the "<|eot_id|>" token.
+terminators = [
+    tokenizer.eos_token_id,
+    tokenizer.convert_tokens_to_ids("<|eot_id|>")
+]
+# The EOS token id is used as the padding id for generation.
+model.generation_config.pad_token_id = tokenizer.eos_token_id #tokenizer.pad_token_id

-        
-        outputs = model.generate(
-            input_ids,
-            max_new_tokens=256,
-            eos_token_id=terminators,
-            do_sample=True,
-            temperature=0.6,
-            top_p=0.85,
-        )
-        #lock0.release()
-        response = outputs[0][input_ids.shape[-1]:]
-        keywords = tokenizer.decode(response, skip_special_tokens=True)
-        #lock1.acquire()
-        # resp = es.update(index=index_name_i, id=id, doc={"content_keywords-llama3-str": str(keywords)})
-       
+# SYS_PROMPT = """You receive a Persian legal text and extract from it the keywords that are most important.
+# And you don't need to provide explanations or additional text.
+# Put each keyword on a single line."
+# """#  Explain your answer step by step.

-        return keywords
-
-    except Exception as inst:
-        print(type(inst))    # the exception type
-        print(inst.args)     # arguments stored in .args
-        print("Exception: " + str(inst))
+SYS_PROMPT = """You are a highly accurate and detail-oriented assistant specialized in analyzing Persian legal texts."""#  Explain your answer step by step.
+SYS_PROMPT = """شما یک دستیار متخصص در استخراج عبارات کلیدی از متون حقوقی فارسی هستید. وظیفه شما این است که از متن ارائه شده توسط کاربر، عبارات اسمی کلیدی مهم و معنادار را استخراج کنید."""#  gemini prompt


-def do_keyword_extract(sections):
-    start_time = time.time()
-    print("start_time: "+str(datetime.now()))
-    
-    try:
-        keywords_dict = []
-        
-        count = 1
-        for index, content_item in enumerate(sections):
-            id = content_item
-            # if not id in not_have_two_token_kw_list:
-            #     continue
-            content = content_item['content']
-            content_len = len(content.split())
-            # کنارگذاشتن محتواهای با حجم زیاد
-            if content_len > 2000:
-                print("too long content " + str(id))
-                continue
+def format_prompt(SENTENCE):
+    """Wrap the section text in the user-message template sent to the model."""
+    return f"متن: {SENTENCE}."

-            keywords = generateKeywords(content) 
-            print("section " + str(count) + "/" + str(len(not_have_two_token_kw_list)) + " keyword extracting ... ")
-            content_item['keywords'] = keywords
-            keywords_dict.append({
-                    'id':id,
-                    'keywords':keywords
-                })
-            count+=1
-            
-        # write_to_json(keywords_dict, "../data/sections_kw_110_llama_main.json")
-          
-    except Exception as inst:
-            print(type(inst))    # the exception type
-            print(inst.args)     # arguments stored in .args
-            
-    end_time = time.time()
-    print("end_time: "+ str(datetime.now()))
-    operation_time = (int(end_time-start_time)/60)/60
-    print(f"elapsed time: {operation_time} hours")
-    print(f"Finished!!!")
-    
+def kw_count_calculator(text):
+    """Return how many keywords to request for ``text``.
+
+    The target is 15 keywords per 1000 characters, truncated to an integer and
+    never lower than 1.
+    """
+    return max(1, int((len(text) / 1000) * 15))
+
+def generate(formatted_prompt):
+  """Ask the LLM for key phrases of ``formatted_prompt`` and return its raw reply.
+
+  Builds a three-message chat (system prompt, Persian extraction
+  instructions, then the text itself), samples up to 2048 new tokens and
+  returns the decoded continuation only (the prompt tokens are sliced off).
+  """
+  # The requested keyword count is derived from the character length of the
+  # formatted prompt, measured before the truncation below.
+  keywords_count = kw_count_calculator(formatted_prompt)
+
+  # Gemini Prompt
+  # The instruction asks for at least `keywords_count` numbered, one-per-line
+  # noun phrases of two or more words that occur verbatim in the text, with
+  # no extra explanation, and gives a worked example.
+  USER_PROMPT = f"""از متن ارائه شده، حداقل {keywords_count} عبارت کلیدی مهم و معنادار را استخراج کنید. خروجی باید به صورت زیر باشد:
+•	یک لیست فارسی
+•	شماره ترتیبی در ابتدای هر عبارت 
+•	هر عبارت در یک خط جداگانه *
+•	بدون هیچ توضیح اضافی در ابتدا یا انتهای پاسخ
+موارد زیر در استخراج عبارات کلیدی الزامی است:
+•	بسیار مهم و حیاتی است که عبارات کلیدی باید دقیقاً در متن موجود باشند، بنابراین هرگز از خلاقیت برای ساختن کلیدواژه‌ای که دقیقا در متن وجود ندارد، استفاده نکن. 
+•	طول هر عبارت کلیدی حداقل دو کلمه باشد. عبارات تک-کلمه‌ای قابل قبول نیستند. 
+•	نام سازمان‌ها، مؤسسات و اشخاص حقوقی باید به عنوان عبارات کلیدی در نظر گرفته شوند. 
+•	عبارات کلیدی نباید فعل یا حرف اضافه باشند و فقط باید شامل اسم‌هایی باشند که به هم اضافه شده‌اند (عبارت اسمی). 
+•	عبارات کلیدی نباید به حرف اضافه یا حرف "و" ختم شوند. 
+•	عبارات کلیدی نباید شامل کلمات "ماده"، "تبصره" و "بند" یا تاریخ‌ها باشند. 
+	به عنوان مثال، اگر متن "قانون مدنی جمهوری اسلامی ایران در ماده ۲۲۲ در خصوص مسئولیت مدنی اشخاص حقوقی در تبصره ۱ به موضوع خسارت ناشی از تقصیر کارکنان اشاره دارد." باشد، خروجی باید به شکل زیر باشد:
+1.	قانون مدنی جمهوری اسلامی ایران 
+2.	مسئولیت مدنی اشخاص حقوقی 
+3.	خسارت ناشی از تقصیر کارکنان 
+اکنون متن مورد نظر را دریافت خواهید کرد.
+"""
+  # Cap the text at 50000 characters before it is tokenized.
+  formatted_prompt = formatted_prompt[:50000] # to avoid GPU OOM
+  messages = [
+      {"role":"system","content":SYS_PROMPT},
+      {"role":"user","content":USER_PROMPT},
+      {"role":"user","content":formatted_prompt}
+      ]
+  # tell the model to generate
+  input_ids = tokenizer.apply_chat_template(
+      messages,
+      add_generation_prompt=True,
+      return_tensors="pt"
+  ).to(model.device)
+  # Sampling is enabled, so repeated calls on the same text may differ.
+  outputs = model.generate(
+      input_ids,
+      max_new_tokens=2048,
+      eos_token_id=terminators,
+      do_sample=True,
+      temperature=0.6,
+      top_p=0.9,
+  )
+  # Keep only the tokens produced after the prompt.
+  response = outputs[0][input_ids.shape[-1]:]
+  return tokenizer.decode(response, skip_special_tokens=True)
+
+
+def get_rules(sentence):
+    """Return the model's answer for ``sentence`` as a list of non-empty lines.
+
+    The text is wrapped with ``format_prompt``, sent through ``generate`` and
+    the reply is split on newlines; each line is stripped and blank lines are
+    dropped.
+    """
+    raw_output = generate(format_prompt(sentence))
+    stripped_lines = (line.strip() for line in raw_output.split('\n'))
+    return [line for line in stripped_lines if line]
+
+
+def get_sections():
+    """Load the section dump and return it keyed by section id.
+
+    Reads the archive at the hard-coded path through
+    ``ElasticHelper.iterateJsonFile`` and reshapes the resulting items with
+    ``convert_to_dict``.
+    """
+    helper = ElasticHelper()
+    raw_items = helper.iterateJsonFile(
+        "/home/gpu/data_11/14040423/mj_qa_section.zip", True
+    )
+    sections = convert_to_dict(raw_items)
    return sections

+def convert_to_dict(sections):
+    """Re-key an iterable of ``{'id': ..., 'source': ...}`` items by id.
+
+    Returns a dict mapping each item's ``'id'`` to its ``'source'``; when an
+    id occurs more than once the last occurrence wins.
+    """
+    return {entry['id']: entry['source'] for entry in sections}
+
+def do_keyword_extract(sections, max_sections=10):
+    """Extract keywords for each section with the LLM and write them to disk.
+
+    For every section id in ``sections`` the section's ``'content'`` is sent
+    to ``get_rules`` and the returned list is stored under
+    ``sections[id]['keywords']``. The results are also collected as
+    ``{'id': ..., 'keywords': [...]}`` records and written to
+    ``./data/keyword/sections_kw_llama8b_<n>.json``: one file per 1000
+    successfully processed sections plus a final file for the remainder. The
+    ids contained in each file are appended to
+    ``./data/keyword/prev_kw_ids.txt``.
+
+    Args:
+        sections: dict mapping section id to a dict with a ``'content'`` key;
+            it is updated in place.
+        max_sections: stop after this many sections have been attempted
+            (failed ones count too). The default of 10 keeps the previously
+            hard-coded cap; pass ``None`` to process every section.
+
+    Returns:
+        ``True`` once all output has been written.
+    """
+    start_time = datetime.datetime.now()
+    print(f'start time: {start_time}')
+
+    ids_file_path = './data/keyword/prev_kw_ids.txt'
+    counter = 1
+    file_counter = 1
+    period_sections = []
+    period_ids_text = f'***** file number: {file_counter} *****\n'
+    for section_id in sections:
+        if max_sections is not None and counter > max_sections:
+            break
+
+        content = sections[section_id]['content']
+        try:
+            keywords = get_rules(content)
+        except Exception as error:
+            # Best effort: report the failing section and carry on with the
+            # next one. KeyboardInterrupt is deliberately not caught.
+            print(f"section kw error : {section_id}\n")
+            print(f"{type(error).__name__}: {error}")
+            counter += 1
+            continue
+
+        sections[section_id]['keywords'] = keywords
+        # Collect the record so the batch files actually contain the results.
+        period_sections.append({'id': section_id, 'keywords': keywords})
+        period_ids_text += f"{section_id} \n"
+
+        print(f"section: {counter}-id: {section_id}")
+        counter += 1
+
+        if len(period_sections) == 1000:
+            with open(f'./data/keyword/sections_kw_llama8b_{file_counter}.json', "a+", encoding='utf-8') as outputfile:
+                outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=2))
+            print(f"file {file_counter} created in {datetime.datetime.now()} +++++++++++++++++++++++++\n ")
+
+            # save processed section ids for next executions of this code
+            with open(ids_file_path, 'a+', encoding='utf-8') as file:
+                file.write(period_ids_text)
+
+            # Start a fresh batch only after the previous one has been saved.
+            file_counter += 1
+            period_sections = []
+            period_ids_text = f'***** file number: {file_counter} *****\n'
+
+    # Write whatever is left over from the last, incomplete batch.
+    with open(f'./data/keyword/sections_kw_llama8b_{file_counter}.json', "w", encoding='utf-8') as outputfile:
+        outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=4))
+    print(f"file {file_counter} created in {datetime.datetime.now()} +++++++++++++++++++++++++ ")
+    if period_sections:
+        with open(ids_file_path, 'a+', encoding='utf-8') as file:
+            file.write(period_ids_text)
+
+    end_time = datetime.datetime.now()
+    elapsed_seconds = (end_time - start_time).total_seconds()
+    print(f"end_time: {end_time}")
+    print(f"elapsed time: {elapsed_seconds / 3600} Hours!!! ")
+    print(f"elapsed time: {elapsed_seconds / 86400} Days!!! ")
+    print("end")
+
+    return True
+     
 if __name__ == "__main__":
-    pass
+    # Stand-alone run: load the sections, extract keywords for them and print
+    # start/end timestamps.
+    print(f'start: {datetime.datetime.now()}')
+    sections = get_sections()
+    
+    # NOTE(review): do_keyword_extract returns True, so after this line
+    # `sections` no longer refers to the section dict.
+    sections = do_keyword_extract(sections)
+    
+    print(f'end: {datetime.datetime.now()}')