add some readme files
parent f137ba54d2
commit b7f15bd846
@@ -2,19 +2,30 @@
Source for running various NLP processes on the sections of legal documents.
Includes: classification, named-entity recognition, word-embedding extraction, keyword extraction, and text simplification (representation).
"""

from p1_classifier import do_classify
from p2_ner_recognizer import do_ner_recognize
from p3_words_embedder import do_word_embedder
# from p4_keyword_extractor import do_keyword_extract
# from p5_simplifier import do_simplify
from p4_keyword_extractor import do_keyword_extract
from p5_simplifier import do_representation

from elastic_helper import ElasticHelper
import json

def get_sections():
    sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
    eh_obj = ElasticHelper()
    sections = eh_obj.iterateJsonFile(sections_path, True)
    sections = convert_to_dict(sections)

    # region read all sections from the JSON file
    # sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
    # eh_obj = ElasticHelper()
    # sections = eh_obj.iterateJsonFile(sections_path, True)
    # sections = convert_to_dict(sections)
    # endregion

    # region read a limited number of sections from a JSON file
    with open('./data/recent_sections.json', 'r', encoding='utf-8') as file:
        sections = json.load(file)
    # endregion

    return sections

def convert_to_dict(sections):

@@ -31,6 +42,15 @@ def main():
    # get sections to do nlp processes
    sections = get_sections()

    # temp_sections = {}
    # for i, item in enumerate(sections):
    #     if i>3:
    #         break
    #     temp_sections[item] = sections[item]

    # sections = temp_sections

    # dictsections = {}
    # for item in sections:
    #     if not item['id'] == 'qs2180272':

@@ -46,13 +66,21 @@ def main():
    sections = do_ner_recognize(sections)

    # 3. word embedder
    # sections = do_word_embedder(sections)
    sections = do_word_embedder(sections)

    # 4. keyword extract
    # result_kw = do_keyword_extract(sections)
    # keyword_extract_result, sections = do_keyword_extract(sections)
    # if keyword_extract_result:
    #     print(f'keyword extraction finished successfully!')

    # 5. simplify
    # result_simp = do_simplify(sections)
    # representation_result, sections = do_representation(sections)
    # if representation_result:
    #     print(f'representation finished successfully!')

    with open(f'./data/sections_full_metadata.json', 'w', encoding='utf-8') as output_file:
        data = json.dumps(sections, ensure_ascii=False, indent=2)
        output_file.write(data)

    print('all nlp processes finished successfully!')
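Taken together, these hunks enable the word-embedding step in the live pipeline and switch the keyword and representation steps to their new tuple-returning entry points (still commented out). A minimal sketch of the enabled flow, assuming the imports and signatures shown above; the wrapper name below is only illustrative:

```python
# Sketch only: the enabled portion of main() as this diff leaves it.
import json
from p1_classifier import do_classify
from p2_ner_recognizer import do_ner_recognize
from p3_words_embedder import do_word_embedder

def run_enabled_steps(sections: dict) -> dict:
    sections = do_classify(sections)        # 1. adds 'ai_codes' to each section
    sections = do_ner_recognize(sections)   # 2. adds 'ners_v2' to each section
    sections = do_word_embedder(sections)   # 3. adds 'embeddings' to each section
    with open('./data/sections_full_metadata.json', 'w', encoding='utf-8') as output_file:
        output_file.write(json.dumps(sections, ensure_ascii=False, indent=2))
    return sections
```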
normalizer copy.py (1370 changed lines; file diff suppressed because it is too large)
@@ -4,20 +4,18 @@
"""
from transformers import pipeline
from normalizer import cleaning
from elastic_helper import ElasticHelper
import transformers
import json
import datetime
import pandas as pd
from transformers import AutoTokenizer
print(transformers.__version__)
print(f'transformers version: {transformers.__version__}')

# finetuned model for classification path
model_checkpoint = './models/classifier/findtuned_classification_hoosh_with_path_v2__30/checkpoint-1680'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# window_size = tokenizer.model_max_length#512#200
print(f'Classification Model Loaded: {model_checkpoint}')
window_size = 512
"""
(i.e., the window size) because the number of tokens fed to the model is limited by the variable above

@@ -27,8 +25,8 @@ window_size = 512
step_size = 350#100
# number of classes requested from the model for each section
Top_k = 4

classifier = pipeline("text-classification", model_checkpoint, framework="pt")
# set device = 0 => to use GPU
classifier = pipeline("text-classification", model_checkpoint, framework="pt", device=0)

def get_class(sentences, top_k:int=4):
    # sentences = cleaning(sentences)

@@ -177,14 +175,15 @@ def do_classify(sections):

    for index, id in enumerate(sections):

        source = sections[id]['source']
        source = sections[id]
        classification_result, classification_status, desc = single_section_classification(id, source)

        if not classification_status:
            print(f'id: {id} classification error. error description: {desc}')

        # arrange the predicted classes into a best class and other classes, based on the model's score, and store them in the dictionary
        new_sections_dict[id] = classification_result
        # new_sections_dict[id] = classification_result
        sections[id]['ai_codes'] = classification_result

        """For test runs that evaluate the classifier, the law titles are stored so that a variety of different laws is covered and repeated checks of many sections from the same law are avoided."""
        # qanon_title = source['qanon_title']

@@ -198,8 +197,8 @@ def do_classify(sections):
    print(f'end: {datetime.datetime.now()}')
    print('classification finished!')

    classified_sections_dict = new_sections_dict
    # classified_sections_dict = new_sections_dict

    return classified_sections_dict
    return sections
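The second hunk moves the classifier onto the GPU. A hedged sketch of how the resulting pipeline object is typically queried for the top classes; the call pattern is standard transformers usage, and the exact labels depend on the fine-tuned checkpoint:

```python
from transformers import pipeline

# fine-tuned classifier path taken from the hunk above
model_checkpoint = './models/classifier/findtuned_classification_hoosh_with_path_v2__30/checkpoint-1680'

classifier = pipeline("text-classification", model_checkpoint,
                      framework="pt", device=0)  # device=0 -> first GPU

# top_k asks the pipeline for the k highest-scoring labels
preds = classifier("متن نمونه از یک ماده قانونی", top_k=4)  # a sample Persian legal clause
# preds looks like: [{'label': 'ClassA', 'score': 0.85}, {'label': 'ClassB', 'score': 0.10}, ...]
```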
@@ -15,7 +15,7 @@ model = "./models/ner/2025-07-22--20-44-37--HooshvareLab--bert-fa-base-uncased-n
tagger = SequenceTagger.load(model)
print('model read and tagger initialized')

today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'

def prepare_data(ner_obj_list):
    ner_data_list = []

@@ -208,7 +208,7 @@ def do_ner_recognize(sections):
        ner_data_list = prepare_data(ner_obj_list)
        sections[id]['ners_v2'] = ner_data_list
        print(f'ner process: {index+1}/{len_sections}')
    print(f'len_sections ner recognization finished!')
    print(f'{len_sections} ner recognization finished!')

    return sections
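The same timestamp fix recurs in several of the files in this commit. A small illustration of why the old pattern was wrong: attributes accessed on the `datetime.datetime` class are descriptors, not today's values.

```python
import datetime

# Old pattern: formats the class attribute itself, not the current year
broken = f'{datetime.datetime.year}'
# -> something like "<attribute 'year' of 'datetime.date' objects>"

# Fixed pattern from the hunk above: take the values from now()
now = datetime.datetime.now()
today = f'{now.year}-{now.month}-{now.day}-{now.hour}'  # e.g. "2025-7-23-14"
```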
@@ -6,7 +6,8 @@ import json
import datetime
import numpy as np

today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
date = datetime.datetime.now()
today = f'{date.year}-{date.month}-{date.day}-{date.hour}'

model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25
# model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'#87-30

@@ -19,14 +20,16 @@ model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25
model = SentenceTransformer(model_name)

def do_word_embedder(sections):
    for index, item in enumerate(sections):
        embeddings = single_section_embedder(sections[item]['content'])
        sections[item]['embeddings'] = embeddings
    for index, id in enumerate(sections):
        embeddings = single_section_embedder(sections[id]['content'])
        sections[id]['embeddings'] = embeddings.tolist()

    with open(f'./data/embeddings/sections_embeddings_{today}.json', 'w', encoding='utf-8') as output_file:
        data = json.dumps(sections, ensure_ascii=False)
        output_file.write(data)

    return sections

def single_section_embedder(sentence):
    """
    This method converts the input text into its corresponding embedding vector.
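The added `.tolist()` matters because the encoder returns a NumPy array, which `json.dumps` cannot serialize. A minimal illustration:

```python
import json
import numpy as np

embedding = np.array([0.123, 0.456, 0.789])

# json.dumps({"embeddings": embedding})               # TypeError: ndarray is not JSON serializable
print(json.dumps({"embeddings": embedding.tolist()})) # works: plain Python floats
```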
@@ -15,7 +15,7 @@ os.environ['HF_HOME'] = "/home/admin/HFHOME"
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#model_id = "meta-llama/Llama-3.1-70B-Instruct"

today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16

@@ -61,7 +61,7 @@ def generate(formatted_prompt):
# Gemini Prompt
USER_PROMPT = f"""From the provided text, extract at least {keywords_count} important, meaningful key phrases. The output must be:
• a Persian list
• with a sequence number at the start of each phrase
• with no digits, marks, or symbols at the beginning or end of any key phrase
• each phrase on its own line
• with no extra explanation before or after the answer
The following requirements apply when extracting the key phrases:

@@ -94,7 +94,7 @@ def generate(formatted_prompt):
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        temperature=0.1,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1]:]

@@ -168,9 +168,9 @@ def do_keyword_extract(sections):

        period_ids_text += f"{id} \n"

        print(f"section: {counter}-id: {id}")
        print(f"section kw extracting: {counter} - id: {id}")
        # temp_dict.append(item)
        if counter % 1000 == 0:
        if counter % 5000 == 0:
            outputfile = open(f'./data/keyword/sections_kw_llama8b_{str(file_counter)}_{today}.json', "a+", encoding='utf-8')
            outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=2))
            outputfile.close()

@@ -196,12 +196,13 @@ def do_keyword_extract(sections):
    print(f"elapsed time: {(end_time-start_time)/86400} Days!!! ")
    print("end")

    return True
    operation_result = True
    return operation_result, sections

if __name__ == "__main__":
    print(f'start: {datetime.datetime.now()}')
    sections = get_sections()

    sections = do_keyword_extract(sections)
    operation_result = do_keyword_extract(sections)

    print(f'end: {datetime.datetime.now()}')
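The last hunk changes the return contract of `do_keyword_extract` from a bare boolean to a `(status, sections)` tuple, matching the still-commented call site in the main script. A caller-side sketch under that assumption; the input dictionary here is hypothetical:

```python
# Sketch: consuming the new (status, sections) return of do_keyword_extract
from p4_keyword_extractor import do_keyword_extract

sections = {"qs1": {"content": "..."}}  # hypothetical minimal input
keyword_extract_result, sections = do_keyword_extract(sections)
if keyword_extract_result:
    print('keyword extraction finished successfully!')
```

Note that the `__main__` block shown above still assigns the whole tuple to `operation_result` rather than unpacking it.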
@@ -6,7 +6,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json

today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'

if torch.cuda.is_available():
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

@@ -56,17 +56,20 @@ def single_section_representation(content):

        outputs = model.generate(
            input_ids,
            max_new_tokens=500,
            max_new_tokens=2048,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.7,
            temperature=0.1,
            top_p=0.85,
        )

        response = outputs[0][input_ids.shape[-1]:]
        sentences = tokenizer.decode(response, skip_special_tokens=True)
        # remove duplicate sentences
        sentences = list(set(sentences))

        result = True
        desc = 'operation successful'
        desc = 'Operation successful'
        return result, desc, sentences

    except Exception as error:

@@ -77,7 +80,7 @@ def single_section_representation(content):
def do_representation(sections):
    print(f"start time: {datetime.datetime.now()}")

    for index, id in sections:
    for index, id in enumerate(sections):
        result, desc, sentences = single_section_representation(sections[id]['content'])
        if not result:
            error_content = f'id: {id} - error: {desc}\n'

@@ -85,11 +88,16 @@ def do_representation(sections):
            file.write(error_content)

        sections[id]['represented_sentences'] = sentences
        print(f'representation process. section {index+1}/{len(sections)} - id: {id}')

    with open(f'./data/represent/sections_represent_llama8b_{today}.json', "w", encoding='utf-8') as outputfile:
        outputfile.write(json.dumps(sections, ensure_ascii=False, indent = 4))

    print(f"end time: {datetime.datetime.now()}")
    print(" *** finished! *** ")

    operation_result = True
    return operation_result, sections

if __name__ == "__main__":
    pass
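The loop-header fix matters because iterating a dict directly yields only its keys, so the old two-variable unpack fails on the first key. A tiny illustration:

```python
sections = {"qs1": {"content": "..."}, "qs2": {"content": "..."}}

# for index, id in sections:    # ValueError: tries to unpack the key string "qs1"
#     ...

for index, id in enumerate(sections):  # enumerate supplies the running index
    print(index, id)                   # 0 qs1 / 1 qs2
```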
readme/readme-classifier.md (new file, 71 lines)

@@ -0,0 +1,71 @@
# Section Classification Script

This project provides a Python script (`classification.py`) for classifying text sections with a fine-tuned transformer model. The script suggests the most relevant classes for each section of text, which is useful for legal documents, content categorization, and similar NLP tasks.

## Requirements

Before using this script, install the required libraries:

```bash
pip install transformers pandas
```

You also need a fine-tuned classification model and its tokenizer. Update the `model_checkpoint` path in the script to point to your model.

## How It Works

- The script loads a fine-tuned transformer model for text classification.
- It processes each section of text, splitting long texts into overlapping windows so they fit the model's input size (see the sketch after the function list below).
- For each section, it predicts the top classes and saves the results.

## Main Functions

- `get_class(sentences, top_k=4)`: Classifies a sentence or text and returns the top `k` classes.
- `mean_classes(input_classes)`: Aggregates class results from multiple windows of a long text.
- `get_window_classes(text)`: Splits long texts into windows and aggregates their classification results.
- `single_section_classification(id, section_source)`: Classifies a single section and returns the best class and the other suggested classes.
- `do_classify(sections)`: Classifies all sections in a dictionary and saves the results to a JSON file.
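For long sections, the script slides a fixed-size window over the tokenized text (the companion diff in this commit uses `window_size = 512` and `step_size = 350`, so consecutive windows overlap) and averages the per-window scores. A simplified sketch of that idea, not the exact implementation:

```python
def get_window_classes_sketch(text, window_size=512, step_size=350, top_k=4):
    """Classify overlapping token windows of a long text and aggregate the scores."""
    # assumes the module-level `tokenizer` and the helpers `get_class` /
    # `mean_classes` defined in classification.py
    token_ids = tokenizer(text)["input_ids"]
    if len(token_ids) <= window_size:
        return get_class(text, top_k=top_k)

    per_window = []
    for start in range(0, len(token_ids), step_size):
        window_text = tokenizer.decode(token_ids[start:start + window_size],
                                       skip_special_tokens=True)
        per_window.append(get_class(window_text, top_k=top_k))
        if start + window_size >= len(token_ids):
            break
    return mean_classes(per_window)  # average the scores across windows
```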
## Usage Example

Suppose you have your sections data as a dictionary:

```python
sections = {
    "1": {"content": "First section text", "other_info": {"full_path": "..."}, "qanon_title": "..."},
    "2": {"content": "Second section text", "other_info": {"full_path": "..."}, "qanon_title": "..."}
}
```

You can classify all sections as follows:

```python
from classification import do_classify

result = do_classify(sections)
```

After running, the results will be saved in a JSON file in the `./data/classification/` directory.

## Output Structure

Each section will have a new field `ai_codes` with the classification results:

```json
"1": {
    "content": "First section text",
    "ai_codes": {
        "best-class": {"label": "ClassA", "score": 0.85},
        "other-classes": [
            {"label": "ClassB", "score": 0.10},
            {"label": "ClassC", "score": 0.05}
        ]
    }
}
```

## Notes

- Make sure the model path in `model_checkpoint` is correct and the model files are available.
- The script supports Persian and other languages, depending on your model.
- The output JSON file will be saved in `./data/classification/`.
readme/readme-words-embedder.md (new file, 70 lines)

@@ -0,0 +1,70 @@
# Sentence Embedding Generator

This project provides a Python script (`embedding.py`) for generating sentence embeddings using the Sentence Transformers library.

## Requirements

Before using this script, install the required libraries:

```bash
pip install sentence-transformers numpy
```

## How It Works

- The script uses the pre-trained model `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`.
- There are two main functions:
  - `single_section_embedder(sentence)`: Takes a sentence (string) and returns its embedding as a vector (see the sketch after this list).
  - `do_word_embedder(sections)`: Takes a dictionary of sections (each with a `content` field), generates embeddings for each section, and saves the results as a JSON file.
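For orientation, the embedding returned by the encoder is a NumPy vector (384 dimensions for this MiniLM model), which is why the script converts it with `.tolist()` before writing JSON. A short sketch of that behaviour:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

vector = model.encode("این یک جمله نمونه است.")  # a Persian sample sentence
print(type(vector), vector.shape)                # <class 'numpy.ndarray'> (384,)
print(vector.tolist()[:3])                       # JSON-friendly plain floats
```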
## Usage

### 1. Get Embedding for a Single Sentence

```python
from embedding import single_section_embedder

sentence = "This is a sample sentence."
embedding = single_section_embedder(sentence)
print(embedding)
```

### 2. Generate Embeddings for Multiple Sections and Save to File

Suppose your data is structured like this:

```python
sections = {
    "1": {"content": "First section text"},
    "2": {"content": "Second section text"}
}
```

You can generate and save embeddings as follows:

```python
from embedding import do_word_embedder

result = do_word_embedder(sections)
```

After running, a file named like `sections_embeddings_YEAR-MONTH-DAY-HOUR.json` will be created in the `./data/embeddings/` directory, containing the embeddings for each section.

## Output Structure

The output is a JSON file where each section has its embedding added:

```json
{
    "1": {
        "content": "First section text",
        "embeddings": [0.123, 0.456, ...]
    },
    ...
}
```

## Notes

- Make sure the folder `./data/embeddings/` exists before running the script.
- The script supports the Persian language.
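If you later want to work with the saved vectors as arrays again, they can be loaded back from the output file. A short sketch; the timestamped file name below is only an example of the pattern described above:

```python
import json
import numpy as np

with open('./data/embeddings/sections_embeddings_2025-7-23-14.json', encoding='utf-8') as f:
    sections = json.load(f)

# One row per section, in dictionary insertion order
matrix = np.array([sec["embeddings"] for sec in sections.values()])
print(matrix.shape)  # (number_of_sections, 384)
```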