.

Adding Fast API to def single_section_classification in P3
Adding Fast API to P3
2025-08-26 10:29:49 +03:30 · 2025-08-26 10:23:00 +03:30 · 2025-08-25 19:54:35 +03:30
7 changed files with 82 additions and 204 deletions
--- a/do_nlp_processes.py
+++ b/do_nlp_processes.py
@ -13,6 +13,7 @@ from elastic_helper import ElasticHelper
 import json

 def get_sections():
+    
    # region خواندن کل سکشن ها از فایل جیسون
    # sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
    # eh_obj = ElasticHelper()
--- a/p1_classifier.py
+++ b/p1_classifier.py
@ -4,22 +4,21 @@
 """
 from transformers import pipeline
 from normalizer import cleaning
+from fastapi import FastAPI
+from pydantic import BaseModel
+from typing import Dict, Any, List
 import transformers
 import json
 import datetime
 import pandas as pd
 from transformers import AutoTokenizer
 print(f'transformers version: {transformers.__version__}')
-from elastic_helper import ElasticHelper
-
-date = datetime.datetime.now()
-today = f'{date.year}-{date.month}-{date.day}-{date.hour}'

 #  finetuned model for classification path
 model_checkpoint = './models/classifier/findtuned_classification_hoosh_with_path_v2__30/checkpoint-1680' 

 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
-
+print(f'Classification Model Loaded: {model_checkpoint}')
 window_size = 512
 """
 (یعنی سایز پنجره) به این دلیل که تعداد توکن های ورودی به مدل، محدود به متغیر بالاست
@ -31,55 +30,13 @@ step_size = 350#100
 Top_k = 4 
 # set device = 0 => to use GPU
 classifier = pipeline("text-classification", model_checkpoint, framework="pt", device=0)
-print(f'Classification Model Loaded: {model_checkpoint}')

-def get_sections():
-    """
-    دریافت کل سکشن های قانونی در مسیر مشخص شده
-    """
-    sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
-    eh_obj = ElasticHelper()
-    sections = eh_obj.iterateJsonFile(sections_path, True)
-    sections = convert_to_dict(sections)
-    return sections
-
-def convert_to_dict(sections):
-    """
-    تبدیل لیست سکشن های قانون به دیکشنری
-    """
-    sections_dict = {}
-    for item in sections:
-        id = item['id']
-        source = item['source']
-        sections_dict[id] = source
-        
-    return sections_dict
-
-def get_class(sentence, top_k:int=4):
-    """
-    متدی برای تعیین تعدادی از بهترین کلاس های متناسب با متن ورودی
-    
-    Args:
-        sentence(str): متنی که قرار است مدل، کلاس های آن را تشخیص دهد
-        top_k(int): تعداد کلاس هایی که بر اساس اولویت امتیاز، مدل باید تشخیص دهد
-        
-    Returns:
-        recognized_class(list[obj]): لیستی از آبجکت کلاس ها شامل عنوان و امتیاز هر کلاس
-    """
+def get_class(sentences, top_k:int=4):
    # sentences = cleaning(sentences)
-    recognized_class = classifier(sentence, top_k=top_k, truncation=True, max_length=window_size)
-    return recognized_class
+    out = classifier(sentences, top_k=top_k, truncation=True, max_length=window_size)
+    return out

 def mean_classes(input_classes):
-    """
-    محاسبه کلاس بر اساس میانگین امتیازات کلاس های مختلفی که در پنجره شناور برای یک متن به دست آمده است
-    
-    Args:
-        input_classes(list[obj]): لیستی از آبجکت کلاس ها شامل عنوان کلاس و امتیاز مربوط به آن که مدل تشخیص داده است
-        
-    Returns:
-        top_class(obj): تک آبجکتی شامل عنوان و امتیاز بهترین کلاس تشخیص داده شده برای این متن
-    """
    pass
    all_classes = []
    for cclass in input_classes:
@ -115,32 +72,21 @@ def mean_classes(input_classes):
    return top_n_classes

 def get_window_classes(text):
-    """
-    این متد، متن ورودی را بررسی می کند و اگر اندازه آن بیشتر از اندازه ورودی قابل قبول برای مدل بود، یک پنجره به اندازه ای که قبلا تعریف شده روی متن حرکت می دهد و برای هر حرکت، متن پنجره را کلاس بندی می کند و لیست آبجکت کلاس ها را بر می گرداند
-    در صورتی که متن ورودی از اندازه مدل بزرگتر نبود، متن ورودی بدون ورود به فرایند پنجره ها کلاس بندی می شود
-    
-    Args:
-        text(str): متن ورودی که باید کلاس بندی شود
-        
-    Returns:
-        text_classes(list[obj]): لیست آبجکت ها شامل عنوان و امتیاز کلاس تشخیص داده شده برای این متن
-    """
    text_classes = []
    tokens = tokenizer(text)['input_ids'][1:-1]
-
+    #print(len(tokens))
    if len(tokens) > window_size:
-        for i in range(0, len(tokens), step_size):
+        for i in range(0, len(tokens), step_size):#- window_size + 1
            start_window_slice = tokens[0: i]
            window_slice = tokens[i: i + window_size]
            start_char = len(tokenizer.decode(start_window_slice).replace('[UNK]', ''))
            char_len = len(tokenizer.decode(window_slice).replace('[UNK]', ''))
            context_slice = text[start_char: start_char + char_len]
-            # tokens_len = len(tokenizer(context_slice)['input_ids'][1:-1])
+            tokens_len = len(tokenizer(context_slice)['input_ids'][1:-1])
            # print(f'i: {i},token-len: {tokens_len}', flush=True)
            results = get_class(context_slice, Top_k)
            text_classes.append(results)
            
-        # محاسبه بهترین کلاس ها بر اساس میانگین امتیازات کلاسهای تشخیص داده شده    
        text_classes = mean_classes(text_classes)
    else:
        text_classes = get_class(text, Top_k)
@ -151,12 +97,7 @@ def full_path_text_maker(full_path):
    """
    این متد مسیر یک سکشن را می گیرد و متنی را بر اساس ترتیب بخش های آن از جزء به کل بازسازی می کند و بر می گرداند
    
-    Args:
-        full_path_text(str): متن اولیه از مسیر یک سکشن
-        
-    Returns:
-        full_path_text(str): متن بازسازی شده از مسیر یک سکشن
-        
+    full_path_text متن بازسازی شده از مسیر یک سکشن
    """
    full_path_text = ""
    for i, path_item in enumerate(reversed(full_path)):
@ -167,7 +108,11 @@ def full_path_text_maker(full_path):
    full_path_text = full_path_text.strip()
    return full_path_text

-def single_section_classification(id, section_source):
+
+app = FastAPI()
+
+@app.post("/single_section_classification/")
+def single_section_classification(id: str, section_source: Dict[str, Any]):
    """
    این متد، متن ورودی را کلاسبندی می کند
    
@ -180,26 +125,14 @@ def single_section_classification(id, section_source):
        classification_status(bool): بیان می کند که عملیات کلاس بندی موفق بوده یا خیر
        desc(str): توضیحی در مورد موفقیت یا خطای عملیات ارائه می دهد
    """
-    classification_result ={
-        "best-class":{},
-        "other-classes": []}
+    classification_result = {
+        "best-class": {},
+        "other-classes": []
+    }
    content = section_source['content']
-    # اگر متن سکشن خالی بود، کلاس ها را به صورت خالی برگردان
-    if content =='':
-        return classification_result, True, 'Classification was successful'
-    
    # اگر نوع سکشن، عنوان یا موخره یا امضاء باشد، نیازی به فرایند کلاسبندی نیست و لیست کلاس های مربوط به این سکشن را خالی قرار می دهیم
-    filtered_keys = ['فصل','موخره','امضاء','عنوان']
-    section_path = section_source['other_info']['full_path']
-    if '>' in section_path:
-        path_parts = section_path.split('>')
-        for key in filtered_keys:
-            if key in path_parts[-1]:
-                return classification_result, True, 'Classification was successful'
-    else:
-        for key in filtered_keys:
-            if key in section_path:
-                return classification_result, True, 'Classification was successful'
+    if content == '' or section_source['other_info']['full_path'] == 'عنوان' or section_source['other_info']['full_path'] == 'موخره' or section_source['other_info']['full_path'] == 'امضاء':
+        return {"classification_result": classification_result, "classification_status": True, "desc": 'Classification was successful'}
    
    qanon_title = section_source['qanon_title']
    # این متغیر ریشه سکشن اخیر تا رسیدن به قانون را مشخص می کند
@ -219,34 +152,24 @@ def single_section_classification(id, section_source):
        with open('./data/cleaning_content_log.txt', 'a', encoding='utf-8') as output_file:
            output_file.write(id + " >> " + str(error) + "\n")
            print('cleaning content error!')
-            return classification_result, False, str(error)
+        return {"classification_result": classification_result, "classification_status": False, "desc": str(error)}
    try:
        # دریافت کلاس های مربوط به یک سکشن
        section_classes = get_window_classes(f"{pre_content} {content}")
-        
    except Exception as error:
        error_content = f"{id} -- Classification Error Content:{error}\n"
        with open('./data/classification/errors.txt', 'a', encoding='utf-8') as output_file:
            output_file.write(error_content)
            print(error_content)
-        return classification_result, False, error_content
+        return {"classification_result": classification_result, "classification_status": False, "desc": error_content}

-    classification_result ={
-            "best-class" : section_classes[0],
-            "other-classes" : section_classes[1:]
+    classification_result = {
+        "best-class": section_classes[0],
+        "other-classes": section_classes[1:]
    }
-    return classification_result, True, 'Classification was successful'
+    return {"classification_result": classification_result, "classification_status": True, "desc": 'Classification was successful'}

 def do_classify(sections):
-    """
-    کلاسبندی مجموعه ای از سکشن های قانون به صورت یکجا در صورت نیاز
-    
-    Args:
-        sections(list[obj]): لیستی از آبجکت سکشن های قانونی که در هر آبجکت، متادیتاهای مختلف آن سکشن قرار دارد
-        
-    Returns:
-        sections(list[obj]): لیستی از آبجکت متادیتاهای سکشن های قانونی که در ورودی دریافت کرده بود که در این پردازش، متادیتای مربوط به کلاس نیز به آنها اضافه شده است
-    """
    print(f'start classification: {datetime.datetime.now()}')

    test_counter = 1
@ -274,7 +197,7 @@ def do_classify(sections):
        # qanon_title_list.append(qanon_title)
        print(f'section: {all}/{index+1}/{id}', flush=True)
    # ذخیره دیکشنری شناسه های قانون و کلاس های تخمین زده شده در فایل جیسون
-    with open('./data/classification/all_sections_classes_new_1404--.json', 'w', encoding='utf-8') as output_file:
+    with open('./data/classification/all_sections_classes_new_140405.json', 'w', encoding='utf-8') as output_file:
        json_data = json.dumps(new_sections_dict, indent=4, ensure_ascii=False)
        output_file.write(json_data)
        
@ -285,17 +208,4 @@ def do_classify(sections):
    
    return sections

-if __name__ == '__main__':

-    sections = get_sections()
-    
-    # اجرای عملیات کلاس بندی
-    classified_sections = do_classify(sections)
-    
-    with open(f'classified_sections_{today}.json', 'w', encoding='utf-8') as output:
-        data = json.dumps(classified_sections, ensure_ascii=False, indent=4)
-        output.write(data)
-   
-    print(f'end: {datetime.datetime.now()}')
-        
-    print('finished ner recognization!')
--- a/p2_ner_recognizer.py
+++ b/p2_ner_recognizer.py
@ -93,6 +93,8 @@ def find_ner_values_in_text(text, ner_values):
                            'ner_score'      : float(ner_score.strip()),
                            #'ner_tokens'     : ner_tokens,
                            })
+            # if law_id != 0:
+            #     ner_obj[len(ner_obj)-1]['ner_law_id']= law_id

    return ner_obj

@ -212,9 +214,6 @@ def do_ner_recognize(sections):
    return sections

 def get_sections():
-    """
-    دریافت کل سکشن های قانونی در مسیر مشخص شده
-    """
    sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
    eh_obj = ElasticHelper()
    sections = eh_obj.iterateJsonFile(sections_path, True)
@ -222,9 +221,6 @@ def get_sections():
    return sections

 def convert_to_dict(sections):
-    """
-    تبدیل لیست سکشن های قانون به دیکشنری
-    """
    sections_dict = {}
    for item in sections:
        id = item['id']
@ -248,7 +244,7 @@ if __name__ == '__main__':
    
    sections = do_ner_recognize(sections)
    
-    with open(f'sections_ner_{today}.json', 'w', encoding='utf-8') as output:
+    with open(f'./data/ner/sections_ner_{today}.json', 'w', encoding='utf-8') as output:
        data = json.dumps(sections, ensure_ascii=False, indent=4)
        output.write(data)
   
--- a/p3_words_embedder.py
+++ b/p3_words_embedder.py
@ -2,10 +2,10 @@
 ایجاد بردار جملات - امبدینگ
 """
 from sentence_transformers import SentenceTransformer
+from fastapi import FastAPI
 import json
 import datetime
 import numpy as np
-from elastic_helper import ElasticHelper

 date = datetime.datetime.now()
 today = f'{date.year}-{date.month}-{date.day}-{date.hour}'
@ -20,32 +20,7 @@ model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25
 # model_name = 'HooshvareLab/bert-base-parsbert-uncased'#90-54
 model = SentenceTransformer(model_name)

-def get_sections():
-    """
-    دریافت کل سکشن های قانونی در مسیر مشخص شده
-    """
-    sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
-    eh_obj = ElasticHelper()
-    sections = eh_obj.iterateJsonFile(sections_path, True)
-    sections = convert_to_dict(sections)
-    return sections
-
-def convert_to_dict(sections):
-    """
-    تبدیل لیست سکشن های قانون به دیکشنری
-    """
-    sections_dict = {}
-    for item in sections:
-        id = item['id']
-        source = item['source']
-        sections_dict[id] = source
-        
-    return sections_dict
-
 def do_word_embedder(sections):
-    """
-    لیستی از آبجکت سکشن های قانون را دریافت می کند و متن سکشن ها را تبدیل به بردار می کند و در نهایت لیست سکشن ها که امبدینگ هم  به آنها اضافه شده را در مسیر مشخص شده در همین متد، ذخیره می کند
-    """
    for index, id in enumerate(sections):
        embeddings = single_section_embedder(sections[id]['content'])
        sections[id]['embeddings'] = embeddings.tolist()
@ -56,18 +31,40 @@ def do_word_embedder(sections):
    
    return sections
    
-def single_section_embedder(sentence):
-    """
-    این متد، متن ورودی را تبدیل به بردار متناظر آن می کند
    
-    **Args:
+    
+
+
+# Create a FastAPI app instance
+# یک نمونه از برنامه FastAPI ایجاد می‌کنیم
+app = FastAPI()
+
+# Expose single_section_embedder over HTTP as POST /single_section_embedder/.
+# `sentence` is declared as a bare `str`, which FastAPI reads from the query string, not the body.
+@app.post("/single_section_embedder/")
+def single_section_embedder(sentence: str):
+    """
+    این نقطه پایانی، متن ورودی را تبدیل به بردار متناظر آن می کند.
+    
+    **Args:**
        sentence (str): متن یک سکشن
    
-    **Returns:
-        embeddings: لیست بردار متناظر با متن ورودی
+    **Returns:**
+        dict: {"embeddings": <model.encode(sentence) converted to a list>}
    """
+    # Use the loaded model to encode the sentence
+    # برای تبدیل متن به بردار از مدلی که در بالا لود شده استفاده می‌کنیم
    embeddings = model.encode(sentence)
-    return embeddings
+    
+    # model.encode returns a NumPy array; convert it to a list so it can be sent as JSON.
+    # NOTE(review): do_word_embedder still calls .tolist() on this return value - confirm it unwraps "embeddings".
+    # خروجی model.encode یک آرایه NumPy هست که باید به لیست تبدیل بشه
+    # تا به عنوان یک پاسخ JSON برگردانده بشه.
+    return {"embeddings": embeddings.tolist()}
+
+
+
+

 def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)  # ضرب داخلی دو بردار
@ -76,8 +73,8 @@ def cosine_similarity(vec1, vec2):
    return dot_product / (norm_vec1 * norm_vec2)

 if __name__ == '__main__':
-   
-    sections = get_sections()
-    
-    # محاسبه امبدینگ سکشن ها و ذخیره نتایج
-    do_word_embedder(sections)
+    # single_section_embedder returns {"embeddings": [...]}; unwrap the vector before comparing.
+    embd1 = single_section_embedder("۲ – درصد مشارکت و سهم پرداختی کارفرما نسبت به مأخذ کسر حق بیمه به صندوقهای فعال در سطح همگانی بیمههای اجتماعی و درمانی یکسان خواهد بود.")["embeddings"]
+    embd2 = single_section_embedder("۳ – درصد مشارکت و سهم پرداختی بیمهشده نسبت به مأخذ کسر حق بیمه به صندوقهای فعال در سطح همگانی بیمههای اجتماعی و درمانی یکسان خواهد بود.")["embeddings"]
+    similarity = cosine_similarity(embd1, embd2)
+    print(f'similarity: {similarity}')
--- a/p4_keyword_extractor.py
+++ b/p4_keyword_extractor.py
@ -201,9 +201,9 @@ def do_keyword_extract(sections):
    return operation_result, sections
     
 if __name__ == "__main__":
-
+    print(f'start: {datetime.datetime.now()}')
    sections = get_sections()
    
-    # استخراج کلیدواژه های مربوط به متن سکشن های قانونی و ذخیره سازی
-    do_keyword_extract(sections)
+    operation_result = do_keyword_extract(sections)
    
+    print(f'end: {datetime.datetime.now()}')
--- a/p5_representer.py
+++ b/p5_representer.py
@ -5,7 +5,6 @@ import datetime
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import json
-from elastic_helper import ElasticHelper

 date = datetime.datetime.now()
 today = f'{date.year}-{date.month}-{date.day}-{date.hour}'
@ -20,28 +19,6 @@ total = 0

 id = ''
        
-def get_sections():
-    """
-    دریافت کل سکشن های قانونی در مسیر مشخص شده
-    """
-    sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
-    eh_obj = ElasticHelper()
-    sections = eh_obj.iterateJsonFile(sections_path, True)
-    sections = convert_to_dict(sections)
-    return sections
-
-def convert_to_dict(sections):
-    """
-    تبدیل لیست سکشن های قانون به دیکشنری
-    """
-    sections_dict = {}
-    for item in sections:
-        id = item['id']
-        source = item['source']
-        sections_dict[id] = source
-        
-    return sections_dict
-    
 def single_section_representation(content):
    """
    این متد، یک متن قانونی را با جملات ساده تر بازنمایی می کند
@ -123,7 +100,4 @@ def do_representation(sections):
    return operation_result, sections

 if __name__ == "__main__":
-    sections = get_sections()
-    
-    # بازنمایی متن سکشن های قانون و ذخیره سازی
-    do_representation(sections)
+    pass
--- a/readme/readme-classifier-fa.md
+++ b/readme/readme-classifier-fa.md
@ -1,6 +1,6 @@
 # اسکریپت کلاسیفیکیشن سکشن‌های قانون

-این فایل شامل یک اسکریپت پایتون (`p1_classifier.py`) برای کلاسبندی بخش‌های متنی با استفاده از یک مدل ترنسفورمر آموزش‌دیده است. این اسکریپت برای پیشنهاد مرتبط‌ترین کلاس‌ها برای هر بخش از متن قانون طراحی شده و کاربرد دارد.
+این پروژه شامل یک اسکریپت پایتون (`p1_classifier.py`) برای کلاسبندی بخش‌های متنی با استفاده از یک مدل ترنسفورمر آموزش‌دیده است. این اسکریپت برای پیشنهاد مرتبط‌ترین کلاس‌ها برای هر بخش از متن طراحی شده و برای اسناد حقوقی، دسته‌بندی محتوا و وظایف مشابه در پردازش زبان طبیعی (NLP) کاربرد دارد.

 ## پیش‌نیازها

@ -16,15 +16,15 @@ pip install transformers pandas

 - اسکریپت یک مدل ترنسفورمر آموزش‌دیده را برای کلاسیفیکیشن متن بارگذاری می‌کند.
 - هر بخش از متن را پردازش می‌کند و در صورت طولانی بودن متن، آن را به پنجره‌هایی تقسیم می‌کند تا با اندازه ورودی مدل سازگار شود.
- برای هر بخش، بهترین کلاس‌ها را پیش‌بینی و نتایج را بر می گرداند.
+- برای هر بخش، بهترین کلاس‌ها را پیش‌بینی و نتایج را ذخیره می‌کند.

 ## توابع اصلی

 - `get_class(sentences, top_k=4)`: یک جمله یا متن را کلاسیفای می‌کند و برترین کلاس‌ها را برمی‌گرداند.
 - `mean_classes(input_classes)`: نتایج کلاس‌بندی چند پنجره از یک متن طولانی را تجمیع می‌کند.
- `get_window_classes(text)`: تقسیم متن‌های طولانی به پنجره و تجمیع نتایج کلاسیفیکیشن و برگرداندن آن
- `single_section_classification(id, section_source)`: یک متن قانونی را کلاسبندی کرده و بهترین و سایر کلاس‌های پیشنهادی را برمی‌گرداند.
- `do_classify(sections)`: همه بخش‌ها را کلاسیفای کرده و نتایج را در قالب لیستی از آبجکت متادیتاهای مربوط به یک سکشن از قانون بر می گرداند.
+- `get_window_classes(text)`: تقسیم متن‌های طولانی به پنجره و تجمیع نتایج کلاسیفیکیشن آن‌ها را مدیریت می‌کند.
+- `single_section_classification(id, section_source)`: یک بخش را کلاسبندی کرده و بهترین و سایر کلاس‌های پیشنهادی را برمی‌گرداند.
+- `do_classify(sections)`: همه بخش‌ها را کلاسیفای کرده و نتایج را در یک فایل JSON ذخیره می‌کند.

 ## مثال استفاده
Author	SHA1	Message	Date
mdorstkar	013c35ac96	.	2025-08-26 10:29:49 +03:30
mdorstkar	cced2920c1	Adding Fast API to def single_section_classification in P3	2025-08-26 10:23:00 +03:30
mdorstkar	d0d19b2049	Adding Fast API to P3	2025-08-25 19:54:35 +03:30