representation source

2025-08-11 19:56:29 +03:30 · 2025-08-11 19:56:29 +03:30 · 6b84ad4031
commit 6b84ad4031
parent afe4f0b87e
8 changed files with 343 additions and 109 deletions
--- a/do_nlp_processes.py
+++ b/do_nlp_processes.py
@ -4,7 +4,7 @@
 """
 from p1_classifier import do_classify
 from p2_ner_recognizer import do_ner_recognize
-# from p3_words_embedder import do_word_embedder
+from p3_words_embedder import do_word_embedder
 # from p4_keyword_extractor import do_keyword_extract
 # from p5_simplifier import do_simplify
@ -14,8 +14,18 @@ def get_sections():
    sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
    eh_obj = ElasticHelper()
    sections = eh_obj.iterateJsonFile(sections_path, True)
    sections = convert_to_dict(sections)
    return sections
 def convert_to_dict(sections):
    """
    Index section records by their identifier.
    **Args:
        sections (iterable): records that each provide an 'id' and a 'source' entry
    **Returns:
        sections_dict(dict): maps every record's 'id' to its 'source'; if an id repeats, the last record wins
    """
    return {record['id']: record['source'] for record in sections}
 def main():
    # get sections to do nlp processes
@ -36,7 +46,7 @@ def main():
    sections = do_ner_recognize(sections)
    # 3. word embedder
-    # sections = do_word_embedder(sections)
+    #sections = do_word_embedder(sections)
    # 4. keyword extract
    # result_kw = do_keyword_extract(sections)
--- a/p1_classifier.py
+++ b/p1_classifier.py
@ -107,80 +107,88 @@ def full_path_text_maker(full_path):
    full_path_text = full_path_text.strip()
    return full_path_text
 def single_section_classification(id, section_source):
    """
    Classify the text of one section.
    **Args:
        id (str): section identifier
        section_source (obj): source of one section; this function reads its 'content', 'qanon_title' and other_info -> 'full_path' entries
    **Returns:
        classification_result(obj): dict holding "best-class" and "other-classes" (both empty when nothing was classified)
        classification_status(bool): True when the operation succeeded, False when cleaning or classification raised
        desc(str): success message, or the text of the error that occurred
    """
    success_desc = 'Classification was successful'
    empty_result = {"best-class": {}, "other-classes": []}
    content = section_source['content']
    # Empty content: nothing to classify, report success with empty classes
    if content == '':
        return empty_result, True, success_desc
    section_path = section_source['other_info']['full_path']
    # Title, epilogue and signature sections need no classification either
    if section_path in ('عنوان', 'موخره', 'امضاء'):
        return empty_result, True, success_desc
    qanon_title = section_source['qanon_title']
    # full_path is the chain from this section up to the law, e.g. ماده5>تبصره یک>بند الف;
    # it is rebuilt as text going from part to whole
    full_path_text = full_path_text_maker(section_path.split(">"))
    # Some section texts are not meaningful on their own, so the section path and the
    # law title are put in front of the content; this also lets the model take the
    # law title into account when classifying
    cleaned_title = cleaning(qanon_title)
    pre_content = f"محتوای {full_path_text} {cleaned_title} متن زیر است."
    try:
        content = cleaning(content)
    except Exception as error:
        # Ids of sections whose content failed cleaning are appended to this log
        with open('./data/cleaning_content_log.txt', 'a', encoding='utf-8') as log_file:
            log_file.write(id + " >> " + str(error) + "\n")
            print('cleaning content error!')
            return empty_result, False, str(error)
    try:
        # Ask the classifier for the classes of this section
        section_classes = get_window_classes(f"{pre_content} {content}")
    except Exception as error:
        error_content = f"{id} -- Classification Error Content:{error}\n"
        with open('./data/classification/errors.txt', 'a', encoding='utf-8') as log_file:
            log_file.write(error_content)
        print(error_content)
        return empty_result, False, error_content
    # First entry is reported as the best class, the remainder as alternatives
    ranked_result = {
        "best-class": section_classes[0],
        "other-classes": section_classes[1:],
    }
    return ranked_result, True, success_desc
 def do_classify(sections):
    print(f'start classification: {datetime.datetime.now()}')
    test_counter = 1
    # all = تعداد سکشن های فایل قبلی 282671
-    all = 285839 
+    all = len(sections) 
    # لیستی جهت ذخیره عناوین قانون ها
    qanon_title_list = []
    # دیکشنری برای ذخیره نتایج که شامل شناسه سکشن ها و 4 کلاس به ترتیب اولویت است
    new_sections_dict = {}
-    for index, item in enumerate(sections):
+    for index, id in enumerate(sections):
-        id = item
+        source = sections[id]['source']
        classification_result, classification_status, desc = single_section_classification(id, source)
-        source = sections[id]
+        if not classification_status:
-        # اگر نوع سکشن، عنوان یا موخره یا امضاء باشد، نیازی به فرایند کلاسبندی نیست و لیست کلاس های مربوط به این سکشن را خالی قرار می دهیم
+            print(f'id: {id} classification error. error description: {desc}')
        if source['other_info']['full_path'] == 'عنوان' or source['other_info']['full_path'] == 'موخره' or source['other_info']['full_path'] == 'امضاء':
            new_sections_dict[id] ={
            "best-class":{},
            "other-classes": []}
            print(f'section: {all}/{index+1}/{id}', flush=True)
            continue
        content = source['content']
        qanon_title = source['qanon_title']
        # این متغیر ریشه سکشن اخیر تا رسیدن به قانون را مشخص می کند
        # مثلا: ماده5>تبصره یک>بند الف
        full_path = source['other_info']['full_path'].split(">")
        # بازسازی متن مسیر سکشن از جزء به کل
        full_path_text = full_path_text_maker(full_path)
        """
        به دلیل اینکه متن برخی از سکشن ها به تنهایی معنادار نیستند، مسیر جزء به کل ماده و عنوان قانون را به اول همه سکشن ها اضافه می کنیم تا هم نقیصه معنادار نبودن موارد این چنینی را برطرف کنیم و هم برای کلاس بندی، انتخاب مدل را بر اساس عنوان قانون جهت دهی کنیم. به این صورت، مدل، عنوان قانون را هم در کلاس بندی دخیل خواهد کرد
        """
        pre_content = f"محتوای {full_path_text} {cleaning(qanon_title)} متن زیر است."
        try:
            content = cleaning(content)
        except Exception as e:
            # شناسه ماده هایی که محتوای آنها خالی است در این مسیر ذخیره می شوند
            with open('./data/empty_content_log.txt', 'a', encoding='utf-8') as output_file:
                output_file.write(id + " >> " + str(e) + "\n")
                continue
        try:
            # دریافت کلاس های مربوط به یک سکشن
            section_classes = get_window_classes(f"{pre_content} {content}")
            #region collect data for evaluation
            """ این قسمت تا 7 خط بعدی، صرفا جهت جمع آوری و ذخیره تعدادی از سکشن ها و کلاس های پیش بینی شده برای آنها، به منظور ارزیابی عملکرد مدل توسط کاربر انسانی است و دخیل در فرایند کلاسبندی نیست"""
            if (len(tokenizer(f"{pre_content} {content}")['input_ids'][1:-1]) < 1500) and not qanon_title in qanon_title_list:
                with open('./data/classification/test_log_60e_hoosh_fp3.txt', 'a', encoding='utf-8') as output_file:
                        message = f"\n{test_counter}\n{id} : {pre_content} {content}\nclasses:\n"
                        for cls in section_classes:
                            message += f"{cls['label']} >> {cls['score']}\n"
                        output_file.write(message + "\n")
                        test_counter+=1
            #endregion  
        except Exception as e:
            error = e
            with open('./data/classification/errors.txt', 'a', encoding='utf-8') as output_file:
                output_file.write(f"{id} -- Error Content:{error}\n")
            continue
        # item['classes'] = section_classes
        # ساماندهی کلاس های پیش بینی شده در عنوان بهترین کلاس و دیگر کلاسها بر اساس امتیاز تخمین مدل و ذخیره در دیکشنری
-        new_sections_dict[id] ={
+        new_sections_dict[id] = classification_result
            "content" : content,
            "best-class" : section_classes[0],
            "other-classes" : section_classes[1:]
        }
        """ برای حالت تست که می خواهیم عملکرد مدل کلاسیفایر را ارزیابی کنیم، بدین جهت که تنوعی از قوانین مختلف را بررسی کنیم، عنوان قوانین را ذخیره می کنیم تا از تکرار بررسی سکشن های متعدد از یک قانون پرهیز شود"""
-        qanon_title_list.append(qanon_title)
+        # qanon_title = source['qanon_title']
        # qanon_title_list.append(qanon_title)
        print(f'section: {all}/{index+1}/{id}', flush=True)
    # ذخیره دیکشنری شناسه های قانون و کلاس های تخمین زده شده در فایل جیسون
    with open('./data/classification/all_sections_classes_new_140405.json', 'w', encoding='utf-8') as output_file:
--- a/p2_ner_recognizer.py
+++ b/p2_ner_recognizer.py
@ -15,6 +15,7 @@ model = "./models/ner/2025-07-22--20-44-37--HooshvareLab--bert-fa-base-uncased-n
 tagger = SequenceTagger.load(model)
 print('model read and tagger initialized')
 # Hour-resolution timestamp used to stamp output and error file names.
 # It must come from a datetime instance: datetime.datetime.year (and .month/.day/.hour)
 # on the class itself is an attribute descriptor, not a number.
 today = datetime.datetime.now().strftime('%Y-%m-%d-%H')
 def prepare_data(ner_obj_list):
    ner_data_list = []
@ -97,7 +98,17 @@ def find_ner_values_in_text(text, ner_values):
    return ner_obj
-def inference_main(input_sentence):
+def single_ner_recognizer(input_sentence):
    """
    این متد، موجودیت های نامدار را از متن ورودی استخراج می کند
    **Args:
        input_sentence (str): متن سکشن
    **Returns:
        ner_obj_list, (obj): آبجکتی از موجودیت های نامدار تشخیص داده شده
        proccess_result(obj): بخش اول شامل یک کلید از نوع بولین است که موفقیت عملیات را بیان می کند. بخش دوم نیز توضیحی در مورد وضعیت نتیجه عملیات را در بر دارد
    """
    try:
        proccess_result = True, ''
@ -128,10 +139,10 @@ def inference_main(input_sentence):
        ner_obj_list = find_ner_values_in_text(input_sentence, ner_values)
    except Exception as error:
-        proccess_result = False , error.args[0]
+        proccess_result = False , str(error.args[0])
        ner_obj_list = []
-    return ner_obj_list, input_sentence, proccess_result 
+    return ner_obj_list, proccess_result 
 # تابع بازگشتی برای تقسیم متن به تکه های کوچکتر از 512 کاراکتر
 def split_sentence(input_sentence):
@ -180,6 +191,27 @@ def split_sentence(input_sentence):
    return parts
 def do_ner_recognize(sections):
    """
    Run named-entity recognition on every section and attach the result to it.
    **Args:
        sections (dict): keyed by section id; every value must provide a 'content' entry
    **Returns:
        sections(dict): the same object; sections recognized successfully gain a 'ners_v2' entry, failed ones are logged and left without it
    """
    total = len(sections)
    for position, section_id in enumerate(sections, start=1):
        section = sections[section_id]
        text = section['content']
        # Only non-empty content goes through cleaning
        if text != '':
            text = cleaning(text)
        ner_obj_list, outcome = single_ner_recognizer(text)
        if outcome[0]:
            section['ners_v2'] = prepare_data(ner_obj_list)
            print(f'ner process: {position}/{total}')
        else:
            # Failed sections are reported, appended to the error log and skipped
            print(f'ner recognization error. id: {section_id}')
            error_content = f'id: {section_id} - error: {outcome[1]}\n'
            with open(f'./data/ner/ner_errors_{today}.txt', 'a+', encoding='utf-8') as error_log:
                error_log.write(error_content)
    print('len_sections ner recognization finished!')
    return sections
 def get_sections():
    sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
    eh_obj = ElasticHelper()
@ -187,20 +219,6 @@ def get_sections():
    sections = convert_to_dict(sections)
    return sections
 def do_ner_recognize(sections):
    """
    Run named-entity recognition on every section and store the result under 'ners_v2'.
    NOTE(review): this body calls `inference_main` and unpacks three return values, while
    the same patch renames that function to `single_ner_recognizer` with two return values;
    presumably this is the removed (pre-change) version of the function - confirm.
    **Args:
        sections (dict): keyed by section id; every value must provide a 'content' entry
    **Returns:
        sections(dict): the same object, with 'ners_v2' set on every entry
    """
    len_sections = len(sections)
    for index, item in enumerate(sections):
        content = sections[item]['content']
        # Only non-empty content goes through cleaning
        if not content == '':
            content = cleaning(content)
        # content_ai and ner_result are not used afterwards, so a failed recognition is not detected here
        ner_obj_list, content_ai, ner_result = inference_main(content)
        ner_data_list = prepare_data(ner_obj_list)
        sections[item]['ners_v2'] = ner_data_list
        print(f'ner process: {index+1}/{len_sections}')
    # The f-string has no placeholder, so the literal text 'len_sections' is printed
    print(f'len_sections ner recognization finished!')
    return sections
 def convert_to_dict(sections):
    sections_dict = {}
    for item in sections:
@ -225,7 +243,7 @@ if __name__ == '__main__':
    sections = do_ner_recognize(sections)
-    with open('./data/ner/sections_ner.json', 'w', encoding='utf-8') as output:
+    with open(f'./data/ner/sections_ner_{today}.json', 'w', encoding='utf-8') as output:
        data = json.dumps(sections, ensure_ascii=False, indent=4)
        output.write(data)
--- a/p3_words_embedder.py
+++ b/p3_words_embedder.py
@ -3,28 +3,52 @@
 """
 from sentence_transformers import SentenceTransformer
 import json
 import datetime
 import numpy as np
-def load_embedder_model(model_name):
+# Hour-resolution timestamp used to stamp the embeddings output file name.
+# It must come from a datetime instance: datetime.datetime.year (and .month/.day/.hour)
+# on the class itself is an attribute descriptor, not a number.
+today = datetime.datetime.now().strftime('%Y-%m-%d-%H')
-    model = SentenceTransformer(model_name)
+
-    return model
+model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25
 # model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'#87-30
 # model_name = 'sharif-dal/dal-bert'#90-41
 # model_name = 'lifeweb-ai/shiraz'#97-67
 # model_name = 'BAAI/bge-m3'#90-35
 # model_name = 'jinaai/jina-embeddings-v3'#??
 # model_name = 'jinaai/jina-embeddings-v4'#??
 # model_name = 'HooshvareLab/bert-base-parsbert-uncased'#90-54
 model = SentenceTransformer(model_name)
 def do_word_embedder(sections):
    model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
    model = load_embedder_model(model_name)
    for index, item in enumerate(sections):
-        embeddings = get_sentence_embeddings(sections[item]['content'])
+        embeddings = single_section_embedder(sections[item]['content'])
        sections[item]['embeddings'] = embeddings
-    with open('./data/embeddings/sections_embeddings.json', 'w', encoding='utf-8') as outpu_file:
+    with open(f'./data/embeddings/sections_embeddings_{today}.json', 'w', encoding='utf-8') as output_file:
        data = json.dumps(sections, ensure_ascii=False)
-        outpu_file.write(data)
+        output_file.write(data)
-def get_sentence_embeddings(sentence):
+def single_section_embedder(sentence):
-    model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
+    """
-    model = load_embedder_model(model_name)
+    این متد، متن ورودی را تبدیل به بردار متناظر آن می کند
    **Args:
        sentence (str): متن یک سکشن
    **Returns:
        embeddings: لیست بردار متناظر با متن ورودی
    """
    embeddings = model.encode(sentence)
    return embeddings
-if __name__ == '__main__':
+def cosine_similarity(vec1, vec2):
-    pass
+    dot_product = np.dot(vec1, vec2)  # ضرب داخلی دو بردار
    norm_vec1 = np.linalg.norm(vec1) # نُرم بردار اول
    norm_vec2 = np.linalg.norm(vec2) # نُرم بردار دوم
    return dot_product / (norm_vec1 * norm_vec2)
 if __name__ == '__main__':
    embd1 = single_section_embedder("۲ – درصد مشارکت و سهم پرداختی کارفرما نسبت به مأخذ کسر حق بیمه به صندوقهای فعال در سطح همگانی بیمههای اجتماعی و درمانی یکسان خواهد بود.")
    embd2 = single_section_embedder("۳ – درصد مشارکت و سهم پرداختی بیمهشده نسبت به مأخذ کسر حق بیمه به صندوقهای فعال در سطح همگانی بیمههای اجتماعی و درمانی یکسان خواهد بود.")
    # embd2 = get_sentence_embeddings("تو کم گذاشتی وگرنه شکست نمی خوردی")
    similarity = cosine_similarity(embd1, embd2)
    print(f'similarity: {similarity}')
--- a/p4_keyword_extractor.py
+++ b/p4_keyword_extractor.py
@ -15,6 +15,7 @@ os.environ['HF_HOME'] = "/home/admin/HFHOME"
 model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 #model_id = "meta-llama/Llama-3.1-70B-Instruct"
 # Hour-resolution timestamp used to stamp keyword output and error file names.
 # It must come from a datetime instance: datetime.datetime.year (and .month/.day/.hour)
 # on the class itself is an attribute descriptor, not a number.
 today = datetime.datetime.now().strftime('%Y-%m-%d-%H')
 bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
@ -61,7 +62,7 @@ def generate(formatted_prompt):
  USER_PROMPT = f"""از متن ارائه شده، حداقل {keywords_count} عبارت کلیدی مهم و معنادار را استخراج کنید. خروجی باید به صورت زیر باشد:
 •	یک لیست فارسی
 •	شماره ترتیبی در ابتدای هر عبارت 
-•	هر عبارت در یک خط جداگانه *
+•	هر عبارت در یک خط جداگانه
 •	بدون هیچ توضیح اضافی در ابتدا یا انتهای پاسخ
 موارد زیر در استخراج عبارات کلیدی الزامی است:
 •	بسیار مهم و حیاتی است که عبارات کلیدی باید دقیقاً در متن موجود باشند، بنابراین هرگز از خلاقیت برای ساختن کلیدواژه‌ای که دقیقا در متن وجود ندارد، استفاده نکن. 
@ -100,12 +101,21 @@ def generate(formatted_prompt):
  return tokenizer.decode(response, skip_special_tokens=True)
-def get_rules(sentence):
+def single_section_get_keyword(sentence):
-    formatted_prompt = format_prompt(sentence)
+    """
-    rules = generate(formatted_prompt).split('\n')
+    این متد، کلیدواژه های قانونی موجود در متن ورودی را استخراج می کند
    result =  [r.strip() for r in rules if r.strip()]
    return result
    **Args:
        sentence(str): متن یک سکشن قانونی
    **Returns:
        kws(list): لیستی از کلیدواژه های تشخیص داده شده
    """
    formatted_prompt = format_prompt(sentence)
    keywords = generate(formatted_prompt).split('\n')
    kws =  [kw.strip() for kw in keywords if kw.strip()]
    # حذف کلیدواژه های تکراری
    kws = list(set(kws))
    return kws
 def get_sections():
    sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
@ -138,17 +148,21 @@ def do_keyword_extract(sections):
        # if item['id'] in prev_ids:
        #     continue
-        if counter > 10:
+        # # برای تست خروجی
-            with open('./data/keyword/prev_kw_ids.txt', 'a+', encoding='utf-8') as file:
+        # if counter > 10:
-                file.write(period_ids_text)
+        #     with open('./data/keyword/prev_kw_ids.txt', 'a+', encoding='utf-8') as file:
-            break
+        #         file.write(period_ids_text)
        #     break
        id = item
        content = sections[id]['content']
        try:
-            sections[id]['keywords'] = get_rules(content)
+            sections[id]['keywords'] = single_section_get_keyword(content)
-        except:
+        except Exception as error:
            print(f"section kw error : {id}\n")
            error_content = f'id: {id} - error: {str(error)}\n'
            with open(f'./data/keyword/keyword_errors_{today}.txt', 'a+', encoding='utf-8') as file:
                file.write(error_content)
            counter += 1
            continue
@ -157,7 +171,7 @@ def do_keyword_extract(sections):
        print(f"section: {counter}-id: {id}")
        # temp_dict.append(item)
        if counter % 1000 == 0:
-            outputfile = open(f'./data/keyword/sections_kw_llama8b_{str(file_counter)}.json', "a+", encoding='utf-8')
+            outputfile = open(f'./data/keyword/sections_kw_llama8b_{str(file_counter)}_{today}.json', "a+", encoding='utf-8')
            outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=2))
            outputfile.close()
            print(f"file {str(file_counter)} created in {str(datetime.datetime.now())} +++++++++++++++++++++++++\n ")
@ -165,13 +179,13 @@ def do_keyword_extract(sections):
            file_counter += 1
            period_sections = []
            # save proccessed sections id for next executions of this code
-            with open('./data/keyword/prev_kw_ids.txt', 'a+', encoding='utf-8') as file:
+            with open(f'./data/keyword/prev_kw_ids_{today}.txt', 'a+', encoding='utf-8') as file:
                file.write(period_ids_text)
        counter += 1
        period_ids_text = f'***** file number: {file_counter} *****\n'
-    outputfile = open(f'./data/keyword/sections_kw_llama8b_{str(file_counter)}.json', "w", encoding='utf-8')
+    outputfile = open(f'./data/keyword/sections_kw_llama8b_{str(file_counter)}_{today}.json', "w", encoding='utf-8')
    outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent = 4))
    outputfile.close()
    print(f"file {str(file_counter)} created in {str(datetime.datetime.now())} +++++++++++++++++++++++++ ")
--- a/p5_simplifier.py
+++ b/p5_simplifier.py
@ -1,4 +1,85 @@
 """
 ساده سازی (بازنمایی) جملات قانونی به تعدادی جمله ساده تر و روان تر
 """
 import datetime
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import json
 # Hour-resolution timestamp used to stamp the representation error log name.
 # It must come from a datetime instance: datetime.datetime.year (and .month/.day/.hour)
 # on the class itself is an attribute descriptor, not a number.
 today = datetime.datetime.now().strftime('%Y-%m-%d-%H')
 if torch.cuda.is_available():
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
 counter = 0
 total = 0
 remained = 0
 id = ''
 keywords_count = 15
 def single_section_representation(text):
    global remained
    try:
        sen_count = (len(text) / 1000) * 15
        sen_count = int(sen_count)
        if sen_count == 0: 
            sen_count = 1
        messages =  [{"role": "system", "content": "تو یک وکیل حقوق دان هستی و باید بتوانی متن های قانونی و حقوقی را بدون تغییر اصطلاحات فنی، به صورتی توضیح دهی که افراد غیر حقوق دان، معنای متن را درک کنند. " },
                     {"role": "user", "content": 
                    f"متن زیر را در قالب {sen_count} جمله جداگانه، ساده و روان به زبان فارسی، برای کسی که حقوق دان نیست، بازنویسی کن و بین دو * قرار بده و هیچ گونه توضیحی در ابتدا یا انتهای پاسخ، اضافه نکن. جملاتی که تولید می کنی، از نظر معنایی تکراری نباشند و از مجموع جملات بتوان منظور و معنای دقیق متن داده شده را فهم کرد. در پایان هر جمله، علامت نقطه قرار بده و به هیچ وجه جمله آخر را به صورت ناقص رها نکن.\n متن:{text}"
        }]
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)
        terminators = [
                    tokenizer.eos_token_id,
                    tokenizer.convert_tokens_to_ids("<|eot_id|>")
                ]
        model.generation_config.pad_token_id = tokenizer.pad_token_id
-def do_simplify(sections):
+        outputs = model.generate(
            input_ids,
            max_new_tokens=500,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.7,
            top_p=0.85,
        )
        response = outputs[0][input_ids.shape[-1]:]
        sentences = tokenizer.decode(response, skip_special_tokens=True)
        result = True
        desc = 'operation successful'
        return result, desc, sentences
    except Exception as error:
        result = False
        desc = str(error)
        return result, desc, []
 def do_representation(sections):
    """
    Rewrite the content of every section as simpler sentences and attach them to it.
    **Args:
        sections (dict): keyed by section id; every value must provide a 'content' entry
    **Returns:
        None. `sections` is updated in place: each entry gains 'represented_sentences',
        holding whatever single_section_representation returned for it (also when it failed).
    """
    print(f"start time:   {datetime.datetime.now()}")
    # Iterating a dict yields its keys only, so each key is taken as a single name;
    # unpacking it into (index, id) fails for any id that is not exactly two characters long
    for section_id in sections:
        result, desc, sentences = single_section_representation(sections[section_id]['content'])
        if not result:
            # Failures are appended to the error log; processing continues
            error_content = f'id: {section_id} - error: {desc}\n'
            with open(f'./data/represent/represent_errors_{today}.txt', 'a+', encoding='utf-8') as file:
                file.write(error_content)
        sections[section_id]['represented_sentences'] = sentences
    print(f"end time:   {datetime.datetime.now()}")
    print(" *** finished! *** ")
 if __name__ == "__main__":
    pass
--- a/test.py
+++ b/test.py
@ -0,0 +1,18 @@
 # Ad-hoc script: embeds a few sample sentences with the section embedder.
 # p3_words_embedder exposes the embedder as single_section_embedder
 # (the former get_sentence_embeddings name no longer exists there).
 from p3_words_embedder import single_section_embedder
 sentences = [
    'جمله یک',
    'جمله 2',
    'جمله 3',
    'جمله 4',
    'جمله 5',
    'جمله 6',
    'جمله 7',
 ]
 # Maps each sentence to its embedding vector
 emds = {}
 for item in sentences:
    edm = single_section_embedder(item)
    emds[item] = edm
 print()
--- a/test/test__p3_words_embedder.py
+++ b/test/test__p3_words_embedder.py
@ -0,0 +1,61 @@
 import os
 import sys
 # Make the project root importable when this file is run from the test/ directory
 parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
 sys.path.append(parent_dir)
 # single_section_embedder is the current name of the former get_sentence_embeddings;
 # today is the timestamp do_word_embedder puts into its output file name
 from p3_words_embedder import do_word_embedder, single_section_embedder, cosine_similarity, today
 import unittest
 import numpy as np
 import json
 from sentence_transformers import SentenceTransformer
 class TestSentenceEmbedder(unittest.TestCase):
    """Tests for the embedding helpers of p3_words_embedder."""
    @classmethod
    def setUpClass(cls):
        # Load the model once so it can be shared by all tests
        cls.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    def test_get_sentence_embeddings(self):
        # Check the output of single_section_embedder for one sentence
        sentence = "This is a test sentence."
        embeddings = single_section_embedder(sentence)
        self.assertEqual(len(embeddings.shape), 1, "Embedding should be a 1D vector.")
        self.assertGreater(embeddings.shape[0], 0, "Embedding vector should not be empty.")
    def test_cosine_similarity(self):
        # Check cosine_similarity on orthogonal and 45-degree vectors
        vec1 = np.array([1, 0, 0])
        vec2 = np.array([0, 1, 0])
        vec3 = np.array([1, 1, 0])
        sim1 = cosine_similarity(vec1, vec2)
        sim2 = cosine_similarity(vec1, vec3)
        self.assertAlmostEqual(sim1, 0.0, places=5, msg="Cosine similarity between orthogonal vectors should be 0.")
        self.assertAlmostEqual(sim2, 0.7071, places=4, msg="Cosine similarity between vec1 and vec3 should be approximately 0.7071.")
    def test_do_word_embedder(self):
        # Check do_word_embedder through the file it writes
        sections = {
            "sec1": {"content": "This is the first section."},
            "sec2": {"content": "This is the second section."}
        }
        do_word_embedder(sections)
        # do_word_embedder writes to a path stamped with the module-level `today` value
        output_path = f'./data/embeddings/sections_embeddings_{today}.json'
        with open(output_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for key in sections.keys():
            self.assertIn('embeddings', data[key], f"Section {key} should contain 'embeddings'.")
            self.assertEqual(len(data[key]['embeddings']), 384, f"Embedding size for section {key} should be 384.")
    def test_cosine_similarity_edge_cases(self):
        # Edge case: a zero vector has norm 0, so the division is 0/0
        vec1 = np.array([0, 0, 0])
        vec2 = np.array([1, 1, 1])
        # NumPy does not raise ZeroDivisionError here; it yields NaN (with a RuntimeWarning,
        # silenced by errstate)
        with np.errstate(divide='ignore', invalid='ignore'):
            similarity = cosine_similarity(vec1, vec2)
        self.assertTrue(np.isnan(similarity), "Cosine similarity with a zero vector should be NaN.")
 if __name__ == '__main__':
    unittest.main()