# data_processes/p5_simplifier.py

"""
Simplify (re-express) legal sentences as a number of simpler, more fluent sentences.
"""
import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json

# Hour-resolution run stamp used in the output and error file names.
# A single now() call is used so year/month/day/hour always come from the same
# instant (four separate calls could disagree across an hour or day rollover).
today = '{0.year}-{0.month}-{0.day}-{0.hour}'.format(datetime.datetime.now())

# The model and tokenizer are only loaded when CUDA is available; otherwise the
# names `model` and `tokenizer` are left undefined at module level.
if torch.cuda.is_available():
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

# Module-level counters; not referenced in the visible part of this file.
counter = 0
total = 0

# NOTE(review): `id` shadows the builtin of the same name for this module. It is
# kept as-is because code outside this file may read it; confirm before removing.
id = ''
keywords_count = 15

def _split_sentences(text):
    """Split raw model output into unique, non-empty sentences, keeping order."""
    # The prompt asks for the sentences to be wrapped in '*'. Line breaks are
    # treated as separators as well, since the model's layout is not guaranteed.
    pieces = (piece.strip() for piece in text.replace('\n', '*').split('*'))
    # dict.fromkeys removes duplicates while preserving first-seen order.
    return list(dict.fromkeys(piece for piece in pieces if piece))


def single_section_representation(content):
    """
    Re-express one legal text as a set of simpler sentences using the chat model.

    Relies on the module-level `model` and `tokenizer`; if they are not defined
    the resulting error is caught and reported through the return value.

    Args:
        content (str): Text of one legal section.

    Returns:
        tuple: ``(result, desc, sentences)`` where
            result (bool): True on success, False if any exception occurred.
            desc (str): 'Operation successful' or the exception message.
            sentences (list): Unique sentences parsed from the model output,
                in the order they were generated; empty on failure.
    """

    try:
        # Request about 15 sentences per 1000 characters, and at least one.
        sen_count = max(1, int((len(content) / 1000) * 15))

        messages =  [{"role": "system", "content": "تو یک وکیل حقوق دان هستی و باید بتوانی متن های قانونی و حقوقی را بدون تغییر اصطلاحات فنی، به صورتی توضیح دهی که افراد غیر حقوق دان، معنای متن را درک کنند. " },
                     {"role": "user", "content":
                    f"متن زیر را در قالب {sen_count} جمله جداگانه، ساده و روان به زبان فارسی، برای کسی که حقوق دان نیست، بازنویسی کن و بین دو * قرار بده و هیچ گونه توضیحی در ابتدا یا انتهای پاسخ، اضافه نکن. جملاتی که تولید می کنی، از نظر معنایی تکراری نباشند و از مجموع جملات بتوان منظور و معنای دقیق متن داده شده را فهم کرد. در پایان هر جمله، علامت نقطه قرار بده و به هیچ وجه جمله آخر را به صورت ناقص رها نکن.\n متن:{content}"
        }]

        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # Stop on either the regular EOS token or the chat end-of-turn token.
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]

        # Fall back to EOS when the tokenizer defines no pad token, instead of
        # writing None into the generation config.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id
        model.generation_config.pad_token_id = pad_token_id

        outputs = model.generate(
            input_ids,
            max_new_tokens=2048,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.1,
            top_p=0.85,
        )

        # Keep only the newly generated tokens (drop the echoed prompt).
        response = outputs[0][input_ids.shape[-1]:]
        generated_text = tokenizer.decode(response, skip_special_tokens=True)

        # decode() returns a single string; it must be split into sentences
        # before de-duplicating (set() on the string would yield characters).
        sentences = _split_sentences(generated_text)

        return True, 'Operation successful', sentences

    except Exception as error:
        # Best-effort contract: report the failure to the caller, never raise.
        return False, str(error), []

def do_representation(sections):
    """
    Generate simplified sentences for every section and save the result as JSON.

    For each key of `sections`, the section's 'content' is passed to
    `single_section_representation` and the returned list is stored under
    'represented_sentences' (an empty list when that call reports failure).
    Failures are appended to a per-run error file; the full `sections` object
    is written to a per-run JSON file at the end.

    Args:
        sections (dict): Mapping of section id to a dict with a 'content'
            entry. It is modified in place.

    Returns:
        tuple: ``(operation_result, sections)``; operation_result is always True.
    """
    import os  # local import: only needed here to prepare the output folder

    print(f"start time:   {datetime.datetime.now()}")

    # Create the output folder up front so a missing directory cannot discard
    # the results at the final write, after all sections have been processed.
    output_dir = './data/represent'
    os.makedirs(output_dir, exist_ok=True)

    section_count = len(sections)
    for index, section_id in enumerate(sections):
        result, desc, sentences = single_section_representation(sections[section_id]['content'])
        if not result:
            error_content = f'id: {section_id} - error: {desc}\n'
            with open(f'{output_dir}/represent_errors_{today}.txt', 'a', encoding='utf-8') as file:
                file.write(error_content)

        sections[section_id]['represented_sentences'] = sentences
        print(f'representation process. section {index+1}/{section_count} - id: {section_id}')

    with open(f'{output_dir}/sections_represent_llama8b_{today}.json', "w", encoding='utf-8') as outputfile:
        json.dump(sections, outputfile, ensure_ascii=False, indent=4)

    print(f"end time:   {datetime.datetime.now()}")
    print(" *** finished! *** ")

    operation_result = True
    return operation_result, sections

# Running this file as a script does nothing beyond the module-level setup
# above; the functions are meant to be called by other code.
if __name__ == "__main__":
    pass