# llama/llama3_represent_110_sections_test.py

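""" This script receives a list of legal sections as input (the original note
says a JSON file; the code below loads them from the Excel sheet
data/sections_110.xlsx).
The legal sections comprise 110 items from 43 different categories, carefully
hand-picked by the user.
The output of this routine is a representation of each section as a number of
simpler and more fluent sentences.
"""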
from html import escape
from elasticsearch import Elasticsearch
from lxml import etree
from datetime import datetime
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
from threading import Thread
import torch
import time
from concurrent.futures import ThreadPoolExecutor
import concurrent
import threading
import json
import os.path

from funcs import write_to_json, read_from_excel
import os
from normalizer import cleaning

# The model and tokenizer are only loaded when a CUDA device is available;
# without one, the names `model` and `tokenizer` stay undefined.
if torch.cuda.is_available():
    model_id = "PartAI/Dorna-Llama3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

# Module-level bookkeeping. `counter`, `total` and `id` are read by the error
# report in the `__main__` section of this file.
counter = total = remained = 0
id = ''
keywords_count = 15

def generateKeywords(text):
    """Rewrite a legal text as simple, fluent Persian sentences with the LLM.

    The number of sentences requested from the model scales with the input
    length: 15 sentences per 1000 characters, with a minimum of 1. The prompt
    instructs the model to place its output between two ``*`` marks and to
    add no extra explanation.

    The function uses the module-level ``model`` and ``tokenizer`` objects.

    Args:
        text: The legal text to simplify.

    Returns:
        The decoded model response (special tokens stripped) as a string, or
        ``None`` when anything fails. Errors are printed, never raised.
    """
    try:
        # 15 sentences per 1000 characters, but always ask for at least one.
        sen_count = max(1, int((len(text) / 1000) * 15))

        messages =  [{"role": "system", "content": "تو یک وکیل حقوق دان هستی و باید بتوانی متن های قانونی و حقوقی را بدون تغییر اصطلاحات فنی، به صورتی توضیح دهی که افراد غیر حقوق دان، معنای متن را درک کنند. " },
                     {"role": "user", "content":
                    f"متن زیر را در قالب {sen_count} جمله جداگانه، ساده و روان به زبان فارسی، برای کسی که حقوق دان نیست، بازنویسی کن و بین دو * قرار بده و هیچ گونه توضیحی در ابتدا یا انتهای پاسخ، اضافه نکن. جملاتی که تولید می کنی، از نظر معنایی تکراری نباشند و از مجموع جملات بتوان منظور و معنای دقیق متن داده شده را فهم کرد. در پایان هر جمله، علامت نقطه قرار بده و به هیچ وجه جمله آخر را به صورت ناقص رها نکن.\n متن:{text}"
        }]

        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # Generation stops at either the regular EOS token or Llama-3's
        # end-of-turn marker.
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ]

        # The tokenizer may not define a pad token; fall back to EOS instead
        # of overwriting the generation config with None.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id
        model.generation_config.pad_token_id = pad_token_id

        outputs = model.generate(
            input_ids,
            max_new_tokens=500,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.7,
            top_p=0.85,
        )

        # Drop the prompt tokens so only the newly generated part is decoded.
        response = outputs[0][input_ids.shape[-1]:]
        keywords = tokenizer.decode(response, skip_special_tokens=True)
        return keywords

    except Exception as inst:
        print(type(inst))    # the exception type
        print(inst.args)     # arguments stored in .args
        print("Exception: " + str(inst))
        # Failure is signalled to the caller with an explicit None.
        return None


if __name__ == "__main__":
    # Script entry point: read the legal sections from the Excel sheet,
    # simplify each one with generateKeywords() and store all results in a
    # single JSON file.

    base_address = os.getcwd()
    # base_address = "/home/gpu/tnlp/jokar/llama" # terminal

    sections_110_address = base_address + "/data/sections_110.xlsx"

    datalist_110 = read_from_excel(sections_110_address)
    start_time = time.time()

    result_list = []
    try:
        # Keep the progress counters up to date so the error report below can
        # show a real ratio instead of dividing by a zero total.
        total = len(datalist_110)
        counter = 0
        for i in range(total):
            id = datalist_110["id"][i]
            qanon_id = datalist_110["qanon_id"][i]
            content = datalist_110["content"][i]
            main_topic = datalist_110["main_topic"][i]
            print(i+1)

            content = cleaning(content)
            result = generateKeywords(content)
            print("++++++++++++++++++++++++++++++++++++++++++++++++++++")
            print(result)
            print("++++++++++++++++++++++++++++++++++++++++++++++++++++")

            if result is None:
                # generateKeywords() prints its own error and returns None;
                # record the section with an empty result rather than
                # aborting the whole run on `None.split`.
                print("no output generated for section %s; storing an empty result" % id)
                result_parts = []
            else:
                result_parts = result.split("\n*")

            result_list.append({
                "id": id,
                "qanon_id": qanon_id,
                "topic": main_topic,
                "content": content,
                "result": result_parts
            })
            counter = i + 1
            print()

        destination_address = base_address + "/data/simplized_sentences_110_2.json"
        write_to_json(result_list, destination_address)

    except Exception as inst:
        print(type(inst))    # the exception type
        print(inst.args)     # arguments stored in .args
        print(inst)          # __str__ allows args to be printed directly,
                             # but may be overridden in exception subclasses
        # Guard the ratio: `total` is still 0 if the failure happened before
        # the number of sections could be determined.
        progress = counter / total if total else 0.0
        print("Exception:=> %s -> %.2f " % (id, progress))

    end_time = time.time()
    print(f"elapsed time:   {end_time-start_time}")
    print(" *** finished! *** ")