from html import escape
from lxml import etree
from datetime import datetime
from elasticsearch import Elasticsearch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
from threading import Thread
import torch
import time
from concurrent.futures import ThreadPoolExecutor
import concurrent
import threading
import json
import os.path

# Note: escape, etree, datetime, Thread, ThreadPoolExecutor, concurrent,
# threading, os.path, pipeline and TextIteratorStreamer are not used by the
# active code path below; Elasticsearch is only referenced by commented-out
# es.update() calls.

# lock = threading.Lock()
# lock1 = threading.Lock()
# from cleantext import clean
# import re


if torch.cuda.is_available():
    model_id = "PartAI/Dorna-Llama3-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # pipe = pipeline(
    #     "text-generation",
    #     model=model,
    #     tokenizer=tokenizer,
    #     torch_dtype=torch.float16,
    #     device_map="auto",
    # )
else:
    # The rest of the script assumes `model` and `tokenizer` exist; fail early
    # with a clear message instead of a NameError inside generateKeywords().
    raise RuntimeError("A CUDA-capable GPU is required to run this script.")


# Bookkeeping globals. `counter` and `total` are leftovers that are never
# updated in this script; `id` (which shadows the builtin) is only used in the
# exception message of the main block; `keywords_count` is a default that is
# recomputed per document inside generateKeywords().
counter = 0
total = 0
remained = 0
id = ''
keywords_count = 15


def generateKeywords(text):
    global remained
    try:
        # Aim for roughly 15 keywords per 1000 characters of text, at least 1.
        keywords_count = (len(text) / 1000) * 15
        keywords_count = int(keywords_count)
        if keywords_count == 0:
            keywords_count = 1

        # Persian prompt, roughly: "Extract at least {keywords_count} important,
        # significant keywords from the 'text' and print them as a list in
        # Persian. 'text': {text}"
        messages = [{"role": "user", "content":
            '''از "متن" حداقل {} کلیدواژه مهم و پراهمیت را استخراج کن و در قالب لیست به زبان فارسی چاپ کن. "متن": {}
            '''.format(keywords_count, text)
        }]

        # Build the model input with the chat template and move it to the
        # model's device.
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # Stop generation on either the regular EOS token or Llama 3's
        # end-of-turn token.
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]
        model.generation_config.pad_token_id = tokenizer.pad_token_id

        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.85,
        )
        # lock0.release()

        # Keep only the newly generated tokens (everything after the prompt).
        response = outputs[0][input_ids.shape[-1]:]
        keywords = tokenizer.decode(response, skip_special_tokens=True)
        # lock1.acquire()
        # resp = es.update(index=index_name_i, id=id, doc={"content_keywords-llama3-str": str(keywords)})

        return keywords

    except Exception as inst:
        print(type(inst))   # the exception type
        print(inst.args)    # arguments stored in .args
        print("Exception: " + str(inst))


if __name__ == "__main__":
    start_time = time.time()

    # Load the test sections from disk.
    text_arr = []
    content_file = open('test_sections.json', "r")  # , encoding='utf-8'
    readfile = content_file.read()
    text_arr = json.loads(readfile)
    content_file.close()

    remained = len(text_arr)
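
    # Each element of text_arr is expected to look roughly like the object
    # below (inferred from the fields accessed in the loop; the actual file is
    # not part of this listing):
    #   {
    #     "content": "...",        # section text to extract keywords from
    #     "ai_keywords": "...",    # previous model output, regenerated below
    #     "real_keywords": "..."   # reference keywords for comparison
    #   }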

    try:
        keywords_file = open('test_sections2.json', "w", encoding='utf-8')
        keywords_array = []

        for content_data in text_arr:
            real_keywords = content_data['real_keywords']
            ai_keywords = content_data['ai_keywords']
            content = content_data['content']

            # Regenerate the AI keywords for this section with the model.
            ai_keywords = generateKeywords(content)

            # keywords_array.append({
            #     'id': id,
            #     'keywords': keywords
            # })
            keywords_array.append({
                'content': content,
                'ai_keywords': ai_keywords,
                'real_keywords': real_keywords,
            })

            remained = remained - 1
            print("remained items = {} ".format(remained))

        keywords_file.write(json.dumps(keywords_array, ensure_ascii=False))
        keywords_file.close()

        # for k_item in keywords_array:
        #     resp = es.update(index=index_name_i, id=k_item.id, doc={"content_keywords-llama3-str": str(k_item.keywords)})

    except Exception as inst:
        print(type(inst))   # the exception type
        print(inst.args)    # arguments stored in .args
        print(inst)         # __str__ allows args to be printed directly,
                            # but may be overridden in exception subclasses
        # Guard against division by zero: `total` is never set in this script.
        print("Exception:=> %s -> %.2f " % (id, counter / total if total else 0.0))

    end_time = time.time()
    print(f"elapsed time: {end_time-start_time}")