# keyword_llama/do_extractor.py

"""
This file works with the Parsivar normalizer.
"""
from datetime import datetime
# from elasticsearch import Elasticsearch
from threading import Thread
import torch
import time
import json 
import os.path
import os
os.environ['HF_HOME'] = "/home/admin/HFHOME"
# from general_functions import normalize_content
from funcs import write_to_json, read_from_json

from normalizer import Normalizer
from tokenizer import *
# Shared text normalizer instance, created once at import time with date
# normalization switched on.
_normalizer = Normalizer(date_normalizing_needed=True)
# Working directory at import time; only the commented-out loader below
# builds a path from it.
address = os.getcwd()
# Alternative input files used in earlier runs (kept for reference):
# sections_list = read_from_json(address + '/data/clean_sections_11k.json') # Main File
# sections_list = read_from_json('../data/clean_sections_11k.json') # Main File
# sections_list = read_from_json('../data/simplized_sentences_110_2.json') # Main File
# sections_list = read_from_json('./data/main_sections_170k_metadata.json') # Main File


# Optional list of section ids; the loader that would fill it from
# not_have_two_token_kw.json is commented out, so the list stays empty.
# not_have_two_token_kw = read_from_json('../data/not_have_two_token_kw.json')
# not_have_two_token_kw_list = [item3["id"] for item3 in not_have_two_token_kw]
not_have_two_token_kw_list = []

import json
from tqdm import tqdm
import time

from transformers import AutoTokenizer, AutoModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
os.environ['HF_HOME'] = "/home/admin/HFHOME"

#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#model_id = "PartAI/Dorna2-Llama3.1-8B-Instruct"

# Hugging Face model repository to load (tokenizer and weights).
model_id = "meta-llama/Llama-3.1-70B-Instruct"

# Quantize the weights with bitsandbytes to lower GPU memory usage.
# 4-bit alternative:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )
# 8-bit: the double-quant / quant-type / compute-dtype options exist only for
# 4-bit loading (``bnb_4bit_*``); there are no ``bnb_8bit_*`` parameters, so
# 8-bit loading is requested with the flag alone.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

print("Model Loading START:")
print(str(datetime.now()))
print()

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
)

# Token ids that end a generated reply: the tokenizer's EOS token and the
# "<|eot_id|>" token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
# Use the EOS token as the padding token for generation.
model.generation_config.pad_token_id = tokenizer.eos_token_id

print("Model Loading END:")
print(str(datetime.now()))
print()

# Input sections to process; the driver loop reads the 'id' and 'content'
# keys of each item.
sections_list = read_from_json('./data/sections_110.json') # Main File
# if torch.cuda.is_available():
#     #model_id = "PartAI/Dorna-Llama3-8B-Instruct"
#     # model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    
#     model_id = "meta-llama/Llama-3.1-70B-Instruct"
#     model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
#     tokenizer = AutoTokenizer.from_pretrained(model_id)

# index_name_i = 'semantic_search-v10'

# es = Elasticsearch(
#     "http://127.0.0.1:6900",
#     # ca_certs="/path/to/http_ca.crt",
#     basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
# )

# Module-level bookkeeping defaults.
counter = total = remained = 0
# NOTE: this name shadows the builtin ``id()`` for the whole module.
id = str()
# Presumably the default number of key phrases to request — TODO confirm.
keywords_count = 15
   
def generateKeywords(text):
    """Ask the loaded chat model for the key phrases of a legal text.

    The number of phrases requested scales with the text length: 15 per
    1000 characters, rounded down, with a minimum of one.

    Args:
        text: Section content to extract key phrases from.

    Returns:
        The decoded model reply as one string (the prompt asks for a
        numbered list with one key phrase per line), or ``None`` if anything
        fails. Failures are printed rather than raised so that a long batch
        run keeps going.
    """
    try:
        # Integer arithmetic keeps the "15 per 1000 characters" rule exact
        # (no float rounding before truncation); never ask for zero phrases.
        keywords_count = max(1, len(text) * 15 // 1000)

        # An earlier Persian-language version of this prompt was replaced by
        # the English wording below; the text itself is still sent under the
        # Persian label for "text".
        messages = [
            {"role": "system", "content": "You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text."},
            {"role": "user", "content":
                '''Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".'''
                .format(keywords_count)
            },
            {"role": "user", "content":
                '''"متن": {}'''.format(text)
            },
        ]

        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # Generation stops at either the EOS token or "<|eot_id|>".
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]

        # The tokenizer may define no padding token; fall back to EOS and
        # pass the value per call instead of overwriting the model's shared
        # generation config with a possibly-None id on every request.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id

        # NOTE(review): the reply is capped at 256 new tokens no matter how
        # many phrases were requested, so long lists may be cut off —
        # confirm this is intended.
        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=terminators,
            pad_token_id=pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.85,
        )

        # Drop the prompt tokens; only the newly generated tail is the reply.
        response = outputs[0][input_ids.shape[-1]:]
        return tokenizer.decode(response, skip_special_tokens=True)

    except Exception as inst:
        # Best-effort: report the failure and let the caller carry on.
        print(type(inst))    # the exception type
        print(inst.args)     # arguments stored in .args
        print("Exception: " + str(inst))
        return None


if __name__ == "__main__":
    # Batch driver: extract key phrases for every section in sections_list
    # and write the results to JSON files in batches of 500.
    start_time = time.time()
    print("start_time: "+str(datetime.now()))

    # Results collected since the last flush to disk.
    keywords_dict = []
    # 1-based number of the section currently being processed; sections
    # skipped for being too long are not counted.
    count = 1
    total_sections = len(sections_list)

    try:
        for content_item in sections_list:
            section_id = content_item['id']
            # if not section_id in not_have_two_token_kw_list:
            #     continue
            content = content_item['content']
            # Skip very large sections (more than 2000 whitespace-separated
            # words).
            if len(content.split()) > 2000:
                print("too long content " + str(section_id))
                continue
            content = _normalizer.sub_alphabets(content)
            keywords = generateKeywords(content)
            # Progress is reported against the number of input sections.
            print("section " + str(count) + "/" + str(total_sections) + " keyword extracting ... ")
            keywords_dict.append({
                'id': section_id,
                'keywords': keywords,
            })
            if count % 500 == 0:
                write_to_json(keywords_dict, f"./data/sections_kw_llama_8b_main_{count}.json")
                keywords_dict = []
            count += 1

    except Exception as inst:
        print(type(inst))    # the exception type
        print(inst.args)     # arguments stored in .args

    # Flush whatever is still pending, including after a failure above, so
    # finished generations are not thrown away. Nothing is written when the
    # last batch is empty. The file name keeps the original numbering (count
    # is one past the last processed section after a complete run).
    if keywords_dict:
        try:
            write_to_json(keywords_dict, f"./data/sections_kw_llama_8b_main_{count}.json")
        except Exception as inst:
            print(type(inst))    # the exception type
            print(inst.args)     # arguments stored in .args

    end_time = time.time()
    print("end_time: "+ str(datetime.now()))
    # Whole elapsed seconds converted to hours.
    operation_time = (int(end_time-start_time)/60)/60
    print(f"elapsed time: {operation_time} hours")
    print("Finished!!!")