from html import escape
from lxml import etree
from datetime import datetime
from elasticsearch import Elasticsearch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
from threading import Thread
import torch
import time
from concurrent.futures import ThreadPoolExecutor
import concurrent
import threading
import json
import os.path

# Note: escape, etree, datetime, Thread, ThreadPoolExecutor, concurrent,
# threading, os.path, pipeline and TextIteratorStreamer are not used by the
# active code path below; Elasticsearch is only referenced by commented-out
# es.update() calls.

# lock = threading.Lock()
# lock1 = threading.Lock()
# from cleantext import clean
# import re


if torch.cuda.is_available():
    model_id = "PartAI/Dorna-Llama3-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # pipe = pipeline(
    #     "text-generation",
    #     model=model,
    #     tokenizer=tokenizer,
    #     torch_dtype=torch.float16,
    #     device_map="auto",
    # )
else:
    # The rest of the script assumes `model` and `tokenizer` exist; fail early
    # with a clear message instead of a NameError inside generateKeywords().
    raise RuntimeError("A CUDA-capable GPU is required to run this script.")


# Bookkeeping globals. `counter` and `total` are leftovers that are never
# updated in this script; `id` (which shadows the builtin) is only used in the
# exception message of the main block; `keywords_count` is a default that is
# recomputed per document inside generateKeywords().
counter = 0
total = 0
remained = 0
id = ''
keywords_count = 15


def generateKeywords(text):
    global remained
    try:
        # Aim for roughly 15 keywords per 1000 characters of text, at least 1.
        keywords_count = (len(text) / 1000) * 15
        keywords_count = int(keywords_count)
        if keywords_count == 0:
            keywords_count = 1

        # Persian prompt, roughly: "Extract at least {keywords_count} important,
        # significant keywords from the 'text' and print them as a list in
        # Persian. 'text': {text}"
        messages = [{"role": "user", "content":
            '''از "متن" حداقل {} کلیدواژه مهم و پراهمیت را استخراج کن و در قالب لیست به زبان فارسی چاپ کن. "متن": {}
            '''.format(keywords_count, text)
        }]

        # Build the model input with the chat template and move it to the
        # model's device.
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # Stop generation on either the regular EOS token or Llama 3's
        # end-of-turn token.
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]
        model.generation_config.pad_token_id = tokenizer.pad_token_id

        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.85,
        )
        # lock0.release()

        # Keep only the newly generated tokens (everything after the prompt).
        response = outputs[0][input_ids.shape[-1]:]
        keywords = tokenizer.decode(response, skip_special_tokens=True)
        # lock1.acquire()
        # resp = es.update(index=index_name_i, id=id, doc={"content_keywords-llama3-str": str(keywords)})

        return keywords

    except Exception as inst:
        print(type(inst))   # the exception type
        print(inst.args)    # arguments stored in .args
        print("Exception: " + str(inst))


if __name__ == "__main__":
    start_time = time.time()

    # Load the test sections from disk.
    text_arr = []
    content_file = open('test_sections.json', "r")  # , encoding='utf-8'
    readfile = content_file.read()
    text_arr = json.loads(readfile)
    content_file.close()

    remained = len(text_arr)
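
    # Each element of text_arr is expected to look roughly like the object
    # below (inferred from the fields accessed in the loop; the actual file is
    # not part of this listing):
    #   {
    #     "content": "...",        # section text to extract keywords from
    #     "ai_keywords": "...",    # previous model output, regenerated below
    #     "real_keywords": "..."   # reference keywords for comparison
    #   }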

    try:
        keywords_file = open('test_sections2.json', "w", encoding='utf-8')
        keywords_array = []

        for content_data in text_arr:
            real_keywords = content_data['real_keywords']
            ai_keywords = content_data['ai_keywords']
            content = content_data['content']

            # Regenerate the AI keywords for this section with the model.
            ai_keywords = generateKeywords(content)

            # keywords_array.append({
            #     'id': id,
            #     'keywords': keywords
            # })
            keywords_array.append({
                'content': content,
                'ai_keywords': ai_keywords,
                'real_keywords': real_keywords,
            })

            remained = remained - 1
            print("remained items = {} ".format(remained))

        keywords_file.write(json.dumps(keywords_array, ensure_ascii=False))
        keywords_file.close()

        # for k_item in keywords_array:
        #     resp = es.update(index=index_name_i, id=k_item.id, doc={"content_keywords-llama3-str": str(k_item.keywords)})

    except Exception as inst:
        print(type(inst))   # the exception type
        print(inst.args)    # arguments stored in .args
        print(inst)         # __str__ allows args to be printed directly,
                            # but may be overridden in exception subclasses
        # Guard against division by zero: `total` is never set in this script.
        print("Exception:=> %s -> %.2f " % (id, counter / total if total else 0.0))

    end_time = time.time()
    print(f"elapsed time: {end_time-start_time}")