"""
|
|||
|
این فایل با نرمالایزر پارسیور کار می کند
|
|||
|
"""
|
|||
|

from datetime import datetime
# from elasticsearch import Elasticsearch
from threading import Thread
import torch
import time
import json
import os.path
import os

os.environ['HF_HOME'] = "/home/admin/HFHOME"

# from general_functions import normalize_content
from funcs import write_to_json, read_from_json

from normalizer import Normalizer
from tokenizer import *

_normalizer = Normalizer(date_normalizing_needed=True)
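# `Normalizer` comes from the local `normalizer` module (a Parsivar-style
# normalizer); date_normalizing_needed=True presumably also standardizes date
# expressions, and the sub_alphabets() call used in the main loop likely maps
# Arabic character variants such as "ي" and "ك" to their Persian forms "ی" and
# "ک". This is an assumption based on typical Parsivar behaviour, not verified
# against the local module.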
address = os.getcwd()

# sections_list = read_from_json(address + '/data/clean_sections_11k.json')  # Main File
# sections_list = read_from_json('../data/clean_sections_11k.json')  # Main File
# sections_list = read_from_json('../data/simplized_sentences_110_2.json')  # Main File
# sections_list = read_from_json('./data/main_sections_170k_metadata.json')  # Main File

# not_have_two_token_kw = read_from_json('../data/not_have_two_token_kw.json')
# not_have_two_token_kw_list = [item3["id"] for item3 in not_have_two_token_kw]
not_have_two_token_kw_list = []

from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "PartAI/Dorna2-Llama3.1-8B-Instruct"
model_id = "meta-llama/Llama-3.1-70B-Instruct"

# use quantization to lower GPU memory usage
# 4 bit:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )
# 8 bit (note: BitsAndBytesConfig has no bnb_8bit_use_double_quant, bnb_8bit_quant_type
# or bnb_8bit_compute_dtype options; those variants exist only for 4-bit, so plain
# 8-bit loading is all that is actually configured here):
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True
)
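
# Rough memory arithmetic behind the quantization choice (weights only, ignoring
# activations and the KV cache): a 70B-parameter model takes about 70e9 * 2 bytes,
# roughly 140 GB, in bfloat16, about half of that (~70 GB) in 8-bit and about a
# quarter (~35 GB) in 4-bit, which is what makes loading it with device_map="auto"
# on a single node practical.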

print("Model Loading START:")
print(str(datetime.now()))
print()

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
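# "<|eot_id|>" is the end-of-turn token of the Llama 3 chat format; listing it
# alongside eos_token_id as an allowed terminator lets generation stop as soon as
# the assistant turn ends instead of running on to max_new_tokens.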
model.generation_config.pad_token_id = tokenizer.eos_token_id  # tokenizer.pad_token_id
print("Model Loading END:")
print(str(datetime.now()))
print()

sections_list = read_from_json('./data/sections_110.json')  # Main File

# if torch.cuda.is_available():
#     # model_id = "PartAI/Dorna-Llama3-8B-Instruct"
#     # model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#     model_id = "meta-llama/Llama-3.1-70B-Instruct"
#     model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
#     tokenizer = AutoTokenizer.from_pretrained(model_id)

# index_name_i = 'semantic_search-v10'

# es = Elasticsearch(
#     "http://127.0.0.1:6900",
#     # ca_certs="/path/to/http_ca.crt",
#     basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
# )

counter = 0
total = 0
remained = 0
id = ''
keywords_count = 15

def generateKeywords(text):
    global remained
    try:
        # aim for roughly 15 key phrases per 1,000 characters of input, at least 1
        keywords_count = (len(text) / 1000) * 15
        keywords_count = int(keywords_count)
        if keywords_count == 0:
            keywords_count = 1

        # Persian-language version of the same prompt (kept for reference):
        # messages = [{"role": "system", "content": "تو یک وکیل حقوق دان هستی و باید بتوانی متن های قانونی و حقوقی را بدون تغییر اصطلاحات فنی، به صورتی توضیح دهی که افراد غیر حقوق دان، معنای متن را درک کنند. " },
        #             {"role": "user", "content":
        #                 '''از "متن" حداقل {} عبارت های کلیدی مهم و پراهمیت را استخراج کن و عبارت های کلیدی را در قالب لیست به زبان فارسی چاپ کن و هر کلید عبارت کلیدی را در یک خط جدید قرار بده و هیچ گونه توضیحی در ابتدا یا انتهای پاسخ، اضافه نکن.
        # هر عبارت کلیدی دارای یک شماره ترتیبی در ابتدای آن باشد. عبارت های کلیدی، دقیقا در متن موجود باشد. بسیار مهم و ضروری است که طول هر عبارت کلیدی حداقل دو توکن داشته باشد و عبارت کلیدی یک توکنی قابل قبول نیست. تاکید می کنم که هیچ عبارت کلیدی نباید فقط یک توکن داشته باشد. نام سازمان ها و نهادها و اشخاص حقوقی، حتما به عنوان عبارت کلیدی درنظر گرفته شود. هیچ عبارت کلیدی، فعل یا حرف اضافه نباشد و فقط شامل اسم هایی باشد که به هم اضافه شده اند. هیچ عبارت کلیدی نباید با حرف اضافه یا حرف «و» تمام شود. ضروری است که عبارت های کلیدی شامل ماده، بند، تبصره یا تاریخ ها نباشند.'''
        #                 .format(keywords_count)
        #             },
        #             {"role": "user", "content":
        #                 '''"متن": {}'''.format(text)
        #             },]
messages = [{"role": "system", "content": "You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text." },
|
|||
|
{"role": "user", "content":
|
|||
|
'''Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
|
|||
|
Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".'''
|
|||
|
.format(keywords_count)
|
|||
|
},
|
|||
|
{"role": "user", "content":
|
|||
|
'''"متن": {}'''.format(text)
|
|||
|
},]
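
        # apply_chat_template renders the messages in the Llama 3 chat format and
        # returns the prompt token ids; add_generation_prompt=True appends the
        # assistant header so the model starts generating its answer directly.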
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]
        # Llama tokenizers typically define no pad token (tokenizer.pad_token_id is
        # None), so reuse EOS for padding, matching the module-level setting above.
        model.generation_config.pad_token_id = tokenizer.eos_token_id

        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.85,
        )
        # lock0.release()
        # keep only the newly generated tokens (drop the prompt part of the output)
        response = outputs[0][input_ids.shape[-1]:]
        keywords = tokenizer.decode(response, skip_special_tokens=True)
        # lock1.acquire()
        # resp = es.update(index=index_name_i, id=id, doc={"content_keywords-llama3-str": str(keywords)})

        return keywords

    except Exception as inst:
        print(type(inst))   # the exception type
        print(inst.args)    # arguments stored in .args
        print("Exception: " + str(inst))
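
# Illustrative sketch (not called anywhere in this script): the prompt asks the
# model for one numbered key phrase per line, so the raw string returned by
# generateKeywords could be turned into a clean Python list roughly like this.
# The exact numbering style the model uses ("1.", "1-", "۱.") is an assumption
# and may need adjusting.
def parse_keywords(raw_keywords):
    phrases = []
    for line in (raw_keywords or "").splitlines():
        line = line.strip()
        # drop a leading Western or Persian sequence number such as "1." or "۱-"
        line = line.lstrip("0123456789۰۱۲۳۴۵۶۷۸۹").lstrip(".-) ").strip()
        if line:
            phrases.append(line)
    return phrases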
if __name__ == "__main__":
|
|||
|
start_time = time.time()
|
|||
|
print("start_time: "+str(datetime.now()))
|
|||
|
|
|||
|
try:
|
|||
|
keywords_dict = []
|
|||
|
|
|||
|
count = 1
|
|||
|
for content_item in sections_list:
|
|||
|
id = content_item['id']
|
|||
|
# if not id in not_have_two_token_kw_list:
|
|||
|
# continue
|
|||
|
content = content_item['content']
|
|||
|
content_len = len(content.split())
|
|||
|
# کنارگذاشتن محتواهای با حجم زیاد
|
|||
|
if content_len > 2000:
|
|||
|
print("too long content " + str(id))
|
|||
|
continue
|
|||
|
content = _normalizer.sub_alphabets(content)
|
|||
|
keywords = generateKeywords(content)
|
|||
|
print("section " + str(count) + "/" + str(len(not_have_two_token_kw_list)) + " keyword extracting ... ")
|
|||
|
keywords_dict.append({
|
|||
|
'id':id,
|
|||
|
'keywords':keywords
|
|||
|
})
|
|||
|
if count % 500 == 0:
|
|||
|
write_to_json(keywords_dict, f"./data/sections_kw_llama_8b_main_{count}.json")
|
|||
|
keywords_dict = []
|
|||
|
count+=1
|
|||
|
|
|||
|
write_to_json(keywords_dict, f"./data/sections_kw_llama_8b_main_{count}.json")
|
|||
|
|
|||
|
except Exception as inst:
|
|||
|
print(type(inst)) # the exception type
|
|||
|
print(inst.args) # arguments stored in .args
|
|||
|
|
|||
|
end_time = time.time()
|
|||
|
print("end_time: "+ str(datetime.now()))
|
|||
|
operation_time = (int(end_time-start_time)/60)/60
|
|||
|
print(f"elapsed time: {operation_time} hours")
|
|||
|
print(f"Finished!!!")
|