keyword_llama/llama_givechi.py
"""
این فایل با نرمالایزر پارسیور کار می کند
"""
from datetime import datetime
# from elasticsearch import Elasticsearch
from threading import Thread
import torch
import time
import json
import os.path
import os
from funcs import write_to_json, read_from_json
from normalizer import Normalizer
from tokenizer import *
_normalizer = Normalizer(date_normalizing_needed=True)
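# The Normalizer instance is used in __main__ only via sub_alphabets(), which in
# Parsivar-style normalizers presumably unifies Arabic/Persian character variants;
# date normalization is also enabled here via date_normalizing_needed=True.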
address = os.getcwd()
# sections_list = read_from_json(address + '/data/clean_sections_11k.json') # Main File
# sections_list = read_from_json('../data/clean_sections_11k.json') # Main File
# sections_list = read_from_json('../data/simplized_sentences_110_2.json') # Main File
# sections_list = read_from_json('./data/main_sections_170k_metadata.json') # Main File
def read_from_json1(file_address):
    data_dict = []
    # read the data from the JSON file
    with open(file_address, 'r', encoding='utf-8') as file:
        loaded_data = json.load(file)
    # collect the loaded items
    # for item in loaded_data:
    #     data_dict.append(item)
    return loaded_data
sections_list = read_from_json1("./data_givechi/main_classes_dataset_03.json") # Main File
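# Based on how it is consumed in __main__ below, sections_list is expected to be a
# dict mapping a class label to a list of section dicts with at least the keys
# 'id', 'qanon_id' and 'content'.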
# not_have_two_token_kw = read_from_json('../data/not_have_two_token_kw.json')
# not_have_two_token_kw_list = [item3["id"] for item3 in not_have_two_token_kw]
not_have_two_token_kw_list = []
os.environ['HF_HOME'] = "/home/admin/HFHOME"
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#model_id = "PartAI/Dorna2-Llama3.1-8B-Instruct"
model_id = "meta-llama/Llama-3.1-70B-Instruct"
# use quantization to lower GPU usage
# 4 bit:
# bnb_config = BitsAndBytesConfig(
# load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )
# 8 bit (the double-quant / quant-type / compute-dtype options above only apply to
# 4-bit quantization, so only load_in_8bit is passed here):
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True
)
print("Model Loading START:")
print(str(datetime.now()))
print()
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
model.generation_config.pad_token_id = tokenizer.eos_token_id #tokenizer.pad_token_id
print("Model Loading END:")
print(str(datetime.now()))
print()
# if torch.cuda.is_available():
# #model_id = "PartAI/Dorna-Llama3-8B-Instruct"
# # model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "meta-llama/Llama-3.1-70B-Instruct"
# model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# index_name_i = 'semantic_search-v10'
# es = Elasticsearch(
# "http://127.0.0.1:6900",
# # ca_certs="/path/to/http_ca.crt",
# basic_auth=("elastic", "SG*7eGwg+KG2_*-1_mMm")
# )
counter = 0
total = 0
remained = 0
id = ''
keywords_count = 15
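# Module-level state shared with generateKeywords() and the __main__ loop below.
# Note: keywords_count above is only a default; generateKeywords() recomputes a
# local value from the text length (used only by the commented-out Persian prompt).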
def generateKeywords(law):
    global remained
    try:
        # target roughly 15 key phrases per 1000 characters of text
        # (this count is only used by the commented-out Persian prompt below)
        keywords_count = (len(law) / 1000) * 15
        keywords_count = int(keywords_count)
        if keywords_count == 0:
            keywords_count = 1
# messages = [{"role": "system", "content": "تو یک وکیل حقوق دان هستی و باید بتوانی متن های قانونی و حقوقی را بدون تغییر اصطلاحات فنی، به صورتی توضیح دهی که افراد غیر حقوق دان، معنای متن را درک کنند. " },
# {"role": "user", "content":
# '''از "متن" حداقل {} عبارت های کلیدی مهم و پراهمیت را استخراج کن و عبارت های کلیدی را در قالب لیست به زبان فارسی چاپ کن و هر کلید عبارت کلیدی را در یک خط جدید قرار بده و هیچ گونه توضیحی در ابتدا یا انتهای پاسخ، اضافه نکن.
# هر عبارت کلیدی دارای یک شماره ترتیبی در ابتدای آن باشد. عبارت های کلیدی، دقیقا در متن موجود باشد. بسیار مهم و ضروری است که طول هر عبارت کلیدی حداقل دو توکن داشته باشد و عبارت کلیدی یک توکنی قابل قبول نیست. تاکید می کنم که هیچ عبارت کلیدی نباید فقط یک توکن داشته باشد. نام سازمان ها و نهادها و اشخاص حقوقی، حتما به عنوان عبارت کلیدی درنظر گرفته شود. هیچ عبارت کلیدی، فعل یا حرف اضافه نباشد و فقط شامل اسم هایی باشد که به هم اضافه شده اند. هیچ عبارت کلیدی نباید با حرف اضافه یا حرف «و» تمام شود. ضروری است که عبارت های کلیدی شامل ماده، بند، تبصره یا تاریخ ها نباشند.'''
# .format(keywords_count)
# },
# {"role": "user", "content":
# '''"متن": {}'''.format(text)
# },]
messages = [{"role": "system", "content": '''You are a lawyer and legal expert who can interpret legal texts well, understand the semantic layers of the text and infer logical relationships in the text.
You can understand Persian and English and interpret expressions in a Legal Context. You are also an expert in Deontic Logic and can understand the logical structure of law and rules in text.
Use uppercase English letters such as A, B, C, etc. to identify all possible propositions. Do not include negative tones such as ""not"" in the propositions. For example, if the sentence is ""It is not bored,"" you should use ""A: bored"" to represent it.
Next, for each proposition, use the symbol to represent its negative form. For example, the negative form of proposition A can be expressed as ¬A.
Now, carefully analyze the context and find causal relationship between propositions seriously. A causal expression is only established when the context directly and explicitly supports this relationship. Use arrows (→) to indicate causal relationships, for example, ""If A, then B"", ""B if A"" and ""A causes B"" etc. must be represented as A → B.
another example, ""if A and not C then B"" should be represented as A ∧ ¬C → B
if the causal expression is not strict and is somehow loose, do not state it.
Extraction Criteria:
1- Defintion of a propostion:
a proposition is an statement which could be true or false. ""I'm an animal"" is a propostion, but ""Ministry of Roads and Urban Development"" is not.
Defintion of strict caustion:
A strictly causes B if and only if:
1. Whenever A occurs, B necessarily occurs.
2. The occurrence of B is entirely dependent on the occurrence of A.
3. There are no other factors or conditions that can independently cause B.
4. The relationship between A and B is invariant and holds in all relevant circumstances.
Additional Instructions:
Do Not Add Extra Information: Provide only the extracted rules in the specified format without additional commentary or explanations.
Formatting Instructions:
Language: Output must be in English.
output only propositions and causal expressions that are explicitly stated in the following text.''' },
{"role": "user", "content": f'''"this is your text to process:{law}'''
},]
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]
        # Llama tokenizers have no dedicated pad token, so reuse the EOS token id
        # (consistent with the module-level setting above).
        model.generation_config.pad_token_id = tokenizer.eos_token_id
        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.85,
        )
        # lock0.release()
        response = outputs[0][input_ids.shape[-1]:]
        keywords = tokenizer.decode(response, skip_special_tokens=True)
        # lock1.acquire()
        # resp = es.update(index=index_name_i, id=id, doc={"content_keywords-llama3-str": str(keywords)})
        return keywords
    except Exception as inst:
        print(type(inst))    # the exception type
        print(inst.args)     # arguments stored in .args
        print("Exception: " + str(inst))
if __name__ == "__main__":
    start_time = time.time()
    print("start_time: " + str(datetime.now()))
    try:
        keywords_dict = []
        count = 1
        finall_data = []
        for q_class, current_section_list in sections_list.items():
            for content_item in current_section_list:
                id = content_item['id']
                qanon_id = content_item['qanon_id']
                # if not id in not_have_two_token_kw_list:
                #     continue
                content = content_item['content']
                content_len = len(content.split())
                # skip overly long contents
                if content_len > 2000:
                    print("too long content " + str(id))
                    continue
                content = _normalizer.sub_alphabets(content)
                prompt_result = generateKeywords(content)
                print("section " + str(count) + " prompting ... ")
                keywords_dict.append({
                    'id': id,
                    'qanon-id': qanon_id,
                    'content': content,
                    'prompt-result': prompt_result
                })
                if count == 5:
                    write_to_json(keywords_dict, f"./data_givechi/main_test_{count}.json")
                    # keywords_dict = []
                count += 1
            # finall_data.append({'class':q_class, 'sections': keywords_dict})
            finall_data.append({q_class: keywords_dict})
        write_to_json(finall_data, "./data_givechi/main_classes_dataset_result.json")
    except Exception as inst:
        print(type(inst))   # the exception type
        print(inst.args)    # arguments stored in .args

    end_time = time.time()
    print("end_time: " + str(datetime.now()))
    operation_time = (int(end_time - start_time) / 60) / 60
    print(f"elapsed time: {operation_time} hours")
    print("Finished!!!")