import json
# from tqdm import tqdm
import time
import datetime

from funcs import save_to_file_by_address, read_file_by_address  # , read_from_json
# from pandas import read_excel

import torch
import os

# HF_HOME must be set before transformers/huggingface_hub are imported,
# otherwise the default cache location is already resolved and this has no effect.
os.environ['HF_HOME'] = "/home/admin/HFHOME"

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "meta-llama/Llama-3.1-70B-Instruct"

# Load the weights in 8-bit. BitsAndBytesConfig has no bnb_8bit_use_double_quant /
# bnb_8bit_quant_type / bnb_8bit_compute_dtype options ("nf8" is not a valid quant
# type either), so only load_in_8bit=True takes effect.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
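
# If tighter memory were needed, a 4-bit NF4 config would be the usual
# alternative (a sketch, not used here):
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )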

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
model.generation_config.pad_token_id = tokenizer.eos_token_id  # tokenizer.pad_token_id
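# Llama 3 ends assistant turns with <|eot_id|> rather than the base EOS token,
# so both are passed to generate() as stop ids; setting pad_token_id explicitly
# silences the "pad_token_id is not set" warning during generation.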

# SYS_PROMPT = """You receive a Persian legal text and extract from it the keywords that are most important.
# And you don't need to provide explanations or additional text.
# Put each keyword on a single line."
# """  # Explain your answer step by step.

SYS_PROMPT = """You are a highly accurate and detail-oriented assistant specialized in analyzing Persian legal texts."""  # Explain your answer step by step.
# The Persian system prompt below overrides the English one above.
SYS_PROMPT = """شما یک دستیار متخصص در استخراج عبارات کلیدی از متون حقوقی فارسی هستید. وظیفه شما این است که از متن ارائه شده توسط کاربر، عبارات اسمی کلیدی مهم و معنادار را استخراج کنید."""  # gemini prompt


def format_prompt(SENTENCE):
    # PROMPT = f"Persian legal text: {SENTENCE}."
    PROMPT = f"متن: {SENTENCE}."  # "متن" means "text"
    return PROMPT


def kw_count_calculator(text):
    # ask for roughly 15 keywords per 1000 characters of input, with a minimum of 1
    keywords_count = (len(text) / 1000) * 15
    keywords_count = int(keywords_count)
    if keywords_count == 0:
        keywords_count = 1
    return keywords_count
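# e.g. a 3200-character section yields int(3.2 * 15) = 48 requested keywords,
# while a 40-character one falls back to the minimum of 1.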


def generate(formatted_prompt):
    keywords_count = kw_count_calculator(formatted_prompt)

    # Gemini prompt (English gloss right below the string)
    USER_PROMPT = f"""از متن ارائه شده، حداقل {keywords_count} عبارت کلیدی مهم و معنادار را استخراج کنید. خروجی باید به صورت زیر باشد:
• یک لیست فارسی
• شماره ترتیبی در ابتدای هر عبارت
• هر عبارت در یک خط جداگانه *
• بدون هیچ توضیح اضافی در ابتدا یا انتهای پاسخ
موارد زیر در استخراج عبارات کلیدی الزامی است:
• بسیار مهم و حیاتی است که عبارات کلیدی باید دقیقاً در متن موجود باشند، بنابراین هرگز از خلاقیت برای ساختن کلیدواژهای که دقیقا در متن وجود ندارد، استفاده نکن.
• طول هر عبارت کلیدی حداقل دو کلمه باشد. عبارات تک-کلمهای قابل قبول نیستند.
• نام سازمانها، مؤسسات و اشخاص حقوقی باید به عنوان عبارات کلیدی در نظر گرفته شوند.
• عبارات کلیدی نباید فعل یا حرف اضافه باشند و فقط باید شامل اسمهایی باشند که به هم اضافه شدهاند (عبارت اسمی).
• عبارات کلیدی نباید به حرف اضافه یا حرف "و" ختم شوند.
• عبارات کلیدی نباید شامل کلمات "ماده"، "تبصره" و "بند" یا تاریخها باشند.
به عنوان مثال، اگر متن "قانون مدنی جمهوری اسلامی ایران در ماده ۲۲۲ در خصوص مسئولیت مدنی اشخاص حقوقی در تبصره ۱ به موضوع خسارت ناشی از تقصیر کارکنان اشاره دارد." باشد، خروجی باید به شکل زیر باشد:
1. قانون مدنی جمهوری اسلامی ایران
2. مسئولیت مدنی اشخاص حقوقی
3. خسارت ناشی از تقصیر کارکنان
اکنون متن مورد نظر را دریافت خواهید کرد.
"""
    formatted_prompt = formatted_prompt[:50000]  # truncate very long sections to avoid GPU OOM
    messages = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": USER_PROMPT},
        {"role": "user", "content": formatted_prompt}
    ]
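    # The instructions and the section text go in as two consecutive "user"
    # turns; the Llama 3 chat template renders them as two separate user
    # messages rather than merging them.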
    # render the chat into the model's prompt format and tokenize it
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(
        input_ids,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    # keep only the newly generated tokens, not the echoed prompt
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)
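
# Sampling settings (do_sample=True, temperature=0.6, top_p=0.9) follow the
# Meta-Llama-3 generation examples; greedy decoding would make runs
# reproducible at the cost of some diversity in the extracted phrases.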


def get_rules(sentence):
    formatted_prompt = format_prompt(sentence)
    # the model returns one key phrase per line; drop blanks and stray whitespace
    rules = generate(formatted_prompt).split('\n')
    result = [r.strip() for r in rules if r.strip()]
    return result
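
# Expected output shape (illustrative, not a recorded run):
#   get_rules("قانون مدنی جمهوری اسلامی ایران ... اشاره دارد.")
#   -> ['1. قانون مدنی جمهوری اسلامی ایران', '2. مسئولیت مدنی اشخاص حقوقی', ...]
# Items keep the "1."-style numbering the prompt asks for; strip it downstream
# if bare phrases are needed.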


if __name__ == "__main__":
    start_time = datetime.datetime.now()
    print(f'start time: {start_time}')
    # inputfile = open('./data/main_classes_dataset_03.json', "r", encoding='utf-8')
    inputfile = open('./data/sections_170k_normalized.json', "r", encoding='utf-8')
    data = json.load(inputfile)
    inputfile.close()

    # ids of sections handled in previous runs; skipping them makes the script resumable
    prev_ids = read_file_by_address('./data/prev_kw_ids_170k.txt').splitlines()
    counter = 1
    file_counter = 1
    temp_dict = []
    temp_data_text = ''
    period_sections = []
    period_ids_text = f'***** file number: {file_counter} *****\n'
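    # period_sections buffers the current batch of processed items;
    # period_ids_text buffers their ids (under a file-number header) for the resume file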
    for item in data:
        if item['id'] in prev_ids:
            continue
        content = item['content']
        try:
            item['keywords'] = get_rules(content)
        except Exception as e:
            print(f"section kw error : {item['id']} -> {e}\n")
            counter += 1
            continue

        period_ids_text += f"{item['id']} \n"
        period_sections.append(item)
        print(f"section:{counter}-id:{item['id']}")
        # temp_dict.append(item)
        if counter % 1000 == 0:
            print(f"period ==>> {datetime.datetime.now() - start_time} elapsed for {counter} sections +++ \n")
            outputfile = open(f'./data/sections_kw_llama8b_170k_{file_counter}.json', "a+", encoding='utf-8')
            outputfile.write(json.dumps(period_sections, ensure_ascii=False))
            outputfile.close()
            print(f"file {file_counter} created in {datetime.datetime.now()} +++++++++++++++++++++++++\n ")

            file_counter += 1
            period_sections = []
            # save processed section ids so later executions of this script can skip them
            save_to_file_by_address('./data/prev_kw_ids_170k.txt', period_ids_text)
            period_ids_text = f'***** file number: {file_counter} *****\n'
        counter += 1
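    # Checkpointing: every 1000 processed sections the batch is written to its
    # own JSON file and the batch ids are persisted via save_to_file_by_address
    # (assumed to append to the resume file), so an interrupted run can pick up
    # where it stopped.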

    # write out whatever remains after the loop (the final, possibly partial, batch)
    outputfile = open(f'./data/sections_kw_llama8b_170k_{file_counter}.json', "w", encoding='utf-8')
    outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=4))
    outputfile.close()
    print(f"file {file_counter} created in {datetime.datetime.now()} +++++++++++++++++++++++++ ")

    end_time = datetime.datetime.now()
    print(f"end_time: {end_time}")
    print(f"elapsed time: {end_time - start_time}")
    print(f"elapsed time: {(end_time - start_time).total_seconds() / 86400} days")
    print("end")

exit()

"""
system prompt version 2 for test:

You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text.

user prompt version 2 for test:

Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".
"""

# Deepseek suggestion
"""
system prompt:
You are a highly accurate and detail-oriented assistant specialized in analyzing Persian legal texts.

user prompt:
Extract at least {} important and significant key phrases from the provided text. Follow these guidelines strictly:
Print the key phrases as a numbered list in Persian, with each key phrase on a new line.
Do not add any explanations, introductions, or conclusions to the output.
Each key phrase must:
- Be present exactly in the text.
- Consist of at least two tokens (single-token key phrases are not acceptable).
- Be a noun phrase (no verbs, prepositions, or single-token words).
- Not end with a preposition or the letter "و".
- Exclude the following terms: "ماده", "تبصره", "بند", "تاریخ ها".
Include names of organizations, institutions, and legal entities as key phrases.
Ensure the output is clean and adheres to all the above rules.
"""