commit f170eb5001

18 .vscode/launch.json (vendored, new file)
@@ -0,0 +1,18 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            //"console": "integratedTerminal",
            "console": "internalConsole",
            "justMyCode": false,
            "python": "/home/admin/NLP/.env/bin/python"
        }
    ]
}

27 data_1060_vs_3800 .py (new file)
@@ -0,0 +1,27 @@
# this code compares the section ids in sections_1060.json with the ids in
# new_3800_sections.json and prints every 1060 id that also appears in the
# 3800 set

import json


inputfile0 = open('./data/new_3800_sections.json', "r", encoding='utf-8')
data_3800 = json.load(inputfile0)
inputfile0.close()

data_3800_ids = [sec['id'] for sec in data_3800]

# read sections in json file
inputfile = open('./data/sections_1060.json', "r", encoding='utf-8')
data = json.load(inputfile)
inputfile.close()

# note: despite the name, this list collects the 1060 ids that ARE present in the
# 3800 set; negate the membership check below if the truly absent ids are wanted
absent_1060_ids = []
for subject, sections in data.items():
    for item in sections:
        id_1060 = item['id']
        if id_1060 in data_3800_ids:
            absent_1060_ids.append(id_1060)

for idd in absent_1060_ids:
    print(idd)
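
# Added sketch (not in the original script): the same overlap can be computed with a set
# intersection instead of the O(len(data) * len(data_3800)) membership scans above; the
# name shared_ids is illustrative.
shared_ids = set(data_3800_ids) & {item['id'] for sections in data.values() for item in sections}
assert sorted(shared_ids) == sorted(set(absent_1060_ids))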

115 funcs.py (new file)
@@ -0,0 +1,115 @@
import re
import os
import json

# print(os.getcwd())
def remove_signs():
    text = read_file()
    # lines =
    pattern = r"\(|\)"
    text = re.sub(pattern, '', text)
    # text = re.sub(')', '', text)
    # text = re.sub('/', '', text)

    return text

def read_file():
    with open('./data/DATASET_2.txt', 'r', encoding='utf-8') as file:
        text = ''
        try:
            text = str(file.read())
        except Exception:
            pass
    return text

def read_file_by_address(file_address):
    # open in 'a+' so the file is created if it does not exist yet
    with open(file_address, 'a+', encoding='utf-8') as file:
        text = ''
        try:
            # move the cursor to the beginning of the file to read its content
            file.seek(0)
            text = file.read()
        except Exception:
            pass
    return text

def save_to_file(result):
    with open('./data/DATASET_3.txt', 'a+', encoding='utf-8') as file:
        previous_result = ''
        try:
            previous_result = file.read()
        except Exception:
            pass
        file.write(result)

def save_to_file_by_address(file_address, text):
    with open(file_address, 'a+', encoding='utf-8') as file:
        previous_result = ''
        try:
            previous_result = file.read()
        except Exception:
            pass
        file.write(text)


# def read_from_excel(file_address, column_name):
#     # read the Excel file
#     data = read_excel(file_address)

#     # extract the contents of the requested column
#     column_data = data[column_name]
#     return column_data

# def add_columndata_to_excel(file_address, column_name, columndata):

#     # read the Excel file
#     data = read_excel(file_address)

#     # add the new column to the data
#     data[column_name] = columndata

#     # save the data back to the Excel file
#     data.to_excel(file_address, index=False)

def write_to_json(data, file_address):

    # convert the dictionary to JSON format
    json_data = json.dumps(data, indent=2, ensure_ascii=False)

    # save the file
    with open(file_address, 'w+', encoding='utf-8') as file:
        file.write(json_data)

    return True

def read_from_json(file_address):
    data_dict = []
    # read the data from the JSON file
    with open(file_address, 'r', encoding='utf-8') as file:
        loaded_data = json.load(file)

    # collect the loaded items
    for item in loaded_data:
        data_dict.append(item)
    return data_dict


def separated_date_format_finder(date_ner):
    # returns True if the string contains something that looks like a numeric date
    result = False
    date_ner = date_ner.replace('.', '/')
    date_ner = date_ner.replace('،', '/')
    date_ner = date_ner.replace('ر', '/')
    #date_pattern = r'\d{1,2} /\d{1,2} /\d{2,4}|\d{1,2}/\d{1,2}/\d{2,4}|\d{2,4} /\d{1,2} /\d{1,2}|\d{2,4}/\d{1,2}/\d{1,2}'
    date_pattern = r'\b(?:(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9])|(?:1[0-2]|0?[1-9])/?(?:3[01]|[12][0-9]|0?[1-9])/?(?:14[0-7][0-9]|13[0-9][0-9]|128[0-9]|[0-9]{2}))\b'
    regex = re.compile(date_pattern)
    match_dates = regex.finditer(date_ner)
    for date_item in match_dates:
        result = True
        break
    return result


if __name__ == "__main__":

    pass
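
    # Small usage sketch (added for illustration; only functions defined above are used):
    # separated_date_format_finder() reports whether a string contains a numeric date.
    print(separated_date_format_finder("مورخ 1402/05/17"))  # expected: True
    print(separated_date_format_finder("بند 12 ماده 4"))  # expected: False (no date-like number)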

2 get_time.py (new file)
@@ -0,0 +1,2 @@
import datetime
print(datetime.datetime.now())

265 keyword_extractor.py (new file)
@@ -0,0 +1,265 @@
import json
# from tqdm import tqdm
import time

from funcs import save_to_file_by_address, read_file_by_address#, read_from_json
# from pandas import read_excel

import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

os.environ['HF_HOME'] = "/home/admin/HFHOME"

#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model_id = "meta-llama/Llama-3.1-70B-Instruct"

# inputfile = open('./data/sections_170k_normalized.json', "r", encoding='utf-8')
# data = json.load(inputfile)
# inputfile.close()
# counter = 1
# temp_dict = []
# temp_data_text = ''
# for item in (data):

#     if counter <= 50:
#         temp_data_text = item
#         outputfile = open('./data/gemini_edited_prmpt.jsom', "a+", encoding='utf-8')
#         outputfile.write(json.dumps(temp_data_text, ensure_ascii=False))
#         outputfile.write("\n")
#         outputfile.close()
#         counter+=1
#     else:
#         break

# for item in (data):

#     if counter <= 50:
#         temp_data_text = item
#         outputfile = open('./data/gemini_edited_prmpt.txt', "a+", encoding='utf-8')
#         outputfile.write(str(temp_data_text))
#         outputfile.write("\n")
#         outputfile.close()
#         counter+=1
#     else:
#         break

# outputfile = open('./data/gemini_edited_prmpt.txt', "r", encoding='utf-8')
# ddata = outputfile.read()
# ddata = ddata.split('\n')
# # ddata = json.load(outputfile)
# outputfile.close()
# for ddd in ddata:
#     ddd = ddd.replace('\n',',')
#     json_data = json.dumps(ddd, ensure_ascii=False)
#     print()
# exit()

# use quantization to lower GPU usage
# 4 bit:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )
# 8 bit:
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
)
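
# Note added for context (not from the original commit): with load_in_8bit=True the model
# weights are stored in int8, which needs roughly half the GPU memory of bf16 weights
# (on the order of ~70 GB instead of ~140 GB for a 70B-parameter model, plus activation
# and KV-cache overhead), at some cost in speed. The commented-out 4-bit config above
# would roughly halve the weight memory again.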

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
model.generation_config.pad_token_id = tokenizer.eos_token_id #tokenizer.pad_token_id

# SYS_PROMPT = """You receive a Persian legal text and extract from it the keywords that are most important.
# And you don't need to provide explanations or additional text.
# Put each keyword on a single line."
# """# Explain your answer step by step.

SYS_PROMPT = """You are a highly accurate and detail-oriented assistant specialized in analyzing Persian legal texts."""# Explain your answer step by step.
# the assignment below overrides the English system prompt with the Persian (gemini) one
SYS_PROMPT = """شما یک دستیار متخصص در استخراج عبارات کلیدی از متون حقوقی فارسی هستید. وظیفه شما این است که از متن ارائه شده توسط کاربر، عبارات اسمی کلیدی مهم و معنادار را استخراج کنید."""# gemini prompt


def format_prompt(SENTENCE):
    # PROMPT = f"Persian legal text: {SENTENCE}."
    PROMPT = f"متن: {SENTENCE}."
    return PROMPT

def kw_count_calculator(text):
    keywords_count = (len(text) / 1000) * 15
    keywords_count = int(keywords_count)
    if keywords_count == 0:
        keywords_count = 1
    return keywords_count
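
# Worked example (added for illustration): a 2,500-character section gives
# (2500 / 1000) * 15 = 37.5, truncated to 37 requested key phrases; any text shorter
# than about 67 characters falls back to the minimum of 1.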

def generate(formatted_prompt):
    keywords_count = kw_count_calculator(formatted_prompt)
    # USER_PROMPT = f"""Extract at least {keywords_count} important and significant key phrases from the provided text. Follow these guidelines strictly:
    # 1. Print the key phrases as a numbered list in Persian, with each key phrase on a new line.
    # 2. Do not add any explanations, introductions, or conclusions to the output.
    # 3. Each key phrase must:
    #    - Be present exactly in the text.
    #    - Consist of at least two tokens (single-token key phrases are not acceptable).
    #    - Be a noun phrase (no verbs, prepositions, or single-token words).
    #    - Not end with a preposition or the letter "و".
    #    - Exclude the following terms: "ماده", "تبصره", "بند", "تاریخ".
    # 4. Include names of organizations, institutions, and legal entities as key phrases.
    # 5. Ensure the output is clean and adheres to all the above rules."""

    # USER_PROMPT = f"""Extract at least {keywords_count} important and significant key phrases from the provided text. Follow these guidelines strictly:
    # 1. Print the key phrases as a numbered list in Persian, with each key phrase on a new line.
    # 2. Do not add any explanations, introductions, or conclusions to the output.
    # 3. Each key phrase must:
    #    - Be present exactly in the text.
    #    - Consist of at least two words (single-word key phrases are not acceptable).
    #    - Be a noun phrase (no verbs, prepositions, or single-words).
    #    - Not end with a preposition or the letter "و".
    #    - Exclude the following terms: "ماده", "تبصره", "بند", "تاریخ".
    # 4. Include names of organizations, institutions, and legal entities as key phrases.
    # 5. Ensure the output is clean and adheres to all the above rules."""

    # Gemini Prompt
    USER_PROMPT = f"""از متن ارائه شده، حداقل {keywords_count} عبارت کلیدی مهم و معنادار را استخراج کنید. خروجی باید به صورت زیر باشد:
• یک لیست فارسی
• شماره ترتیبی در ابتدای هر عبارت
• هر عبارت در یک خط جداگانه *
• بدون هیچ توضیح اضافی در ابتدا یا انتهای پاسخ
موارد زیر در استخراج عبارات کلیدی الزامی است:
• بسیار مهم و حیاتی است که عبارات کلیدی باید دقیقاً در متن موجود باشند، بنابراین هرگز از خلاقیت برای ساختن کلیدواژهای که دقیقا در متن وجود ندارد، استفاده نکن.
• طول هر عبارت کلیدی حداقل دو کلمه باشد. عبارات تک-کلمهای قابل قبول نیستند.
• نام سازمانها، مؤسسات و اشخاص حقوقی باید به عنوان عبارات کلیدی در نظر گرفته شوند.
• عبارات کلیدی نباید فعل یا حرف اضافه باشند و فقط باید شامل اسمهایی باشند که به هم اضافه شدهاند (عبارت اسمی).
• عبارات کلیدی نباید به حرف اضافه یا حرف "و" ختم شوند.
• عبارات کلیدی نباید شامل کلمات "ماده"، "تبصره" و "بند" یا تاریخها باشند.
به عنوان مثال، اگر متن "قانون مدنی جمهوری اسلامی ایران در ماده ۲۲۲ در خصوص مسئولیت مدنی اشخاص حقوقی در تبصره ۱ به موضوع خسارت ناشی از تقصیر کارکنان اشاره دارد." باشد، خروجی باید به شکل زیر باشد:
1. قانون مدنی جمهوری اسلامی ایران
2. مسئولیت مدنی اشخاص حقوقی
3. خسارت ناشی از تقصیر کارکنان
اکنون متن مورد نظر را دریافت خواهید کرد.
"""
    formatted_prompt = formatted_prompt[:50000]  # to avoid GPU OOM
    messages = [
        {"role":"system","content":SYS_PROMPT},
        {"role":"user","content":USER_PROMPT},
        {"role":"user","content":formatted_prompt}
    ]
    # tell the model to generate
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(
        input_ids,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)


def get_rules(sentence):
    formatted_prompt = format_prompt(sentence)
    rules = generate(formatted_prompt).split('\n')
    result = [r.strip() for r in rules if r.strip()]
    return result
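
# Example of the intended call pattern (illustrative only; running it requires the
# quantized 70B model loaded above and enough GPU memory):
#
#   section_text = "..."  # one Persian legal section
#   keywords = get_rules(section_text)
#   # keywords is the model output split into stripped, non-empty lines, e.g.
#   # ["1. قانون مدنی جمهوری اسلامی ایران", "2. مسئولیت مدنی اشخاص حقوقی", ...]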


if __name__ == "__main__":
    start_time = time.time()
    print(f'start time: {start_time}')
    # inputfile = open('./data/main_classes_dataset_03.json', "r", encoding='utf-8')
    inputfile = open('./data/sections_170k_normalized.json', "r", encoding='utf-8')
    data = json.load(inputfile)
    inputfile.close()

    prev_ids = read_file_by_address('./data/prev_ids.txt').splitlines()
    counter = 1
    file_counter = 2
    temp_dict = []
    temp_data_text = ''
    period_sections = []
    period_ids_text = f'***** file number: {file_counter} *****\n'
    for item in (data):
        if item['id'] in prev_ids:
            continue
        content = item['content']
        try:
            item['keywords'] = get_rules(content)
        except Exception:
            print(f"section kw error : {item['id']}\n")
            counter += 1
            continue
        if counter % 1000 == 0:
            print(f"period ==>> {(time.time()-start_time)/3600} hours for {counter} sections +++ \n")

        period_ids_text += f"{item['id']} \n"
        period_sections.append(item)
        print(f"section:{counter}-id:{item['id']}")
        # temp_dict.append(item)
        if counter % 1000 == 0:
            print(f"period ==>> {(time.time()-start_time)/3600} hours for {counter} sections +++ \n")
            outputfile = open(f'./data/sections_llama70b_gemini_edited_prmpt_{str(file_counter)}.json', "a+", encoding='utf-8')
            outputfile.write(json.dumps(period_sections, ensure_ascii=False))
            outputfile.close()
            print(f"file {str(file_counter)} created in {str(time.time())} +++++++++++++++++++++++++\n ")

            file_counter += 1
            period_sections = []
            # save processed section ids for the next executions of this code
            save_to_file_by_address('./data/prev_ids.txt', period_ids_text.strip())
            period_ids_text = ''
        counter += 1

    outputfile = open(f'./data/sections_llama70b_gemini_edited_prmpt{str(file_counter)}.json', "w", encoding='utf-8')
    outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=4))
    outputfile.close()
    print(f"file {str(file_counter)} created in {str(time.time())} +++++++++++++++++++++++++ ")

    end_time = time.time()
    print(f"end_time: {end_time}")
    print(f"elapsed time: {(end_time-start_time)/3600} Hours!!! ")
    print(f"elapsed time: {(end_time-start_time)/86400} Days!!! ")
    print("end")

    exit()


"""
system prompt version 2 for test:

You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text.

user prompt version 2 for test:

Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".
"""

# Deepseek suggestion
"""
system prompt:
You are a highly accurate and detail-oriented assistant specialized in analyzing Persian legal texts.

user prompt:
Extract at least {} important and significant key phrases from the provided text. Follow these guidelines strictly:
Print the key phrases as a numbered list in Persian, with each key phrase on a new line.
Do not add any explanations, introductions, or conclusions to the output.
Each key phrase must:
Be present exactly in the text.
Consist of at least two tokens (single-token key phrases are not acceptable).
Be a noun phrase (no verbs, prepositions, or single-token words).
Not end with a preposition or the letter "و".
Exclude the following terms: "ماده", "تبصره", "بند", "تاریخ ها".
Include names of organizations, institutions, and legal entities as key phrases.
Ensure the output is clean and adheres to all the above rules.
"""

205 keyword_extractor_1060.py (new file)
@@ -0,0 +1,205 @@
# this code reads a json file which has sections and generates
# keywords for it by llama and then stores merge of old data and
# new generated keywords

import json
# from tqdm import tqdm
import time
import datetime

from funcs import save_to_file_by_address, read_file_by_address#, read_from_json
# from pandas import read_excel

import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# read sections in json file
inputfile = open('./data/sections_1060.json', "r", encoding='utf-8')
data = json.load(inputfile)
inputfile.close()

os.environ['HF_HOME'] = "/home/admin/HFHOME"

#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model_id = "meta-llama/Llama-3.1-70B-Instruct"


bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
model.generation_config.pad_token_id = tokenizer.eos_token_id #tokenizer.pad_token_id

# SYS_PROMPT = """You receive a Persian legal text and extract from it the keywords that are most important.
# And you don't need to provide explanations or additional text.
# Put each keyword on a single line."
# """# Explain your answer step by step.

SYS_PROMPT = """You are a highly accurate and detail-oriented assistant specialized in analyzing Persian legal texts."""# Explain your answer step by step.
SYS_PROMPT = """شما یک دستیار متخصص در استخراج عبارات کلیدی از متون حقوقی فارسی هستید. وظیفه شما این است که از متن ارائه شده توسط کاربر، عبارات اسمی کلیدی مهم و معنادار را استخراج کنید."""# gemini prompt


def format_prompt(SENTENCE):
    # PROMPT = f"Persian legal text: {SENTENCE}."
    PROMPT = f"متن: {SENTENCE}."
    return PROMPT

def kw_count_calculator(text):
    keywords_count = (len(text) / 1000) * 15
    keywords_count = int(keywords_count)
    if keywords_count == 0:
        keywords_count = 1
    return keywords_count

def generate(formatted_prompt):
    keywords_count = kw_count_calculator(formatted_prompt)

    # Gemini Prompt
    USER_PROMPT = f"""از متن ارائه شده، حداقل {keywords_count} عبارت کلیدی مهم و معنادار را استخراج کنید. خروجی باید به صورت زیر باشد:
• یک لیست فارسی
• شماره ترتیبی در ابتدای هر عبارت
• هر عبارت در یک خط جداگانه *
• بدون هیچ توضیح اضافی در ابتدا یا انتهای پاسخ
موارد زیر در استخراج عبارات کلیدی الزامی است:
• بسیار مهم و حیاتی است که عبارات کلیدی باید دقیقاً در متن موجود باشند، بنابراین هرگز از خلاقیت برای ساختن کلیدواژهای که دقیقا در متن وجود ندارد، استفاده نکن.
• طول هر عبارت کلیدی حداقل دو کلمه باشد. عبارات تک-کلمهای قابل قبول نیستند.
• نام سازمانها، مؤسسات و اشخاص حقوقی باید به عنوان عبارات کلیدی در نظر گرفته شوند.
• عبارات کلیدی نباید فعل یا حرف اضافه باشند و فقط باید شامل اسمهایی باشند که به هم اضافه شدهاند (عبارت اسمی).
• عبارات کلیدی نباید به حرف اضافه یا حرف "و" ختم شوند.
• عبارات کلیدی نباید شامل کلمات "ماده"، "تبصره" و "بند" یا تاریخها باشند.
به عنوان مثال، اگر متن "قانون مدنی جمهوری اسلامی ایران در ماده ۲۲۲ در خصوص مسئولیت مدنی اشخاص حقوقی در تبصره ۱ به موضوع خسارت ناشی از تقصیر کارکنان اشاره دارد." باشد، خروجی باید به شکل زیر باشد:
1. قانون مدنی جمهوری اسلامی ایران
2. مسئولیت مدنی اشخاص حقوقی
3. خسارت ناشی از تقصیر کارکنان
اکنون متن مورد نظر را دریافت خواهید کرد.
"""
    formatted_prompt = formatted_prompt[:50000]  # to avoid GPU OOM
    messages = [
        {"role":"system","content":SYS_PROMPT},
        {"role":"user","content":USER_PROMPT},
        {"role":"user","content":formatted_prompt}
    ]
    # tell the model to generate
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(
        input_ids,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)


def get_rules(sentence):
    formatted_prompt = format_prompt(sentence)
    rules = generate(formatted_prompt).split('\n')
    result = [r.strip() for r in rules if r.strip()]
    return result


if __name__ == "__main__":
    start_time = datetime.datetime.now()
    print(f'start time: {start_time}')


    prev_ids = read_file_by_address('./data/prev_ids_1060.txt').splitlines()
    counter = 1
    file_counter = 1
    temp_dict = []
    temp_data_text = ''
    period_sections = []
    period_ids_text = f'***** file number: {file_counter} *****\n'
    for subject, sections in data.items():
        for item in sections:
            if item['id'] in prev_ids:
                continue
            content = item['content']
            try:
                item['keywords'] = get_rules(content)
            except Exception:
                print(f"section kw error : {item['id']}\n")
                counter += 1
                continue
            if counter % 1200 == 0:
                print(f"period ==>> {(datetime.datetime.now()-start_time).total_seconds()/3600} hours for {counter} sections +++ \n")

            period_ids_text += f"{item['id']} \n"
            period_sections.append(item)
            print(f"section:{counter}-id:{item['id']}")
            # temp_dict.append(item)
            if counter % 1200 == 0:
                print(f"period ==>> {(datetime.datetime.now()-start_time).total_seconds()/3600} hours for {counter} sections +++ \n")
                outputfile = open(f'./data/sections_llama70b_1060_{str(file_counter)}.json', "a+", encoding='utf-8')
                outputfile.write(json.dumps(data, ensure_ascii=False))
                outputfile.close()
                print(f"file {str(file_counter)} created in {str(datetime.datetime.now())} +++++++++++++++++++++++++\n ")

                file_counter += 1
                period_sections = []
                # save processed section ids for the next executions of this code
                save_to_file_by_address('./data/prev_ids_1060.txt', period_ids_text.strip())
                period_ids_text = '\n'
            counter += 1

    outputfile = open(f'./data/sections_llama70b_1060_{str(file_counter)}.json', "w", encoding='utf-8')
    outputfile.write(json.dumps(data, ensure_ascii=False, indent=4))
    outputfile.close()
    print(f"file {str(file_counter)} created in {str(datetime.datetime.now())} +++++++++++++++++++++++++ ")

    end_time = datetime.datetime.now()
    print(f"end_time: {end_time}")
    print(f"elapsed time: {(end_time-start_time).total_seconds()/3600} Hours!!! ")
    print(f"elapsed time: {(end_time-start_time).total_seconds()/86400} Days!!! ")
    print("end")

    exit()


"""
system prompt version 2 for test:

You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text.

user prompt version 2 for test:

Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".
"""

# Deepseek suggestion
"""
system prompt:
You are a highly accurate and detail-oriented assistant specialized in analyzing Persian legal texts.

user prompt:
Extract at least {} important and significant key phrases from the provided text. Follow these guidelines strictly:
Print the key phrases as a numbered list in Persian, with each key phrase on a new line.
Do not add any explanations, introductions, or conclusions to the output.
Each key phrase must:
Be present exactly in the text.
Consist of at least two tokens (single-token key phrases are not acceptable).
Be a noun phrase (no verbs, prepositions, or single-token words).
Not end with a preposition or the letter "و".
Exclude the following terms: "ماده", "تبصره", "بند", "تاریخ ها".
Include names of organizations, institutions, and legal entities as key phrases.
Ensure the output is clean and adheres to all the above rules.
"""

198 keyword_extractor_3800.py (new file)
@@ -0,0 +1,198 @@
import json
# from tqdm import tqdm
import time
import datetime

from funcs import save_to_file_by_address, read_file_by_address#, read_from_json
# from pandas import read_excel

import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

os.environ['HF_HOME'] = "/home/admin/HFHOME"

#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model_id = "meta-llama/Llama-3.1-70B-Instruct"


bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
model.generation_config.pad_token_id = tokenizer.eos_token_id #tokenizer.pad_token_id

# SYS_PROMPT = """You receive a Persian legal text and extract from it the keywords that are most important.
# And you don't need to provide explanations or additional text.
# Put each keyword on a single line."
# """# Explain your answer step by step.

SYS_PROMPT = """You are a highly accurate and detail-oriented assistant specialized in analyzing Persian legal texts."""# Explain your answer step by step.
SYS_PROMPT = """شما یک دستیار متخصص در استخراج عبارات کلیدی از متون حقوقی فارسی هستید. وظیفه شما این است که از متن ارائه شده توسط کاربر، عبارات اسمی کلیدی مهم و معنادار را استخراج کنید."""# gemini prompt


def format_prompt(SENTENCE):
    # PROMPT = f"Persian legal text: {SENTENCE}."
    PROMPT = f"متن: {SENTENCE}."
    return PROMPT

def kw_count_calculator(text):
    keywords_count = (len(text) / 1000) * 15
    keywords_count = int(keywords_count)
    if keywords_count == 0:
        keywords_count = 1
    return keywords_count

def generate(formatted_prompt):
    keywords_count = kw_count_calculator(formatted_prompt)

    # Gemini Prompt
    USER_PROMPT = f"""از متن ارائه شده، حداقل {keywords_count} عبارت کلیدی مهم و معنادار را استخراج کنید. خروجی باید به صورت زیر باشد:
• یک لیست فارسی
• شماره ترتیبی در ابتدای هر عبارت
• هر عبارت در یک خط جداگانه *
• بدون هیچ توضیح اضافی در ابتدا یا انتهای پاسخ
موارد زیر در استخراج عبارات کلیدی الزامی است:
• بسیار مهم و حیاتی است که عبارات کلیدی باید دقیقاً در متن موجود باشند، بنابراین هرگز از خلاقیت برای ساختن کلیدواژهای که دقیقا در متن وجود ندارد، استفاده نکن.
• طول هر عبارت کلیدی حداقل دو کلمه باشد. عبارات تک-کلمهای قابل قبول نیستند.
• نام سازمانها، مؤسسات و اشخاص حقوقی باید به عنوان عبارات کلیدی در نظر گرفته شوند.
• عبارات کلیدی نباید فعل یا حرف اضافه باشند و فقط باید شامل اسمهایی باشند که به هم اضافه شدهاند (عبارت اسمی).
• عبارات کلیدی نباید به حرف اضافه یا حرف "و" ختم شوند.
• عبارات کلیدی نباید شامل کلمات "ماده"، "تبصره" و "بند" یا تاریخها باشند.
به عنوان مثال، اگر متن "قانون مدنی جمهوری اسلامی ایران در ماده ۲۲۲ در خصوص مسئولیت مدنی اشخاص حقوقی در تبصره ۱ به موضوع خسارت ناشی از تقصیر کارکنان اشاره دارد." باشد، خروجی باید به شکل زیر باشد:
1. قانون مدنی جمهوری اسلامی ایران
2. مسئولیت مدنی اشخاص حقوقی
3. خسارت ناشی از تقصیر کارکنان
اکنون متن مورد نظر را دریافت خواهید کرد.
"""
    formatted_prompt = formatted_prompt[:50000]  # to avoid GPU OOM
    messages = [
        {"role":"system","content":SYS_PROMPT},
        {"role":"user","content":USER_PROMPT},
        {"role":"user","content":formatted_prompt}
    ]
    # tell the model to generate
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(
        input_ids,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)


def get_rules(sentence):
    formatted_prompt = format_prompt(sentence)
    rules = generate(formatted_prompt).split('\n')
    result = [r.strip() for r in rules if r.strip()]
    return result


if __name__ == "__main__":
    start_time = datetime.datetime.now()
    print(f'start time: {start_time}')
    # inputfile = open('./data/main_classes_dataset_03.json', "r", encoding='utf-8')
    inputfile = open('./data/new_3800_sections.json', "r", encoding='utf-8')
    data = json.load(inputfile)
    inputfile.close()

    prev_ids = read_file_by_address('./data/prev_ids_3800.txt').splitlines()
    counter = 1
    file_counter = 2
    temp_dict = []
    temp_data_text = ''
    period_sections = []
    period_ids_text = f'***** file number: {file_counter} *****\n'
    for item in (data):
        if item['id'] in prev_ids:
            continue
        content = item['content']
        try:
            item['keywords'] = get_rules(content)
        except Exception:
            print(f"section kw error : {item['id']}\n")
            counter += 1
            continue
        if counter % 4000 == 0:
            print(f"period ==>> {(datetime.datetime.now()-start_time).total_seconds()/3600} hours for {counter} sections +++ \n")

        period_ids_text += f"{item['id']} \n"
        period_sections.append(item)
        print(f"section:{counter}-id:{item['id']}")
        # temp_dict.append(item)
        if counter % 1000 == 0:
            print(f"period ==>> {(datetime.datetime.now()-start_time).total_seconds()/3600} hours for {counter} sections +++ \n")
            outputfile = open(f'./data/sections_llama70b_3800_{str(file_counter)}.json', "a+", encoding='utf-8')
            outputfile.write(json.dumps(period_sections, ensure_ascii=False))
            outputfile.close()
            print(f"file {str(file_counter)} created in {str(datetime.datetime.now())} +++++++++++++++++++++++++\n ")

            file_counter += 1
            period_sections = []
            # save processed section ids for the next executions of this code
            save_to_file_by_address('./data/prev_ids_3800.txt', period_ids_text.strip())
            period_ids_text = ''
        counter += 1

    outputfile = open(f'./data/sections_llama70b_3800_{str(file_counter)}.json', "w", encoding='utf-8')
    outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=4))
    outputfile.close()
    print(f"file {str(file_counter)} created in {str(datetime.datetime.now())} +++++++++++++++++++++++++ ")

    end_time = datetime.datetime.now()
    print(f"end_time: {end_time}")
    print(f"elapsed time: {(end_time-start_time).total_seconds()/3600} Hours!!! ")
    print(f"elapsed time: {(end_time-start_time).total_seconds()/86400} Days!!! ")
    print("end")

    exit()


"""
system prompt version 2 for test:

You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text.

user prompt version 2 for test:

Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".
"""

# Deepseek suggestion
"""
system prompt:
You are a highly accurate and detail-oriented assistant specialized in analyzing Persian legal texts.

user prompt:
Extract at least {} important and significant key phrases from the provided text. Follow these guidelines strictly:
Print the key phrases as a numbered list in Persian, with each key phrase on a new line.
Do not add any explanations, introductions, or conclusions to the output.
Each key phrase must:
Be present exactly in the text.
Consist of at least two tokens (single-token key phrases are not acceptable).
Be a noun phrase (no verbs, prepositions, or single-token words).
Not end with a preposition or the letter "و".
Exclude the following terms: "ماده", "تبصره", "بند", "تاریخ ها".
Include names of organizations, institutions, and legal entities as key phrases.
Ensure the output is clean and adheres to all the above rules.
"""

233 llama_givechi_ttl.py (new file)
@@ -0,0 +1,233 @@
"""
this code iterates over the files in the working directory (intended to be
VerbNet-derived TTL files), sends each one to Llama with the system prompt below,
which asks for Persian rdfs:label translations of the translatable verbs, and
stores all prompt results in ./data/result_ttl_prompt.json
"""

import json
# from tqdm import tqdm
import time
import datetime

from funcs import save_to_file_by_address, read_file_by_address#, read_from_json
# from pandas import read_excel

import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# read sections in json file (note: loaded here but not used below)
inputfile = open('./data/sections_1060.json', "r", encoding='utf-8')
data = json.load(inputfile)
inputfile.close()

os.environ['HF_HOME'] = "/home/admin/HFHOME"

#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model_id = "meta-llama/Llama-3.1-70B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
model.generation_config.pad_token_id = tokenizer.eos_token_id #tokenizer.pad_token_id

# SYS_PROMPT = """You receive a Persian legal text and extract from it the keywords that are most important.
# And you don't need to provide explanations or additional text.
# Put each keyword on a single line."
# """# Explain your answer step by step.

SYS_PROMPT = """Purpose
The purpose of this prompt is to identify translatable verbs in TTL files derived from VerbNet and add Persian labels to them. This should be done taking into account the ontology structure and semantic constraints from VerbNet.


## Input
A TTL file that contains:
- Definition of verb classes in English
- Hierarchical relationships (by rdfs:subClassOf)
- Semantic constraints (by owl:Restriction)
- Prefixes and base namespaces

## Analysis and translation instructions
1. Check the basic structure of the file:
- Identify the main class and subclasses
- Check the constraints, such as EventHasAgent, EventHasMaterial.

2. Verb analysis:
- Identify verbs that do not have a number in their identifier.
- Remove superclasses (such as build-26.1) from the translation list
- Check the semantic relationship for each verb and pay attention to subtle semantic differences

## The main rule of multiple equivalent verbs
Some English verbs have multiple valid translation to Persian. For these cases, you should:
- Create a separate triple for each Persian equivalent with rdfs:label, following turtle syntax.

For example, if the verb cut has three equivalents:
go:cut rdfs:label "بريدن"@fa ,
"برش دادن"@fa ,
"قطع کردن"@fa .

## Translation instructions
1. Identify verbs: Only translate verbs without numbers in the URI
2. Find equivalents:
- Identify all valid Persian translation.
- Each translation should be recorded separately.

3. Output writing:
- Fixed format for a triple: URI + rdfs:label + equivalent in a quote + @fa .

## some examples
For the verb arrange which has three equivalents:
go:arrange rdfs:label "چیدن"@fa,
"مرتب کردن"@fa,
"تنظیم کردن"@fa.

For the verb build which has two equivalents:
go:build rdfs:label "ساختن"@fa .
go:build rdfs:label "بنا کردن"@fa .

## Final output
The output should only contain translation lines, contain no triple from the original file.

## Important Note
Note that your answer should only be like an example and not have any explanation.
"""# givechi prompt


def format_prompt(SENTENCE):
    # PROMPT = f"Persian legal text: {SENTENCE}."
    PROMPT = f"متن: {SENTENCE}."
    return PROMPT

def kw_count_calculator(text):
    keywords_count = (len(text) / 1000) * 15
    keywords_count = int(keywords_count)
    if keywords_count == 0:
        keywords_count = 1
    return keywords_count

def generate(formatted_prompt):
    keywords_count = kw_count_calculator(formatted_prompt)

    # Gemini Prompt
    formatted_prompt = formatted_prompt[:50000]  # to avoid GPU OOM
    messages = [
        {"role":"system","content":SYS_PROMPT},
        {"role":"user","content":formatted_prompt}
    ]
    # tell the model to generate
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(
        input_ids,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)


def get_rules(sentence):
    formatted_prompt = format_prompt(sentence)
    rules = generate(formatted_prompt).split('\n')
    result = [r.strip() for r in rules if r.strip()]
    return result


if __name__ == "__main__":
    start_time = datetime.datetime.now()
    print(f'start time: {start_time}')

    # Get the list of all files and directories
    path = "."
    dir_list = os.listdir(path)
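    # Note (added suggestion, not in the original script): os.listdir(".") returns every
    # entry in the working directory, including this script and any sub-directories, and
    # each entry is sent to the model below. If only Turtle files are wanted, a filter
    # such as the following could be applied:
    # dir_list = [f for f in dir_list if f.endswith('.ttl') and os.path.isfile(f)]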

    counter = 1
    file_counter = 1
    temp_dict = []
    temp_data_text = ''
    period_sections = []
    period_ids_text = f'***** file number: {file_counter} *****\n'
    final_results = []
    for ttl_file_address in dir_list:
        ttl_content = read_file_by_address(ttl_file_address)

        try:
            prompt_result = get_rules(ttl_content)
        except Exception:
            print(f"ttl_error: {ttl_file_address}\n")
            counter += 1
            continue

        final_results.append({
            "ttl_file_address": ttl_file_address,
            "ttl_file_content": ttl_content,
            "prompt_result": prompt_result
        })
        print(f"file: {counter} - {ttl_file_address}")
        # temp_dict.append(item)

        file_counter += 1
        # save processed section ids for the next executions of this code
        # save_to_file_by_address('./data/prev_ids_ttl.txt', period_ids_text.strip())
        counter += 1

    outputfile = open(f'./data/result_ttl_prompt.json', "w", encoding='utf-8')
    outputfile.write(json.dumps(final_results, ensure_ascii=False, indent=4))
    outputfile.close()

    end_time = datetime.datetime.now()
    print(f"end_time: {end_time}")
    print(f"elapsed time: {(end_time-start_time).total_seconds()/3600} Hours!!! ")
    print(f"elapsed time: {(end_time-start_time).total_seconds()/86400} Days!!! ")
    print("end")

    exit()


"""
system prompt version 2 for test:

You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text.

user prompt version 2 for test:

Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".
"""

# Deepseek suggestion
"""
system prompt:
You are a highly accurate and detail-oriented assistant specialized in analyzing Persian legal texts.

user prompt:
Extract at least {} important and significant key phrases from the provided text. Follow these guidelines strictly:
Print the key phrases as a numbered list in Persian, with each key phrase on a new line.
Do not add any explanations, introductions, or conclusions to the output.
Each key phrase must:
Be present exactly in the text.
Consist of at least two tokens (single-token key phrases are not acceptable).
Be a noun phrase (no verbs, prepositions, or single-token words).
Not end with a preposition or the letter "و".
Exclude the following terms: "ماده", "تبصره", "بند", "تاریخ ها".
Include names of organizations, institutions, and legal entities as key phrases.
Ensure the output is clean and adheres to all the above rules.
"""

207 ner_extractor_3800.py (new file)
@@ -0,0 +1,207 @@
import json
# from tqdm import tqdm
import time
import datetime

from funcs import save_to_file_by_address, read_file_by_address#, read_from_json
# from pandas import read_excel

import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

os.environ['HF_HOME'] = "/home/admin/HFHOME"

#model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model_id = "meta-llama/Llama-3.1-70B-Instruct"


bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
model.generation_config.pad_token_id = tokenizer.eos_token_id #tokenizer.pad_token_id

# SYS_PROMPT = """You receive a Persian legal text and extract from it the keywords that are most important.
# And you don't need to provide explanations or additional text.
# Put each keyword on a single line."
# """# Explain your answer step by step.

SYS_PROMPT = """شما یک مدل زبانی هوش مصنوعی هستید که برای استخراج موجودیتهای نامدار (NER) از متون طراحی شدهاید. وظیفه شما استخراج دقیق موجودیتهای مشخصشده از متن ورودی است، بدون تولید یا افزودن هیچ اطلاعاتی خارج از متن اصلی. شما تنها اطلاعاتی را که مستقیماً از متن استخراج شده است، ارائه میدهید. موجودیتهای تکراری را تنها یک بار ذکر میکنید. هر موجودیت باید در فرمت زیر ارائه شود:

1. هر موجودیت یک خط جدید باشد.
2. نوع موجودیت، مقدار آن، و جایگاه توکنهای شروع و پایان آن در متن مشخص شود.
3. در هیچ کدام از موجودیت های نامدار، از استنباط پرهیز کن و دقیقا بر کلماتی که در متن وجود دارد تمرکز کن
4. انواع موجودیت های نامدار مورد نظر:
- `ref`: شامل عناوین دقیق قوانین در متن.
- `h_ref`: عباراتی مرتبط با عناوین قوانین مانند "ماده"، "تبصره"، "بند"، "این قانون"، "قانون مذکور"، "قانون یادشده"، و ...
- `per`: نام اشخاص حقیقی که دقیقا در متن ذکر شده باشد.
- `org`: سازمانها، وزارتخانهها، شرکتها، تشکیلات نظامی و هر مجموعه حقوقی و ساختار مردم نهاد و NGO.
- `loc`: مکانها، شهرها، کشورها، و مناطق جغرافیایی.
- `event`: رویدادهای رسمی و تقویمی.
- `fac`: امکانات و تاسیسات و زیرساخت ها.
- `date`: انواع فرمت های تاریخ به صورت عددی یا حروفی. دقت شود که اعداد با تاریخ اشتباه نشود
- `sub`: موضوع اصلی متن که دقیقاً در متن ذکر شده است.

هیچ توضیح اضافی در پاسخ وجود نداشته باشد. تنها موجودیتهای استخراجشده را در فرمت زیر ارائه کن.
فرمت خروجی: لیستی از موجودیت ها به صورت زیر:
[{'type':'org', 'value':'دیوان محاسبات کشور', 'token_start':'5', 'token_end':'8'}, {'type':'sub', 'value':'حقوق بازنشستگان لشکری', 'token_start':'27', 'token_end':'30'}]
"""# gpt prompt


def format_prompt(SENTENCE):
    # PROMPT = f"Persian legal text: {SENTENCE}."
    PROMPT = f"متن: {SENTENCE}"
    return PROMPT

def kw_count_calculator(text):
    keywords_count = (len(text) / 1000) * 15
    keywords_count = int(keywords_count)
    if keywords_count == 0:
        keywords_count = 1
    return keywords_count

def generate(input_text):

    USER_PROMPT = f"""متن زیر را پردازش کن و موجودیتهای نامدار را طبق دستورالعمل استخراج کن:

"""
    formatted_prompt = input_text[:50000]  # to avoid GPU OOM
    messages = [
        {"role":"system","content":SYS_PROMPT},
        {"role":"user","content":USER_PROMPT},
        {"role":"user","content":formatted_prompt}
    ]
    # tell the model to generate
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(
        input_ids,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)


def do_prompt(sentence):
    formatted_prompt = format_prompt(sentence)
    results = generate(formatted_prompt).split('\n')
    result = [r.strip() for r in results if r.strip()]
    return result


def get_tehran_time():
    # the server clock is assumed to be UTC; Tehran is UTC+03:30
    return datetime.datetime.now() + datetime.timedelta(hours=3, minutes=30)
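
# Added sketch (not in the original commit): the system prompt asks the model for a
# Python-literal list of dicts, e.g.
# [{'type':'org', 'value':'دیوان محاسبات کشور', 'token_start':'5', 'token_end':'8'}],
# so a returned line could be parsed into structured data roughly as below, assuming the
# model followed the format. The helper name parse_ner_line is illustrative and is not
# used elsewhere in this script.
import ast

def parse_ner_line(line):
    try:
        parsed = ast.literal_eval(line)
        return parsed if isinstance(parsed, list) else []
    except (ValueError, SyntaxError):
        # the model's output was not a clean Python literal
        return []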


if __name__ == "__main__":
    start_time = get_tehran_time()
    print(f'start time: {start_time}')
    # inputfile = open('./data/main_classes_dataset_03.json', "r", encoding='utf-8')
    inputfile = open('./data/new_3800_sections.json', "r", encoding='utf-8')
    data = json.load(inputfile)
    inputfile.close()

    prev_ids = read_file_by_address('./data/prev_ner_ids_3800.txt').splitlines()
    counter = 1
    file_counter = 1
    temp_dict = []
    temp_data_text = ''
    period_sections = []
    period_ids_text = f'***** file number: {file_counter} *****\n'
    for item in (data):
        if item['id'] in prev_ids:
            continue
        content = item['content']
        try:
            item['ners_prompt'] = do_prompt(content)
        except Exception:
            print(f"section ner error : {item['id']}\n")
            counter += 1
            continue

        period_ids_text += f"{item['id']} \n"
        period_sections.append(item)
        temp_dict.append(item)
        print(f"section:{counter}-id:{item['id']}")
        # temp_dict.append(item)
        if counter % 1000 == 0:
            print(f"period ==>> {(get_tehran_time()-start_time).total_seconds()/3600} hours for {counter} sections +++ \n")
            outputfile = open(f'./data/sections_ner_llama70b_3800_2_{str(file_counter)}.json', "a+", encoding='utf-8')
            outputfile.write(json.dumps(period_sections, ensure_ascii=False))
            outputfile.close()
            print(f"file {str(file_counter)} created in {str(get_tehran_time())} +++++++++++++++\n ")
            # temp_dict.append(item)

            file_counter += 1
            period_sections = []
            # save processed section ids for the next executions of this code
            save_to_file_by_address('./data/prev_ner_ids_3800.txt', period_ids_text.strip())
            period_ids_text = ''
        if counter == 10:
            test_file_name = './data/sections_ner_llama70b_3800_test2_.json'
            outputfile = open(test_file_name, "a+", encoding='utf-8')
            outputfile.write(json.dumps(temp_dict, ensure_ascii=False))
            outputfile.close()
            print(f'test file {test_file_name} created ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ')
        counter += 1

    outputfile = open(f'./data/sections_ner_llama70b_3800_2_{str(file_counter)}.json', "w", encoding='utf-8')
    outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=4))
    outputfile.close()
    print(f"file {str(file_counter)} created in {str(get_tehran_time())} +++++++++++++++++++++++++ ")

    end_time = get_tehran_time()
    print(f"end_time: {end_time}")
    print(f"elapsed time: {(end_time-start_time).total_seconds()/3600} Hours!!! ")
    print(f"elapsed time: {(end_time-start_time).total_seconds()/86400} Days!!! ")
    print("end")

    exit()


"""
system prompt version 2 for test:

You are a lawyer and you must be able to explain legal texts without changing technical terms in a way that non-lawyers can understand the meaning of the text.

user prompt version 2 for test:

Extract at least {} important and significant key phrases from the "text" and print the key phrases in the form of a list in Persian and put each key phrase on a new line and do not add any explanation at the beginning or end of the answer.
Each key phrase has a sequential number at the beginning. The key phrases must be present exactly in the text. It is very important and essential that the length of each key phrase has at least two tokens and a single-token key phrase is not acceptable. I emphasize that no key phrase should have only one token. The names of organizations, institutions and legal entities must be considered as key phrases. No key phrase should be a verb or a preposition and should only include nouns that are added together. No key phrase should end with a preposition or the letter "و". It is essential that key phrases do not include "ماده", "تبصره", "بند" or "تاریخ ها".
"""

# Deepseek suggestion
"""
system prompt:
You are a highly accurate and detail-oriented assistant specialized in analyzing Persian legal texts.

user prompt:
Extract at least {} important and significant key phrases from the provided text. Follow these guidelines strictly:
Print the key phrases as a numbered list in Persian, with each key phrase on a new line.
Do not add any explanations, introductions, or conclusions to the output.
Each key phrase must:
Be present exactly in the text.
Consist of at least two tokens (single-token key phrases are not acceptable).
Be a noun phrase (no verbs, prepositions, or single-token words).
Not end with a preposition or the letter "و".
Exclude the following terms: "ماده", "تبصره", "بند", "تاریخ ها".
Include names of organizations, institutions, and legal entities as key phrases.
Ensure the output is clean and adheres to all the above rules.
"""

39 new_3800_sections_convertor.py (new file)
@@ -0,0 +1,39 @@
import json
# from tqdm import tqdm
import time

from funcs import save_to_file_by_address, read_file_by_address, read_from_json, write_to_json
# from pandas import read_excel

old_3800_sections = read_from_json('./data/sections_3_8K.json')

all_sections = read_from_json('./data/sections_170k_normalized.json')

new_3800_sections = []
x = 1
for section in old_3800_sections:
    print(f'3.8K data: {x}')
    x += 1
    section_id = section['id']
    y = 1
    for item in all_sections:

        if section_id == item['id']:
            new_3800_sections.append({
                'id': section_id,
                'content': item['content'],
                'topics': section['topics']
            })
            print(f'large_data: {y}')
            print('***********************************')
            break
        y += 1

    print('------------------------')


write_to_json(new_3800_sections, './data/new_3800_sections.json')
print('finished!')
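
# Added note (illustrative, not part of the original script): the nested loop above is
# O(len(old_3800_sections) * len(all_sections)). Building a dict keyed by id makes each
# lookup O(1); the name content_by_id below is illustrative.
# content_by_id = {item['id']: item['content'] for item in all_sections}
# new_3800_sections = [
#     {'id': s['id'], 'content': content_by_id[s['id']], 'topics': s['topics']}
#     for s in old_3800_sections if s['id'] in content_by_id
# ]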