دیتاست کلاسیفیکیشن
This commit is contained in:
parent
889cd4ed07
commit
420979be7d
|
@ -1,3 +1,6 @@
|
|||
"""
|
||||
This file works with the Hazm normalizer.
|
||||
"""
|
||||
import json
|
||||
|
||||
with open('./data/classes51.txt', 'r') as file:
|
||||
|
@ -8,4 +11,127 @@ with open('./data/classification_ds.json', 'r') as file:
|
|||
|
||||
# send content of some sections and classes to llama chat
|
||||
# and ask about the best class
|
||||
|
||||
|
||||
|
||||
from html import escape
|
||||
from lxml import etree
|
||||
from datetime import datetime
|
||||
from elasticsearch import Elasticsearch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
|
||||
from threading import Thread
|
||||
import torch
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import concurrent
|
||||
import threading
|
||||
import json
|
||||
import os.path
|
||||
import os
|
||||
import normalizer
|
||||
from funcs import write_to_json, read_from_json
|
||||
#lock = threading.Lock()
|
||||
#lock1 = threading.Lock()
|
||||
#from cleantext import clean
|
||||
#import re
|
||||
|
||||
|
||||
# Load the Dorna Llama-3 instruct checkpoint and its tokenizer when a CUDA
# device is available.
# NOTE(review): indentation was lost in this view; this assumes all three
# assignments belong to the `if` body. If so, `model` and `tokenizer` stay
# undefined on a machine without CUDA — confirm that is intended.
if torch.cuda.is_available():
    model_id = "PartAI/Dorna-Llama3-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

# Module-level bookkeeping values.
# NOTE(review): no visible code reads `total`, `remained` or `keywords_count`;
# they may be leftovers — verify before removing.
counter = 0
total = 0
remained = 0
# NOTE(review): this name shadows the builtin `id()` for the whole module.
id = ''
keywords_count = 15
|
||||
|
||||
|
||||
def command(text):
    """Ask the chat model for the single best-matching class of ``text``.

    Builds a three-message chat prompt (a system role, the text to classify,
    and the candidate class list taken from the module-level ``classes``),
    runs generation with the module-level ``model`` and ``tokenizer``, and
    decodes only the newly generated tokens.

    Args:
        text: Content of the section to classify.

    Returns:
        The decoded answer with surrounding whitespace removed, or ``None``
        when any step fails (the error is printed, not raised).
    """
    try:
        messages = [
            {"role": "system", "content": "تو یک وکیل حقوق دان هستی و باید بتوانی متن های قانونی و حقوقی را به خوبی تفسیر کنی. "},
            {"role": "user", "content":
                '''با توجه به 51 کلاسی که در ادامه می آید، بهترین کلاس که از نظر محتوایی با متن زیر مطابقت دارد را از میان کلاس های ارائه شده انتخاب کن. تاکید می کنم که فقط اجازه داری یک کلاس را انتخاب کنی.
نام کلاس باید دقیقا مطابق با عنوان های کلاس های 51 گانه باشد.
هیچ توضیح اضافه ای پیش یا پس از عنوان کلاس ننویس.
"متن": {}
'''.format(text)
            },
            {"role": "user", "content":
                '''کلاس های 51 گانه عبارت اند از: {}
'''.format(classes)
            },
        ]

        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # Stop on either the regular EOS token or the end-of-turn marker.
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]

        # Some tokenizers define no pad token; fall back to EOS so the
        # generation config never carries pad_token_id=None.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id
        model.generation_config.pad_token_id = pad_token_id

        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.1,
            top_p=0.85,
        )

        # Drop the prompt tokens; keep only what the model generated.
        response = outputs[0][input_ids.shape[-1]:]
        result = tokenizer.decode(response, skip_special_tokens=True)

        # The prompt asks for an exact class title, so surrounding whitespace
        # in the generated answer is noise.
        return result.strip()

    except Exception as inst:
        # Best-effort: report the failure and let the caller carry on.
        print(type(inst))    # the exception type
        print(inst.args)     # arguments stored in .args
        print("Exception: " + str(inst))
        return None
|
||||
|
||||
# Index of the first section that gets classified.
counter = 1

# Sampling parameters of the run: every SAMPLE_STEP-th section is classified,
# and no index above LAST_SECTION_INDEX is considered.
SAMPLE_STEP = 500
LAST_SECTION_INDEX = 49387


def sample_indices(total, first=1, last=LAST_SECTION_INDEX, step=SAMPLE_STEP):
    """Return the section indices to classify.

    Args:
        total: Number of available sections.
        first: Index of the first sampled section.
        last: Highest index that may be sampled (inclusive).
        step: Distance between two sampled indices.

    Returns:
        A ``range`` yielding ``first, first + step, ...`` that never exceeds
        ``last`` and never reaches ``total``, so every index is valid.
    """
    return range(first, min(total, last + 1), step)


if __name__ == "__main__":
    start_time = time.time()
    classes_dict = []

    try:
        indices = sample_indices(len(sections), first=counter)
        for count, index in enumerate(indices, start=1):
            section = sections[index]
            prev_class = section['domain_name']
            content = section['content']

            new_class = command(content)
            # Progress is reported against the number of sampled sections.
            print("section " + str(count) + "/" + str(len(indices)) + " class extracting ... ")
            classes_dict.append({
                'content': content,
                'prev-class': prev_class,
                'new-class': new_class
            })
    except Exception as inst:
        print(type(inst))    # the exception type
        print(inst.args)     # arguments stored in .args

    # Persist whatever was classified, even when the loop stopped early, so a
    # late failure does not throw away the results gathered so far.
    try:
        write_to_json(classes_dict, "./data/result.json")
    except Exception as inst:
        print(type(inst))    # the exception type
        print(inst.args)     # arguments stored in .args

    end_time = time.time()
    print(end_time)
    operation_time = (int(end_time-start_time)/60)/60
    print(f"elapsed time: {operation_time} hours")
    print(" Finished!!! ")
|
||||
|
|
Loading…
Reference in New Issue
Block a user