import datetime
import json

import pandas as pd  # imported by the original script but not used below
import transformers
from transformers import AutoTokenizer, pipeline

from normalizer import cleaning            # local text-normalization helper
from elastic_helper import ElasticHelper   # local helper for iterating the exported section records

print(transformers.__version__)

# model_checkpoint = "./BERT/findtuned_classification_model_15"  # 15-epoch checkpoint
model_checkpoint = '/home/gpu/tnlp/jokar/Classifier/Models/findtuned_classification_model-15'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Sliding-window parameters (defined here but not referenced again in this script).
window_size = tokenizer.model_max_length  # 512 for BERT-style models
step_size = 350
Top_k = 10

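# --- Illustrative sketch, not part of the original pipeline -----------------
# window_size and step_size suggest a sliding-window treatment of texts longer
# than the model limit, while the loop below simply skips them. The helper
# name `split_into_windows` is hypothetical; it only shows how overlapping
# windows could be produced with this tokenizer.
def split_into_windows(text, tokenizer, window_size, step_size):
    # Tokenize without special tokens; reserve room for [CLS]/[SEP] afterwards.
    ids = tokenizer(text, add_special_tokens=False)['input_ids']
    budget = window_size - tokenizer.num_special_tokens_to_add()
    windows = []
    for start in range(0, max(len(ids), 1), step_size):
        chunk = ids[start:start + budget]
        if not chunk:
            break
        windows.append(tokenizer.decode(chunk))  # decode back to text for the pipeline
        if start + budget >= len(ids):
            break
    return windows
# -----------------------------------------------------------------------------
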
# with open('./data/errors.txt', 'r', encoding='utf-8') as input_file:
#     error_sections_id = input_file.read().splitlines()

# Iterate over the exported section records (JSON inside a zip archive).
eh_obj = ElasticHelper()
path = "/home/gpu/data_11/mj_qa_section.zip"
sections = eh_obj.iterateJsonFile(path, True)

# Text-classification pipeline built from the fine-tuned checkpoint
# (constructed here, but not called anywhere below).
classifier = pipeline("text-classification", model_checkpoint, framework="pt")

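# Illustrative usage only: the pipeline above is never invoked in this script.
# A typical call would look like the line below; whether `top_k` is accepted
# depends on the installed transformers version (older releases used
# `return_all_scores=True` instead).
# scores = classifier(some_text, top_k=Top_k)   # -> [{'label': ..., 'score': ...}, ...]
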
print(f'start: {datetime.datetime.now()}')

with open('./data/dataset_new.json', 'r', encoding='utf-8') as input_file:
    dataset = json.load(input_file)

# A set gives O(1) membership checks when filtering the Elastic sections below.
dataset_ids = set(dataset)
print(f'len(dataset_ids): {len(dataset_ids)}')
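# Inferred from the accesses below (an assumption, not a documented schema):
# `dataset` is a dict keyed by section id, where each value provides at least
# {"content": "...", "domain_id": ..., "domain_name": "..."}.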

cc_counter = 1           # not used below
test_counter = 1         # not used below
total_sections = 282671  # total number of sections, used only in the progress print
qanon_title_list = []    # not used below
new_sections_dict = {}   # not used below
failed_counter = 0       # sections skipped for exceeding the 512-token limit
new_dataset = []

for index, item in enumerate(sections):
    # if index > 500:
    #     break

    id = item['id']
    if id not in dataset_ids:
        continue

    source = item['source']

    content0 = source['content']  # raw Elastic content (unused; the cleaned dataset content is used instead)
    qanon_title = source['qanon_title']

    # 'full_path' holds the section hierarchy (segments separated by ">").
    # Reverse it and join with " از " ("of" in Persian) so the most specific
    # segment comes first, e.g. "article of chapter of law".
    full_path = source['other_info']['full_path'].split(">")
    full_path_text = ''
    for i, path_item in enumerate(reversed(full_path)):
        if i == len(full_path) - 1:
            full_path_text += f'{path_item}'
            break
        full_path_text += f'{path_item} از '
    full_path_text = full_path_text.strip()

    try:
        content = cleaning(dataset[id]['content'])

        # Prefix: "<hierarchy path> <law title> عبارت است از: " ("... is defined as: ").
        pre_content = f"{full_path_text} {cleaning(qanon_title)} عبارت است از: "
        new_content = f"{pre_content} {content}"

        # Skip samples that would not fit the model (special tokens excluded).
        if len(tokenizer(new_content)['input_ids'][1:-1]) > 512:
            failed_counter += 1
            continue

        new_dataset.append({
            "id": id,
            "domain_id": dataset[id]["domain_id"],
            "domain_name": dataset[id]['domain_name'],
            "content": new_content
        })
    except Exception as e:
        # Log the failing section id and keep going.
        with open('./data/errors_log_dataset.txt', 'a', encoding='utf-8') as output_file:
            output_file.write(id + " >> " + str(e) + "\n")
        continue

    print(f'section: {total_sections}/{id}/{index+1}', flush=True)

with open('./data/fullpath_dataset.json', 'w', encoding='utf-8') as output_file:
    json_data = json.dumps(new_dataset, indent=4, ensure_ascii=False)
    output_file.write(json_data)

print(f"failed_counter ::: {failed_counter}")
print(f'end: {datetime.datetime.now()}')
print('finished!')