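# Classify every section from mj_qa_section.zip with a fine-tuned
# text-classification pipeline and dump the predictions to JSON.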
from elastic_helper import ElasticHelper
from transformers import pipeline
from normalizer import cleaning
import transformers
import datetime
import json

eh_obj = ElasticHelper()
print(transformers.__version__)
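
# Input archive of sections and the fine-tuned model to apply;
# iterateJsonFile is expected to yield dicts with 'id' and 'source' keys.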
path = "/home/gpu/data_11/mj_qa_section.zip"
model_path = '/home/gpu/tnlp/jokar/Classifier/Models/findtuned_classification_model-15'

data = eh_obj.iterateJsonFile(path, True)

# classifier = pipeline("text-classification", "text_classification_model-finetuned-4", framework="tf")
classifier = pipeline("text-classification", model_path, framework="tf")


def get_class(sentence):
    """Normalize a sentence and return its best class plus the runners-up."""
    sentence = cleaning(sentence)
    out = classifier(sentence, top_k=4)
    result = {
        "best": out[0],
        "other": out[1:],
    }
    return result


print(f'start: {datetime.datetime.now()}')

count = 1
total = 282671  # hard-coded section count, used only in the progress message
sections_classes_list = []
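
# Classify each section in turn; log and skip any item that fails.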
for item in data:
    source = item['source']
    content = source['content']
    section_id = item['id']

    try:
        result = get_class(content)
    except Exception:
        print(f'error --> count: {count} --> id: {section_id}')
        continue

    sections_classes_list.append({
        'id': section_id,
        'best-class': result['best'],
        'other-classes': result['other'],
    })
    print(f'from {total} --> count: {count} --> id: {section_id}')
    count += 1
    # if count == 10:
    #     break
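
# Write all predictions to disk as pretty-printed, UTF-8 JSON.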
with open('./data/sections_classes.json', 'w', encoding='utf-8') as file:
    json_data = json.dumps(sections_classes_list, ensure_ascii=False, indent=4)
    file.write(json_data)

print(f'end: {datetime.datetime.now()}')
print('finished!')