# Classifier/sections_classifier.py

from elastic_helper import ElasticHelper
from transformers import pipeline
from normalizer import cleaning
import transformers
import datetime
import json
eh_obj = ElasticHelper()
print(transformers.__version__)

# Input archive of section documents and the fine-tuned text-classification model.
path = "/home/gpu/data_11/mj_qa_section.zip"
model_path = '/home/gpu/tnlp/jokar/Classifier/Models/findtuned_classification_model-15'

# Iterate over the JSON records stored inside the archive.
data = eh_obj.iterateJsonFile(path, True)

# classifier = pipeline("text-classification", "text_classification_model-finetuned-4", framework="tf")
classifier = pipeline("text-classification", model_path, framework="tf")
def get_class(sentence):
    """Clean a sentence and return the top prediction plus the remaining candidate classes."""
    sentence = cleaning(sentence)
    out = classifier(sentence, top_k=4)
    result = {
        "best": out[0],
        "other": out[1:]
    }
    return result
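
# Example (illustrative only; the actual labels depend on the fine-tuned model's label set):
#   get_class("some section text")
#   -> {'best': {'label': 'LABEL_A', 'score': 0.97},
#       'other': [{'label': 'LABEL_B', 'score': 0.02}, ...]}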
print(f'start: {datetime.datetime.now()}')

count = 1
total = 282671  # expected number of sections in the input archive
sections_classes_list = []
for item in data:
    source = item['source']
    content = source['content']
    section_id = item['id']

    try:
        result = get_class(content)
    except Exception:
        # Skip sections the classifier cannot handle (e.g. empty or malformed content).
        print(f'error --> count: {count} --> id: {section_id}')
        continue

    sections_classes_list.append({
        'id': section_id,
        'best-class': result['best'],
        'other-classes': result['other']
    })
    print(f'from {total} --> count: {count} --> id: {section_id}')
    count += 1
    # if count == 10:
    #     break
with open('./data/sections_classes.json', 'w', encoding='utf-8') as file:
    json_data = json.dumps(sections_classes_list, ensure_ascii=False, indent=4)
    file.write(json_data)
print(f'end: {datetime.datetime.now()}')
print('finished!')
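
# Illustrative only: a minimal sketch (not part of the original script) of how the saved
# predictions might be loaded downstream; the path matches the output file written above.
#
# import json
#
# with open('./data/sections_classes.json', encoding='utf-8') as f:
#     sections_classes = json.load(f)
#
# # Each entry carries the section id, the best class, and the remaining candidate classes.
# first = sections_classes[0]
# print(first['id'], first['best-class']['label'], first['best-class']['score'])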