from elastic_helper import ElasticHelper
from transformers import pipeline
from normalizer import cleaning
import transformers
import datetime
import json

eh_obj = ElasticHelper()
print(transformers.__version__)

path = "/home/gpu/data_11/mj_qa_section.zip"
model_path = '/home/gpu/tnlp/jokar/Classifier/Models/findtuned_classification_model-15'

# Iterate over the JSON records packed inside the zip file.
data = eh_obj.iterateJsonFile(path, True)

# classifier = pipeline("text-classification", "text_classification_model-finetuned-4", framework="tf")
classifier = pipeline("text-classification", model_path, framework="tf")


def get_class(sentence):
    """Clean a sentence and return its top-4 class predictions."""
    sentence = cleaning(sentence)
    out = classifier(sentence, top_k=4)
    result = {
        "best": out[0],
        "other": out[1:]
    }
    return result


print(f'start: {datetime.datetime.now()}')

count = 1
total = 282671  # expected number of sections in the input file
sections_classes_list = []

for item in data:
    source = item['source']
    content = source['content']
    section_id = item['id']

    try:
        result = get_class(content)
    except Exception:
        print(f'error --> count: {count} --> id: {section_id}')
        continue

    sections_classes_list.append({
        'id': section_id,
        'best-class': result['best'],
        'other-classes': result['other']
    })

    print(f'from {total} --> count: {count} --> id: {section_id}')
    count += 1

    # if count == 10:
    #     break

with open('./data/sections_classes.json', 'w', encoding='utf-8') as file:
    json_data = json.dumps(sections_classes_list, ensure_ascii=False, indent=4)
    file.write(json_data)

print(f'end: {datetime.datetime.now()}')
print('finished!')
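
# A minimal sketch of how the saved output could be inspected afterwards.
# This is not part of the original pipeline; the file path and the 'id' /
# 'best-class' keys are taken from the script above.
# with open('./data/sections_classes.json', encoding='utf-8') as f:
#     sections = json.load(f)
# print(len(sections), sections[0]['id'], sections[0]['best-class'])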