# Classifier/dataset_helper.py
# 2025-07-13 17:32:37 +03:30
#
# 43 lines
# 1.1 KiB
# Python

from elastic_helper import ElasticHelper
import json
from collections import Counter
# Walk every record yielded by ElasticHelper.iterateJsonFile for the archive
# at `path` and collect the label stored under source -> code-ai -> label.
eh_obj = ElasticHelper()
path = "/home/gpu/data_11/mj_qa_section.zip"
# NOTE(review): the meaning of the second positional argument (True) is
# defined by ElasticHelper.iterateJsonFile, which is not visible here.
data = eh_obj.iterateJsonFile(path, True)

classes = []  # one label per record whose label could be read
failed = 0  # number of records whose label could not be read

for i, item in enumerate(data):
    try:
        class_ = item['source']['code-ai']['label']
    except (KeyError, TypeError):
        # KeyError: one of the expected keys is missing.
        # TypeError: an intermediate value is not subscriptable (e.g. None).
        # Anything else (notably KeyboardInterrupt) is allowed to propagate
        # instead of being silently counted as a failed record.
        failed += 1
    else:
        # Progress output: 1-based position of the record just accepted.
        print(i + 1)
        classes.append(class_)
# with open('./data/alldata_classes.txt', 'a', encoding='utf-8') as file:
# file.write(str(source))
# exit(1)
# with open('./data/fullpath_dataset.json', 'r', encoding='utf-8') as file:
# data = json.load(file)
# classes = [itm['domain_name'] for itm in data]
# Tally how often each label occurs and report the labels from most to least
# frequent, both on stdout and in ./data/alldata_classes.txt.
classes_count = Counter(classes)

# most_common() with no argument returns every (label, count) pair ordered by
# descending count; labels with equal counts keep first-seen order.
sorted_elements = classes_count.most_common()

report_lines = [f'{k} -> {v}' for k, v in sorted_elements]
for line in report_lines:
    print(line)

# Build the file payload in one pass instead of repeated string concatenation.
text = ''.join(f'{line}\n' for line in report_lines)

# NOTE(review): the file is opened in append mode, so each run adds another
# copy of the report to any existing content - confirm this is intended.
# The ./data directory must already exist.
with open('./data/alldata_classes.txt', 'a', encoding='utf-8') as file:
    file.write(text)

print('*****************************************')
print(f'len(classes_count): {len(classes_count)}')
print(f'failed: {failed}')