90 lines
3.3 KiB
Python
90 lines
3.3 KiB
Python
import json
|
|
from elastic_helper import ElasticHelper
|
|
|
|
def list_classes():
    """Extract the 'code-ai' classification label for a fixed set of
    section ids and write them to ./data/classes_list.txt, one
    "<id> - <label>" entry per line.
    """
    # Ids of the sections of interest; a set gives O(1) membership tests
    # (the original used a list, O(n) per lookup).
    section_ids = {"qs2737382", "qs894826", "qs1043309", "qs894807", "qs887945", "qs1023813", "qs1988520", "qs997880", "qs919411", "qs3012247", "qs1036224", "qs1636663", "qs1704019", "qs1555153", "qs895753", "qs1024032", "qs3024418", "qs925237", "qs999331", "qs1073335", "qs2964496", "qs2652755", "qs2241523", "qs939369", "qs894037", "qs2148972", "qs885382", "qs929738", "qs886389", "qs2580013", "qs898204", "qs2305848"}

    # NOTE: ElasticHelper is already imported at module level; the original
    # re-imported it here redundantly.
    eh_obj = ElasticHelper()
    path = "/home/gpu/data_11/mj_qa_section.zip"
    all_sections = eh_obj.iterateJsonFile(path, True)

    # Collect lines and join once instead of quadratic string +=.
    lines = []
    for item in all_sections:
        section_id = item['id']  # renamed: `id` shadowed the builtin
        if section_id not in section_ids:
            continue
        source = item['source']
        try:
            label = source['code-ai']['label']
        except (KeyError, TypeError):
            # Section carries no code-ai classification; record a placeholder.
            # (Original used a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.)
            label = 'None'
        lines.append(section_id + " - " + label + "\n")

    # utf-8 for consistency with the other file handles in this module.
    with open('./data/classes_list.txt', 'w', encoding='utf-8') as file:
        file.write(''.join(lines).strip())
def classification_error_handler(
        classes_path='./data/all_sections_classes_new_140405.json',
        errors_path='./data/errors.txt',
        output_path='./data/errors_classes.txt'):
    """Cross-reference errored section ids against classification results.

    For every id listed (one per line) in *errors_path* that also appears in
    the classification results at *classes_path*, append a line
    "id: <id> -- best-class: <label>" to *output_path*.  Ids that were never
    matched are printed at the end.

    Paths default to the original hard-coded locations, so existing callers
    are unaffected.
    """
    with open(classes_path, 'r', encoding='utf-8') as _file:
        sections = json.load(_file)

    with open(errors_path, 'r', encoding='utf-8') as _file:
        errors = _file.read().splitlines()

    results = []
    for section_id, value in sections.items():  # renamed: `id` shadowed the builtin
        if section_id not in errors:
            continue
        # Remove matched ids so the leftover list shows unresolved errors.
        errors.remove(section_id)
        results.append(f'id: {section_id} -- best-class: {value["best-class"]["label"]}\n')

    # Open the output once and batch the writes, instead of re-opening the
    # file in append mode on every matched item as the original did.  The
    # `if` guard preserves the original behavior of not creating the file
    # when nothing matched.
    if results:
        with open(output_path, 'a', encoding='utf-8') as _file:
            _file.writelines(results)

    # Ids still present here were listed as errors but absent from the
    # classification results.
    print(errors)
def find_large_sections_in_classified(
        classified_path='./data/all_sections_classes_new_140405.json',
        large_path='./data/large_sections.json'):
    """Return the ids from *large_path* that are absent from the
    classification results stored at *classified_path*.

    Paths default to the original hard-coded locations, so existing callers
    are unaffected.
    """
    with open(classified_path, 'r', encoding='utf-8') as _file:
        sections = json.load(_file)

    # BUG FIX: the original built `classified_ids_set` but then tested
    # membership against the *list*, making the set dead code and every
    # lookup O(n).  Iterating a dict yields its keys, so set(sections)
    # is the set of classified ids.
    classified_ids = set(sections)

    with open(large_path, 'r', encoding='utf-8') as _file:
        large_sections = json.load(_file)

    return [item for item in large_sections if item not in classified_ids]
def classified_sections():
    """Report progress while iterating the full section dump, then return
    the ids from ./data/large_sections.json that are missing from the
    classification results.
    """
    with open('./data/all_sections_classes_new_140405.json', 'r', encoding='utf-8') as _file:
        # renamed from `classified_sections`, which shadowed this function's
        # own name inside its body
        classified = json.load(_file)

    print(len(classified))

    eh_obj = ElasticHelper()
    path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
    all_sections = eh_obj.iterateJsonFile(path, True)
    # count = 285839
    for index, item in enumerate(all_sections):
        print(index + 1)
    # NOTE(review): assumes iterateJsonFile returns a sized container, since
    # len() is taken after full iteration — confirm against ElasticHelper.
    print(len(all_sections))

    # BUG FIX: the original referenced an undefined name `sections` here
    # (a NameError at runtime — the code was copy-pasted from
    # find_large_sections_in_classified).  The classification results loaded
    # above are the intended source of classified ids; a set gives O(1)
    # membership tests.
    classified_ids = set(classified)

    with open('./data/large_sections.json', 'r', encoding='utf-8') as _file:
        large_sections = json.load(_file)

    return [item for item in large_sections if item not in classified_ids]
if __name__ == '__main__':
    # Script entry point: report how many large sections remain unclassified.
    unclassified = classified_sections()
    count = len(unclassified)
    print(count)