import json

from elastic_helper import ElasticHelper

eh_obj = ElasticHelper()
path = "/home/gpu/data_11/mj_qa_section.zip"
all_sections = eh_obj.iterateJsonFile(path, True)


def list_classes():
    # Gold/test section ids (same list as gold_test_sections below).
    sections = gold_test_sections
    classes_list = ''
    for item in all_sections:
        section_id = item['id']
        if section_id not in sections:
            continue
        source = item['source']
        try:
            class_ = source['code-ai']['label']
        except (KeyError, TypeError):
            class_ = 'None'
        classes_list += section_id + " - " + class_ + "\n"
    with open('./data/classes_list.txt', 'w') as file:
        file.write(classes_list.strip())


def classification_error_handler():
    with open('./data/all_sections_classes_new_140405.json', 'r', encoding='utf-8') as _file:
        sections = json.load(_file)
    with open('./data/errors.txt', 'r', encoding='utf-8') as _file:
        errors = _file.read().splitlines()
    for section_id, value in sections.items():
        if section_id not in errors:
            continue
        errors.remove(section_id)
        result = f'id: {section_id} -- best-class: {value["best-class"]["label"]}\n'
        with open('./data/errors_classes.txt', 'a', encoding='utf-8') as _file:
            _file.write(result)
    # Whatever remains was flagged as an error but has no classification record.
    print(errors)


def find_large_sections_in_classified():
    with open('./data/all_sections_classes_new_140405.json', 'r', encoding='utf-8') as _file:
        sections = json.load(_file)
    classified_ids_set = set(sections)
    large_not_classified = []
    with open('./data/large_sections.json', 'r', encoding='utf-8') as _file:
        large_sections = json.load(_file)
    for item in large_sections:
        if item not in classified_ids_set:
            large_not_classified.append(item)
    return large_not_classified


def classified_sections():
    with open('./data/all_sections_classes_new_140405.json', 'r', encoding='utf-8') as _file:
        classified = json.load(_file)
    print(len(classified))

    eh_obj = ElasticHelper()
    path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
    all_sections = eh_obj.iterateJsonFile(path, True)
    # count = 285839
    for index, item in enumerate(all_sections):
        print(index + 1)
    print(len(all_sections))

    classified_ids_set = set(classified)
    large_not_classified = []
    with open('./data/large_sections.json', 'r', encoding='utf-8') as _file:
        large_sections = json.load(_file)
    for item in large_sections:
        if item not in classified_ids_set:
            large_not_classified.append(item)
    return large_not_classified


gold_test_sections = ["qs2737382", "qs894826", "qs1043309", "qs894807", "qs887945",
                      "qs1023813", "qs1988520", "qs997880", "qs919411", "qs3012247",
                      "qs1036224", "qs1636663", "qs1704019", "qs1555153", "qs895753",
                      "qs1024032", "qs3024418", "qs925237", "qs999331", "qs1073335",
                      "qs2964496", "qs2652755", "qs2241523", "qs939369", "qs894037",
                      "qs2148972", "qs885382", "qs929738", "qs886389", "qs2580013",
                      "qs898204", "qs2305848"]


if __name__ == '__main__':
    with open('./data/all_sections_classes_new_140405.json', 'r', encoding='utf-8') as _file:
        sections = json.load(_file)

    # Keep only the classification records for the gold/test sections.
    gold_sections_classes = {}
    for item in sections:
        if item in gold_test_sections:
            gold_sections_classes[item] = sections[item]

    # Attach the section text from the Elastic dump to each gold record.
    for item in all_sections:
        section_id = item['id']
        if section_id not in gold_sections_classes:
            continue
        source = item['source']
        try:
            content = source['content']
        except (KeyError, TypeError):
            content = ''
        gold_sections_classes[section_id]['content'] = content
        # record = gold_sections_classes[item]
        # record['content'] = content

    gold_sections_classes_text = ''
    for index, idx in enumerate(gold_sections_classes):
        gold_sections_classes_text += (
            f'order: {index + 1}\n'
            f'id: {idx}\n'
            f'content: {gold_sections_classes[idx]["content"]}\n'
            f'best-class: {gold_sections_classes[idx]["best-class"]}\n'
            f'other-classes: {gold_sections_classes[idx]["other-classes"]}\n\n'
        )
    with open('./data/gold_sections_classes.txt', 'w') as file:
        file.write(gold_sections_classes_text.strip())

    # region large sections that were sent to the window
    faults = []
    for item in sections:
        itm = sections[item]
        try:
            best = itm['best-class']['score']
        except (KeyError, TypeError):
            continue
        if best > 1:
            print(best)
            faults.append((item, best))
    faults_text = ''
    for item in faults:
        faults_text += item[0] + '\n'
    with open('./data/large_sections.txt', 'a+') as file:
        file.write(faults_text.strip())
    # endregion

    # result = classified_sections()
    # print(len(result))
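

# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of elastic_helper: judging from how this
# script uses it, eh_obj.iterateJsonFile(path, True) is assumed to return an
# iterable of records shaped like
#     {"id": "qs...", "source": {"content": "...", "code-ai": {"label": "..."}}}
# The stub below mimics that interface for local testing; the class name and
# the assumption that the zip archive contains JSON-lines files are guesses.
class _ElasticHelperStub:
    def iterateJsonFile(self, path, compressed=True):
        import json as _json
        import zipfile

        records = []
        if not compressed:
            # Plain JSON-lines file on disk.
            with open(path, 'r', encoding='utf-8') as fh:
                return [_json.loads(line) for line in fh if line.strip()]
        # Zip archive: decode every non-empty line of every member as JSON.
        with zipfile.ZipFile(path) as archive:
            for name in archive.namelist():
                with archive.open(name) as fh:
                    for line in fh:
                        line = line.strip()
                        if line:
                            records.append(_json.loads(line))
        return records
# ---------------------------------------------------------------------------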