"""Dump every section whose id is absent from the classified-sections file.

Steps:
1. Load ``./data/sections_classes.json`` and collect the ``id`` of each entry.
2. Stream records from the zipped dump through ``ElasticHelper.iterateJsonFile``.
3. For each record whose ``id`` is not among the loaded ids, store its id,
   whitespace-separated word count and content.
4. Write the collected records to ``./data/large_sections.json``.
"""
import datetime
import json

from elastic_helper import ElasticHelper

# Hard-coded total, used only in the progress message below.
TOTAL_SECTIONS = 282671

with open('./data/sections_classes.json', 'r', encoding='utf-8') as file:
    extracted_class = json.load(file)

print(len(extracted_class))

# A set gives O(1) membership tests; the original list made every lookup in
# the loop below a linear scan over all already-classified ids.
extracted_class_ids = {item['id'] for item in extracted_class}

eh_obj = ElasticHelper()
path = "/home/gpu/data_11/mj_qa_section.zip"
# NOTE(review): the meaning of the second positional argument (True) is
# defined by ElasticHelper.iterateJsonFile — confirm against that helper.
data = eh_obj.iterateJsonFile(path, True)

print(f'start: {datetime.datetime.now()}')

large_sections = []

# NOTE(review): statement nesting was reconstructed from a single-line source;
# confirm the progress print is meant to run for every record.
for count, item in enumerate(data, start=1):
    source = item['source']
    content = source['content']
    section_id = item['id']

    if section_id not in extracted_class_ids:
        large_sections.append({
            "id": section_id,
            # Word count based on whitespace splitting.
            "len_content": len(content.split()),
            "content": content,
        })
        print(f'###### --> id: {section_id} ######')

    print(f'from {TOTAL_SECTIONS} --> count: {count} --> id: {section_id}')

    # Debug aid: uncomment to stop after the first few records.
    # if count == 10:
    #     break

with open('./data/large_sections.json', 'w', encoding='utf-8') as file:
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json_data = json.dumps(large_sections, ensure_ascii=False, indent=4)
    file.write(json_data)

print(f'end: {datetime.datetime.now()}')
print('finished!')