"""Group section records by their ``q-id`` and write the result as JSON.

Loads every section from ``./data/sections_all.json``, looks up each entry
listed in ``./data/main_sections_170k_metadata.json`` by its ``id`` and writes
a ``{q-id: [section, ...]}`` mapping to ``./data/main_qanon_170k_new.json``.
"""
import json
import time

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Stand-in used when tqdm is not installed: iterate without a bar."""
        return iterable

ALL_SECTIONS_PATH = './data/sections_all.json'
METADATA_PATH = './data/main_sections_170k_metadata.json'
OUTPUT_PATH = './data/main_qanon_170k_new.json'

# Columns copied from the full section table into each output record, in the
# order they are unpacked in ``group_sections_by_qid``.
_SECTION_COLUMNS = (
    'q-id',
    'child-order',
    'content',
    'level',
    'parent-id',
    'number-text',
)


def _load_json(path):
    """Return the parsed content of the UTF-8 JSON file at *path*."""
    with open(path, "r", encoding='utf-8') as handle:
        return json.load(handle)


def _index_sections(sections_df):
    """Map every section ``id`` to a tuple of its ``_SECTION_COLUMNS`` values.

    When an id occurs in several rows the last row wins, which matches the
    previous ``list(column).pop()`` lookup. Values are taken with
    ``list(series)`` so they stay plain Python scalars that ``json`` can
    serialise.
    """
    if sections_df.empty:
        return {}
    columns = [list(sections_df[name]) for name in _SECTION_COLUMNS]
    return dict(zip(list(sections_df['id']), zip(*columns)))


def group_sections_by_qid(metadata, sections_df):
    """Group the sections referenced by *metadata* under their ``q-id``.

    Args:
        metadata: iterable of mappings, each holding the section ``id`` to
            look up.
        sections_df: DataFrame with an ``id`` column plus the columns named
            in ``_SECTION_COLUMNS``.

    Returns:
        A dict mapping each ``q-id`` to the list of section records found for
        it, in the order the ids appear in *metadata*.

    Ids that are absent from *sections_df* are reported on stdout and
    skipped; they do not advance the progress counter.
    """
    sections_by_id = _index_sections(sections_df)
    grouped = {}
    count = 0
    for item in tqdm(metadata):
        section_id = item['id']
        try:
            (key, child_order, content, level,
             parent_id, number_text) = sections_by_id[section_id]
        except KeyError:
            # An unknown id is the one expected failure here; the old mask
            # filter never raised for it and crashed later on an empty list.
            print(f"missing section: {section_id}")
            continue
        grouped.setdefault(key, []).append({
            'section_id': section_id,
            'content': content,
            'child_order': child_order,
            'level': level,
            'number-text': number_text,
            'parent_id': parent_id,
        })
        print(f"Section ---> {count}")
        count += 1
    return grouped


def main():
    """Run the full load -> group -> write pipeline and report elapsed time."""
    print('start')
    start_time = time.time()

    all_data_df = pd.DataFrame(_load_json(ALL_SECTIONS_PATH))
    data = _load_json(METADATA_PATH)

    grouped = group_sections_by_qid(data, all_data_df)

    with open(OUTPUT_PATH, "w", encoding='utf-8') as outputfile:
        outputfile.write(json.dumps(grouped, ensure_ascii=False, indent=4))

    end_time = time.time()
    print(f"elapsed time: {end_time-start_time}")
    print("end")


if __name__ == "__main__":
    main()