# Listing metadata: 49 lines, 1.5 KiB, Python
import json
import time
from collections import defaultdict

import pandas as pd
from tqdm import tqdm

# Input and output locations used by the script.
ALL_SECTIONS_PATH = './data/sections_all.json'
MAIN_SECTIONS_PATH = './data/main_sections_170k_metadata.json'
OUTPUT_PATH = './data/main_qanon_170k_new2.json'

# Columns read from the all-sections table for every matched section, in the
# order they are unpacked inside group_sections_by_qanon().
SECTION_COLUMNS = (
    'q-id',
    'content',
    'child-order',
    'level',
    'number-text',
    'parent-id',
)


def build_section_index(sections_df):
    """Map each section id to a tuple of its SECTION_COLUMNS values.

    The index is built once so that every later lookup is a dict access
    instead of a full scan of the DataFrame.  When the same id occurs in
    several rows, the last row wins, which matches taking the last element
    of a boolean-mask selection.

    Args:
        sections_df: DataFrame holding an ``'id'`` column plus every column
            named in SECTION_COLUMNS.

    Returns:
        A dict ``{section id: (q-id, content, child-order, level,
        number-text, parent-id)}``; empty when the frame has no rows.

    Raises:
        KeyError: if a non-empty frame lacks one of the required columns.
    """
    if sections_df.empty:
        return {}
    # Iterating the columns (rather than using .loc/.iat) yields the same
    # values that ``list(column)`` would, so they stay JSON-serialisable.
    columns = [sections_df[name] for name in SECTION_COLUMNS]
    return dict(zip(sections_df['id'], zip(*columns)))


def group_sections_by_qanon(items, section_index):
    """Group the sections referenced by *items* under their ``q-id``.

    Args:
        items: iterable of dicts, each with an ``'id'`` key.
        section_index: mapping produced by build_section_index().

    Returns:
        A dict mapping each ``q-id`` to the list of section records that
        belong to it, in the order the items were visited.  Items whose id
        is absent from *section_index* are reported on stdout and skipped.
    """
    grouped = defaultdict(list)
    count = 0
    for item in items:
        section_id = item['id']
        record = section_index.get(section_id)
        if record is None:
            # A missing id used to crash later on an empty ``pop()``;
            # report it and move on instead.
            print(f"missing section: {section_id}")
            continue

        q_id, content, child_order, level, number_text, parent_id = record
        grouped[q_id].append({
            'section_id': section_id,
            'content': content,
            'child_order': child_order,
            'level': level,
            'number-text': number_text,
            'parent_id': parent_id,
        })
        print(f"Section ---> {count}")
        count += 1
    # Hand back a plain dict so callers do not get auto-created keys.
    return dict(grouped)


def main():
    """Load both JSON inputs, group the main sections and write the result."""
    print('start')
    start_time = time.time()

    with open(ALL_SECTIONS_PATH, "r", encoding='utf-8') as all_data_file:
        all_data = json.load(all_data_file)
    section_index = build_section_index(pd.DataFrame(all_data))

    with open(MAIN_SECTIONS_PATH, "r", encoding='utf-8') as inputfile:
        data = json.load(inputfile)

    grouped = group_sections_by_qanon(tqdm(data), section_index)

    with open(OUTPUT_PATH, "w", encoding='utf-8') as outputfile:
        outputfile.write(json.dumps(grouped, ensure_ascii=False, indent=4))

    end_time = time.time()
    print(f"elapsed time: {end_time-start_time}")
    print("end")


if __name__ == "__main__":
    main()