RD_relation/relation/sections2qanon.py
2025-01-22 20:51:09 +03:30

49 lines
1.5 KiB
Python

import json
from tqdm import tqdm
import pandas as pd
import time
# Columns copied from each section row, in the order they are unpacked below.
SECTION_FIELDS = ('q-id', 'child-order', 'content', 'level', 'parent-id', 'number-text')


def index_sections_by_id(sections_df):
    """Build a one-pass lookup from section id to its field values.

    Args:
        sections_df: DataFrame that has an 'id' column plus every column
            named in SECTION_FIELDS.

    Returns:
        A dict mapping each 'id' value to a list of that row's values in
        SECTION_FIELDS order. When several rows share an id, the last row
        wins (later rows overwrite earlier ones).

    Raises:
        KeyError: if 'id' or any SECTION_FIELDS column is absent.
    """
    lookup = {}
    columns = ['id', *SECTION_FIELDS]
    # name=None yields plain tuples, which is required because column names
    # such as 'q-id' are not valid namedtuple field names.
    for section_id, *values in sections_df[columns].itertuples(index=False, name=None):
        lookup[section_id] = values
    return lookup


def group_sections_by_qanon(section_lookup, items):
    """Group section records under their 'q-id'.

    Args:
        section_lookup: mapping produced by index_sections_by_id().
        items: iterable of dicts, each carrying an 'id' key.

    Returns:
        A dict mapping each 'q-id' to a list of section dicts, in the order
        the matching items were encountered. Items whose id is not in
        section_lookup are reported on stdout and skipped.
    """
    grouped = {}
    count = 0
    for item in items:
        section_id = item['id']
        record = section_lookup.get(section_id)
        if record is None:
            # Report and skip instead of crashing on an unknown id.
            print(f"missing section: {section_id}")
            continue
        key, child_order, content, level, parent_id, number_text = record
        grouped.setdefault(key, []).append({
            'section_id': section_id,
            'content': content,
            'child_order': child_order,
            'level': level,
            'number-text': number_text,
            'parent_id': parent_id,
        })
        print(f"Section ---> {count}")
        count += 1
    return grouped


def main():
    """Read both JSON inputs, group the sections by 'q-id', write the result.

    Reads './data/sections_all.json' and
    './data/main_sections_170k_metadata.json', and writes the grouped
    mapping to './data/main_qanon_170k_new2.json' as indented UTF-8 JSON.
    Prints start/end markers and the elapsed wall-clock time.
    """
    print('start')
    start_time = time.time()

    with open('./data/sections_all.json', "r", encoding='utf-8') as all_data_file:
        all_data = json.load(all_data_file)
    all_data_df = pd.DataFrame(all_data)

    with open('./data/main_sections_170k_metadata.json', "r", encoding='utf-8') as inputfile:
        data = json.load(inputfile)

    # Index once up front so each item is a dict lookup rather than a scan
    # of the whole DataFrame.
    section_lookup = index_sections_by_id(all_data_df)
    grouped = group_sections_by_qanon(section_lookup, tqdm(data))

    with open('./data/main_qanon_170k_new2.json', "w", encoding='utf-8') as outputfile:
        outputfile.write(json.dumps(grouped, ensure_ascii=False, indent=4))

    end_time = time.time()
    print(f"elapsed time: {end_time-start_time}")
    print("end")


if __name__ == "__main__":
    main()