RD_relation/relation/sections2qanon.py

49 lines
1.4 KiB
Python
Raw Normal View History

2025-01-19 15:46:21 +00:00
import json
from tqdm import tqdm
2025-01-19 16:42:36 +00:00
import pandas as pd
2025-01-19 15:46:21 +00:00
import time
# Regroup the sections listed in the metadata file by their 'q-id':
# each section's fields are looked up in the full sections dump, collected
# under the q-id they belong to, and the grouping is written out as JSON.
print('start')
start_time = time.time()

# Full sections dump: the source of every per-section field used below.
with open('./data/sections_all.json', "r", encoding='utf-8') as all_data_file:
    all_data = json.load(all_data_file)
all_data_df = pd.DataFrame(all_data)

# Metadata file: only each item's 'id' is read from it.
with open('./data/main_sections_170k_metadata.json', "r", encoding='utf-8') as inputfile:
    data = json.load(inputfile)

# Build an id -> field-values lookup in a single pass, instead of scanning the
# whole DataFrame once per metadata item. When an id occurs more than once,
# the later row overwrites the earlier one, which matches the previous
# behaviour of taking the last matching row.
field_columns = ('q-id', 'child-order', 'content', 'level', 'parent-id', 'number-text')
section_lookup = {}
if not all_data_df.empty:
    id_and_fields = zip(
        all_data_df['id'],
        *(all_data_df[column] for column in field_columns),
    )
    for row_id, *field_values in id_and_fields:
        section_lookup[row_id] = field_values

# q-id -> list of section records, in the order the metadata file lists them.
sections_by_qanon = {}
count = 0
for item in tqdm(data):
    section_id = item['id']
    field_values = section_lookup.get(section_id)
    if field_values is None:
        # The id from the metadata file has no row in the full dump:
        # report it and carry on rather than aborting the whole run.
        print(f"missing section: {section_id}")
        continue

    key, child_order, content, level, parent_id, number_text = field_values
    sections_by_qanon.setdefault(key, []).append({
        'section_id': section_id,
        'content': content,
        'child_order': child_order,
        'level': level,
        'number-text': number_text,
        'parent_id': parent_id,
    })
    # `count` only advances for sections that were actually found.
    print(f"Section ---> {count}")
    count += 1

# ensure_ascii=False keeps non-ASCII text readable in the output file.
with open('./data/main_qanon_170k_new.json', "w", encoding='utf-8') as outputfile:
    outputfile.write(json.dumps(sections_by_qanon, ensure_ascii=False, indent=4))

end_time = time.time()
print(f"elapsed time: {end_time-start_time}")
print("end")