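# Joins Persian legal-text ("qanon") fragments: a fragment whose chunk tree
# contains no verb phrase is treated as an incomplete clause and merged into
# a neighbouring fragment according to its hierarchy level.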
import json
import time

from tqdm import tqdm
from hazm import POSTagger, Chunker, word_tokenize, tree2brackets
from nltk.chunk import tree2conlltags

print('start')
start_time = time.time()

# `data` maps each document key to its ordered list of fragment items.
with open('./data/main_qanon_170k_metadata.json', 'r', encoding='utf-8') as inputfile:
    data = json.load(inputfile)
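
# Each item is expected to carry (inferred from the fields accessed below,
# not from a formal schema):
#   'content'     - the fragment text
#   'child_order' - the fragment's position among its siblings
#   'level'       - the fragment's depth in the document hierarchy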

datalength = len(data)

# Pretrained hazm models; the .model files are assumed to be present in the
# working directory (hazm distributes downloadable pretrained POS tagger and
# chunker models).
tagger = POSTagger(model='pos_tagger.model')
chunker = Chunker(model='chunker.model')

count = 0
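# First pass: mark fragments for joining; `count` tracks how many fragments
# get flagged.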
for key in tqdm(data):
    items = data[key]
    for item in items:
        content = item['content']
        child_order = item['child_order']
        level = item['level']

        # POS-tag the tokens, then shallow-parse them into a chunk tree.
        tokens = word_tokenize(content)
        tagged = tagger.tag(tokens)
        ct_tree = chunker.parse(tagged)
        # ct_tree.draw()
        chunked = tree2brackets(ct_tree)

        # Debug output for short fragments:
        # if len(tokens) < 30:
        #     print("=" * 20)
        #     print(tagged)
        #     print(chunked)
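        # tree2conlltags flattens the chunk tree into (word, pos, chunk_tag)
        # triples whose chunk tags use IOB notation, e.g. 'B-VP' / 'I-VP' /
        # 'O'; the text after the '-' names the chunk type.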
        find_VP = False
        for titem in tree2conlltags(ct_tree):
            tp = titem[2].split('-')
            if len(tp) > 1 and tp[1] == 'VP':
                find_VP = True
                break

        if not find_VP:
            # No verb phrase: the fragment is not a complete clause on its
            # own, so flag it for joining with a neighbour.
            item['join'] = True
            count += 1
        else:
            item['join'] = False
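
# Second pass: merge each joinable fragment into a neighbour. The combined
# text is generally kept on whichever item sits higher in the hierarchy
# (smaller 'level'), so fragments fold upward into their parent clause.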
ccount = 0
for key in tqdm(data):
    ccount += 1
    print('progress: ' + str((ccount / len(data)) * 100))
    print('count: ' + str(ccount) + "/" + str(len(data)))

    items = data[key]
    index = 1
    while index < len(items) - 1:
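        # Slide a three-item window over the fragment list; once an item is
        # merged, its 'join' flag is cleared so it cannot be merged twice.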
        item0 = items[index - 1]
        item1 = items[index]
        item2 = items[index + 1]
        level0 = item0['level']
        level1 = item1['level']
        level2 = item2['level']

        if item0['join']:
            item0['join'] = False
            if level0 < level1:
                item0['content'] = item0['content'] + '. ' + item1['content']
                item1['content'] = ''
            else:
                item1['content'] = item0['content'] + '. ' + item1['content']
                item0['content'] = ''

        if item1['join']:
            item1['join'] = False
            if level0 < level1:
                item0['content'] = item0['content'] + '. ' + item1['content']
                item1['content'] = ''
            elif level1 < level2:
                item1['content'] = item1['content'] + '. ' + item2['content']
                item2['content'] = ''
            else:
                item2['content'] = item1['content'] + '. ' + item2['content']
                item1['content'] = ''

        if item2['join']:
            item2['join'] = False
            if level1 < level2:
                item1['content'] = item1['content'] + '. ' + item2['content']
                item2['content'] = ''
            else:
                item2['content'] = item1['content'] + '. ' + item2['content']
                item1['content'] = ''

        index += 1

# Write the merged fragments back out; ensure_ascii=False keeps the Persian
# text readable in the output file.
with open('./data/joint_qanon_170k_metadata.json', 'w', encoding='utf-8') as outputfile:
    outputfile.write(json.dumps(data, ensure_ascii=False, indent=4))

print(len(data))
print(f'join count {count}')

end_time = time.time()
print(f"elapsed time: {end_time - start_time}")
print("end")