"""Mark entries that contain no verb phrase and merge them into a neighbour.

The input JSON maps each key to a list of item dicts that carry at least a
``'content'`` string and a ``'level'`` value.  Every item is POS-tagged and
chunked with hazm; items whose chunk tags contain no ``VP`` get
``item['join'] = True``.  A second pass slides a three-item window over each
list and concatenates the content of joinable items with a neighbour chosen by
comparing ``'level'`` values.  The result is written to a new JSON file.
"""
import json
import time

import numpy as np  # noqa: F401  (not used below; kept from the original import list)

INPUT_PATH = './data/main_qanon_170k_metadata.json'
OUTPUT_PATH = './data/joint_qanon_170k_metadata.json'

# Text placed between two pieces of content when they are merged.
SEPARATOR = '. '


def has_verb_phrase(conll_tags):
    """Return True if any chunk tag in *conll_tags* belongs to a ``VP`` chunk.

    Args:
        conll_tags: iterable of ``(word, pos_tag, chunk_tag)`` triples, the
            shape returned by ``nltk.chunk.tree2conlltags``.  Only index 2 is
            inspected.

    Returns:
        True when some chunk tag has ``VP`` as its second ``-``-separated
        field (e.g. ``B-VP``), False otherwise (including empty input).
    """
    # Slicing [1:2] yields [] for tags without a '-' (such as 'O'), so no
    # explicit length check is needed.
    return any(triple[2].split('-')[1:2] == ['VP'] for triple in conll_tags)


def mark_joinable(items, contains_verb_phrase):
    """Set ``item['join']`` on every item and count the joinable ones.

    Args:
        items: list of item dicts; each must have a ``'content'`` key.
        contains_verb_phrase: callable taking the content string and
            returning a truthy value when the text contains a verb phrase.

    Returns:
        Number of items that were marked ``join = True``.
    """
    marked = 0
    for item in items:
        joinable = not contains_verb_phrase(item['content'])
        item['join'] = joinable
        if joinable:
            marked += 1
    return marked


def fuse(first, second, keep_first):
    """Concatenate the content of two adjacent items into one of them.

    The merged text is always ``first + SEPARATOR + second``; *keep_first*
    only selects which item stores it.  The other item's content becomes
    the empty string.
    """
    merged = first['content'] + SEPARATOR + second['content']
    keeper, emptied = (first, second) if keep_first else (second, first)
    keeper['content'] = merged
    emptied['content'] = ''


def merge_joinable(items):
    """Merge joinable items of one document into their neighbours, in place.

    A window ``(previous, current, next)`` is moved over *items*.  Within a
    window the three items are examined in that order; a joinable item has
    its flag cleared and is fused with a neighbour picked by ``'level'``
    comparison.  The checks are sequential on purpose: a later check sees
    content already changed by an earlier one.

    NOTE(review): lists with fewer than three items are never visited, so
    their ``'join'`` flags stay as set and nothing is merged.  This matches
    the previous behaviour; confirm whether such documents need handling.

    Args:
        items: list of item dicts with ``'content'``, ``'level'`` and
            ``'join'`` keys.
    """
    for index in range(1, len(items) - 1):
        prev_item, item, next_item = items[index - 1:index + 2]
        prev_level = prev_item['level']
        level = item['level']
        next_level = next_item['level']

        if prev_item['join']:
            prev_item['join'] = False
            fuse(prev_item, item, keep_first=prev_level < level)

        if item['join']:
            item['join'] = False
            if prev_level < level:
                fuse(prev_item, item, keep_first=True)
            else:
                fuse(item, next_item, keep_first=level < next_level)

        if next_item['join']:
            next_item['join'] = False
            fuse(item, next_item, keep_first=level < next_level)


def main():
    """Run the full pipeline: load, mark, merge, write, report."""
    # Third-party NLP/progress dependencies are imported here so that the
    # helpers above stay importable without hazm, nltk or tqdm installed.
    from hazm import Chunker, POSTagger, word_tokenize
    from nltk.chunk import tree2conlltags
    from tqdm import tqdm

    print('start')
    start_time = time.time()

    with open(INPUT_PATH, 'r', encoding='utf-8') as inputfile:
        data = json.load(inputfile)
    total = len(data)

    tagger = POSTagger(model='pos_tagger.model')
    chunker = Chunker(model='chunker.model')

    def contains_verb_phrase(content):
        """Tag and chunk *content*, then look for a VP chunk."""
        tagged = tagger.tag(word_tokenize(content))
        return has_verb_phrase(tree2conlltags(chunker.parse(tagged)))

    # Pass 1: flag items without a verb phrase.
    count = 0
    for position, items in enumerate(tqdm(data.values()), start=1):
        # Progress is based on the key position, not on the join counter.
        print('progress: ' + str((position / total) * 100))
        count += mark_joinable(items, contains_verb_phrase)

    # Pass 2: merge flagged items into neighbours.
    for position, items in enumerate(tqdm(data.values()), start=1):
        print('progress: ' + str((position / total) * 100))
        print('count: ' + str(position) + "/" + str(total))
        merge_joinable(items)

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as outputfile:
        json.dump(data, outputfile, ensure_ascii=False, indent=4)

    print(total)
    print(f'join count {count}')
    end_time = time.time()
    print(f"elapsed time: {end_time-start_time}")
    print("end")


if __name__ == '__main__':
    main()