"""Join qanon parent/child section texts into model-ready chunks.

Reads ./data/main_qanon_170k_new.json, prepends each parent section's text
to its children, filters/merges very short chunks, and writes the joined
sections plus any parent-lookup errors back out as JSON.
"""
import json
import json
import time

import pandas as pd
from tqdm import tqdm
from hazm import *
from nltk.chunk import tree2conlltags
def join_sections(data, progress=None):
    """Join each qanon's sections into parent+child text chunks.

    Sections are processed in ``child_order``. When a section's
    ``parent_id`` is not ``'0'`` the parent section's ``content`` is
    prepended to the child's ``content``; joined chunks longer than 512
    words are flagged as too long for a 512-token model input. Chunks
    shorter than 30 words are held back and only survive (merged) when the
    whole qanon would otherwise produce no chunks at all.

    Args:
        data: mapping of qanon id -> list of section dicts with keys
            ``section_id``, ``parent_id``, ``child_order`` and ``content``.
        progress: optional iterable wrapper (e.g. ``tqdm``) applied to the
            qanon ids for progress reporting; ``None`` iterates plainly.

    Returns:
        Tuple ``(new_qanon_sections, errors, long_chunks_id, counters)``:
        the joined sections (one dict per qanon), the failed parent
        lookups, the section ids whose joined chunk exceeds 512 words, and
        a counter dict with keys ``prev_chunks``, ``new_chunks`` and
        ``small_qanons``.
    """
    new_qanon_sections = []
    long_chunks_id = []
    errors = []
    counters = {'prev_chunks': 0, 'new_chunks': 0, 'small_qanons': 0}

    datalength = len(data)
    iterable = progress(data) if progress is not None else data
    # BUG FIX: the original never incremented its `count` variable, so the
    # printed progress percentage was constant; enumerate() restores it.
    for count, qid in enumerate(iterable):
        print('progress: ' + str(((count + 1) / datalength) * 100))

        df_sorted = pd.DataFrame(data[qid]).sort_values(
            "child_order", ascending=True)

        new_qanon_chunks = []
        temp_small_sections = []  # chunks shorter than 30 words, held back
        for _, item in df_sorted.iterrows():
            counters['prev_chunks'] += 1

            # A non-'0' parent_id means this section has a parent whose
            # text must be prepended to the child's text.
            if item['parent_id'] != '0':
                # Look the parent up by column name rather than the
                # original positional `_values[0][1]`, which silently
                # depended on the column order of the input JSON.
                parent_rows = df_sorted.loc[
                    df_sorted['section_id'] == item['parent_id'], 'content']
                if parent_rows.empty:
                    # BUG FIX: the original bare `except:` recorded the
                    # error but then still read `father_content`, raising a
                    # NameError (or reusing a stale parent from a previous
                    # iteration) and double-counting the chunk. Fall back
                    # to the child's own text, as the except branch
                    # intended.
                    errors.append({'section_id': item['section_id'],
                                   'parent_id': item['parent_id']})
                    new_chunk = item['content']
                else:
                    # NOTE(review): the original had a dead check asking
                    # (in a comment) whether a parent text longer than 30
                    # chars should still be prepended; behavior was and
                    # remains: always prepend.
                    new_chunk = parent_rows.iloc[0] + " " + item['content']
                counters['new_chunks'] += 1
                if len(new_chunk.split()) > 512:
                    # Too long for a 512-token model input; remember it.
                    long_chunks_id.append(item['section_id'])
            else:
                # No parent: the section text is used as-is.
                new_chunk = item['content']
                counters['new_chunks'] += 1

            if len(new_chunk.split()) < 30:
                temp_small_sections.append(new_chunk)
                counters['new_chunks'] -= 1
                continue

            new_qanon_chunks.append(new_chunk)

        if len(new_qanon_chunks) == 0:
            if len(temp_small_sections) == 2:
                # Merge the section equal to the law's title with the
                # single other section under that title.
                temp_small_sections = [
                    temp_small_sections[0] + " " + temp_small_sections[1]]
            new_qanon_chunks = temp_small_sections
            counters['small_qanons'] += 1

        new_qanon_sections.append({
            "qanon_id": qid,
            "new_sections": new_qanon_chunks,
        })

    return new_qanon_sections, errors, long_chunks_id, counters


def _write_json(path, obj):
    """Dump `obj` to `path` as pretty-printed UTF-8 JSON."""
    with open(path, "w", encoding='utf-8') as outputfile:
        outputfile.write(json.dumps(obj, ensure_ascii=False, indent=4))


def main():
    """Load the qanon JSON, join its sections, and write results/errors."""
    # NOTE(review): the original also instantiated hazm's
    # POSTagger('pos_tagger.model') and Chunker('chunker.model') here and
    # kept an unused `dict = {}` (shadowing the builtin); none were ever
    # used, so the slow model loads and the shadowing name were dropped.
    with open('./data/main_qanon_170k_new.json', "r",
              encoding='utf-8') as inputfile:
        data = json.load(inputfile)

    new_qanon_sections, errors, long_chunks_id, counters = join_sections(
        data, progress=tqdm)

    print("long_chunks: " + str(len(long_chunks_id)))
    print("all_prev_chunks_counter: " + str(counters['prev_chunks']))
    print("all_new_chunks_counter: " + str(counters['new_chunks']))
    print("temp_small_qanon_counter: " + str(counters['small_qanons']))
    print()

    _write_json('./data/joint_qanon_170k_new.json', new_qanon_sections)
    _write_json('./data/errors.json', errors)

    print(len(new_qanon_sections))
    # BUG FIX: the original printed `join count {count}` with a counter
    # that was never incremented (always 0); report the real join count.
    print(f'join count {len(new_qanon_sections)}')


if __name__ == "__main__":
    print('start')
    start_time = time.time()
    main()
    end_time = time.time()
    print(f"elapsed time: {end_time-start_time}")
    print("end")