"""Join qanon child sections with their parent sections' text.

Reads ./data/main_qanon_170k_new.json, prepends each section's parent
text to the section's own text, applies a fallback merge for qanons
that consist only of a title plus one small section, and writes the
joined chunks to ./data/joint_qanon_170k_new.json.  Parent-lookup
failures are written to ./data/errors.json.
"""
import json
import time

import pandas as pd

# Chunks with fewer words than this are set aside as "small" sections.
MIN_CHUNK_WORDS = 30
# Joined chunks with more words than this are flagged as overly long.
MAX_CHUNK_WORDS = 512


def join_qanon_sections(q_sections, errors=None, long_section_ids=None):
    """Join each child section with its parent section's text for one qanon.

    Parameters
    ----------
    q_sections : list[dict]
        Section records with at least ``section_id``, ``content``,
        ``parent_id`` and ``child_order`` keys.  ``parent_id == '0'``
        marks a root section with no parent.
    errors : list, optional
        Accumulator for parent-lookup failures; one dict with
        ``section_id``/``parent_id`` is appended per missing parent.
    long_section_ids : list, optional
        Accumulator for section ids whose joined text exceeds
        ``MAX_CHUNK_WORDS`` words.

    Returns
    -------
    tuple[list[str], bool]
        The joined chunk texts (in ``child_order``), and whether the
        two-small-sections fallback merge (law title + its single body
        section) was applied.
    """
    if errors is None:
        errors = []
    if long_section_ids is None:
        long_section_ids = []

    df = pd.DataFrame(q_sections).sort_values("child_order", ascending=True)
    chunks = []
    small_sections = []

    for _, row in df.iterrows():
        if row['parent_id'] != '0':
            # The section has a parent, so the parent's text must be
            # prepended to the child's text.  Look the parent up by the
            # 'content' column name — the original used the positional
            # `_values[0][1]`, which silently depended on column order.
            parent = df.loc[df['section_id'] == row['parent_id'], 'content']
            if parent.empty:
                # Parent missing from this qanon: record the error and fall
                # back to the child's own text.  (The original fell through
                # to the join with an undefined/stale `father_content`,
                # crashing or joining the wrong parent, and double-counted
                # the chunk.)
                errors.append({"section_id": row['section_id'],
                               'parent_id': row['parent_id']})
                chunk = row['content']
            else:
                chunk = parent.iloc[0] + " " + row['content']
                if len(chunk.split()) > MAX_CHUNK_WORDS:
                    long_section_ids.append(row['section_id'])
        else:
            # Root section: use its text as-is.
            chunk = row['content']

        if len(chunk.split()) < MIN_CHUNK_WORDS:
            # Too small to stand alone; set aside for the fallback merge.
            small_sections.append(chunk)
            continue
        chunks.append(chunk)

    merged = False
    if not chunks and len(small_sections) == 2:
        # Merge the section equal to the law's title with the single
        # section underneath that title.
        chunks = [small_sections[0] + " " + small_sections[1]]
        merged = True
    return chunks, merged


def main():
    """Run the end-to-end join over the qanon JSON file and write results."""
    # Heavy/optional dependencies are imported lazily so the pure helper
    # above stays importable without hazm or tqdm installed.  The original
    # also imported nltk.chunk.tree2conlltags, which was never used and has
    # been dropped.
    from tqdm import tqdm
    from hazm import POSTagger, Chunker

    print('start')
    start_time = time.time()

    with open('./data/main_qanon_170k_new.json', "r", encoding='utf-8') as infile:
        data = json.load(infile)

    # NOTE(review): these models are loaded but never used below — confirm
    # whether a later processing step depends on them before removing.
    tagger = POSTagger(model='pos_tagger.model')
    chunker = Chunker(model='chunker.model')

    new_qanon_sections = []
    long_chunks_id = []
    errors = []
    all_prev_chunks_counter = 0   # total input sections seen
    all_new_chunks_counter = 0    # total chunks emitted
    temp_small_qanon_counter = 0  # qanons rescued by the fallback merge
    count = 0  # qanons processed (the original never incremented this)

    for qid in tqdm(data):
        q_sections = data[qid]
        chunks, merged = join_qanon_sections(q_sections, errors, long_chunks_id)
        all_prev_chunks_counter += len(q_sections)
        all_new_chunks_counter += len(chunks)
        temp_small_qanon_counter += int(merged)
        new_qanon_sections.append({
            "qanon_id": qid,
            "new_sections": chunks,
        })
        count += 1

    print("long_chunks: " + str(len(long_chunks_id)))
    print("all_prev_chunks_counter: " + str(all_prev_chunks_counter))
    print("all_new_chunks_counter: " + str(all_new_chunks_counter))
    print("temp_small_qanon_counter: " + str(temp_small_qanon_counter))
    print()

    with open('./data/joint_qanon_170k_new.json', "w", encoding='utf-8') as outputfile:
        outputfile.write(json.dumps(new_qanon_sections, ensure_ascii=False, indent=4))
    with open('./data/errors.json', "w", encoding='utf-8') as outputfile:
        outputfile.write(json.dumps(errors, ensure_ascii=False, indent=4))

    print(len(new_qanon_sections))
    print(f'join count {count}')
    end_time = time.time()
    print(f"elapsed time: {end_time-start_time}")
    print("end")


if __name__ == "__main__":
    main()