diff --git a/relation/chunker.model b/relation/chunker.model
new file mode 100644
index 0000000..6be5046
Binary files /dev/null and b/relation/chunker.model differ
diff --git a/relation/join_qanon.py b/relation/join_qanon.py
index 078846b..35d07eb 100644
--- a/relation/join_qanon.py
+++ b/relation/join_qanon.py
@@ -19,8 +19,8 @@ inputfile.close()
 
 dict = {}
 datalength = len(data)
-tagger = POSTagger(model='./res/pos_tagger.model')
-chunker = Chunker(model='./res/chunker.model')
+tagger = POSTagger(model='pos_tagger.model')
+chunker = Chunker(model='chunker.model')
 count = 0
 for key in tqdm(data):
     print('progress: ' + str(((count + 1)/datalength)*100))
@@ -52,7 +52,12 @@ for key in tqdm(data):
         else:
             item['join'] = False
 s = 10
+ccount = 0
 for key in tqdm(data):
+    ccount += 1
+    # ccount is already 1-based at this point, so it is reported as-is.
+    print('progress: ' + str((ccount/(len(data)))*100))
+    print('count: ' + str(ccount)+"/"+str(len(data)) )
     items = data[key]
     index = 1
     while index < len(items)-1:
@@ -90,6 +95,7 @@ for key in tqdm(data):
             item2['content'] = item1['content'] + '. ' + item2['content']
             item1['content'] = ''
 
+        index += 1
 
 outputfile = open('./data/joint_qanon_170k_metadata.json', "w", encoding='utf-8')
 outputfile.write(json.dumps(data, ensure_ascii=False, indent = 4))
diff --git a/relation/join_qanon_rag.py b/relation/join_qanon_rag.py
new file mode 100644
index 0000000..c375975
--- /dev/null
+++ b/relation/join_qanon_rag.py
@@ -0,0 +1,93 @@
+"""Build retrieval chunks by joining each law section with its child section.
+
+Reads ./data/main_qanon_170k_metadata.json (law id -> list of sections) and
+writes ./data/new_joint_qanon_170k2.json.
+"""
+import json
+from tqdm import tqdm
+import numpy as np
+import time
+from hazm import *
+from nltk.chunk import tree2conlltags
+
+
+print('start')
+start_time = time.time()
+
+with open('./data/main_qanon_170k_metadata.json', "r", encoding='utf-8') as inputfile:
+    data = json.load(inputfile)
+
+datalength = len(data)
+count = 0  # number of parent/child joins, reported at the end
+new_qanon_sections = []
+long_chunks = 0
+# Progress follows the loop position; `count` only tracks joins.
+for processed, qid in enumerate(tqdm(data), start=1):
+    print('progress: ' + str((processed/datalength)*100))
+    q_sections = data[qid]
+    new_qanon_chunks = []
+    pivot_level = 0
+    # True when the previous section's chunk already contains the current text.
+    joined_with_previous = False
+    for index, section in enumerate(q_sections):
+        content = section['content']
+        # Levels are compared numerically everywhere below.
+        level = int(section['level'])
+        if index == 0:
+            pivot_level = level
+
+        # Level of the section preceding the current one.
+        if index == 0:
+            prev_section_level = 0
+        else:
+            prev_section_level = int(q_sections[index-1]['level'])
+
+        # The current section is the last section of the current law.
+        if index + 1 == len(q_sections):
+            if level <= prev_section_level or not joined_with_previous:
+                # It was not merged into the previous chunk, so it must be
+                # added on its own (this also covers single-section laws).
+                new_chunk = content
+            else:
+                # It was already appended to its parent in the previous step.
+                continue
+        else:
+            next_section_level = int(q_sections[index+1]['level'])
+            if pivot_level + 1 == next_section_level:
+                # The next section is a child: glue the two together.
+                # This may duplicate text across chunks, which is accepted.
+                new_chunk = content + " " + q_sections[index+1]['content']
+                joined_with_previous = True
+                count += 1
+                if len(new_chunk.split()) > 512:
+                    print("long chunk !!!")
+                    long_chunks += 1
+            elif pivot_level + 1 > next_section_level:
+                # The next section is not deeper than the pivot, so the next
+                # section becomes the pivot.
+                pivot_level = next_section_level
+                joined_with_previous = False
+                new_chunk = content
+            else:
+                # The next section is more than one level below the pivot:
+                # the current section becomes the pivot.
+                pivot_level = level
+                joined_with_previous = False
+                new_chunk = content
+        new_qanon_chunks.append(new_chunk)
+
+    new_qanon_sections.append({
+        "qanon_id": qid,
+        "new_sections": new_qanon_chunks
+    })
+
+print("long_chunks: " + str(long_chunks))
+print()
+with open('./data/new_joint_qanon_170k2.json', "w", encoding='utf-8') as outputfile:
+    outputfile.write(json.dumps(new_qanon_sections, ensure_ascii=False, indent = 4))
+
+print(len(new_qanon_sections))
+print(f'join count {count}')
+end_time = time.time()
+print(f"elapsed time: {end_time-start_time}")
+print("end")
diff --git a/relation/join_qanon_rag_2.py b/relation/join_qanon_rag_2.py
new file mode 100644
index 0000000..811d44e
--- /dev/null
+++ b/relation/join_qanon_rag_2.py
@@ -0,0 +1,102 @@
+"""Build retrieval chunks by prefixing each law section with its parent's text.
+
+Reads ./data/main_qanon_170k_new.json (law id -> list of sections) and writes
+./data/joint_qanon_170k_new.json plus ./data/errors.json (sections whose
+parent could not be found).
+"""
+import json
+from tqdm import tqdm
+import pandas as pd
+import time
+from hazm import *
+from nltk.chunk import tree2conlltags
+
+
+print('start')
+start_time = time.time()
+
+with open('./data/main_qanon_170k_new.json', "r", encoding='utf-8') as inputfile:
+    data = json.load(inputfile)
+
+datalength = len(data)
+count = 0  # number of sections joined with their parent's text
+new_qanon_sections = []
+long_chunks_id = []
+all_new_chunks_counter = 0
+all_prev_chunks_counter = 0
+errors = []
+temp_small_qanon_counter = 0
+for processed, qid in enumerate(tqdm(data), start=1):
+    print('progress: ' + str((processed/datalength)*100))
+    q_sections = data[qid]
+    new_qanon_chunks = []
+
+    sections_df = pd.DataFrame(q_sections)
+    df_sorted = sections_df.sort_values("child_order", ascending=True)
+    # Section text by id, built once per law. setdefault keeps the first match
+    # in sorted order when an id is repeated.
+    content_by_id = {}
+    for known_id, known_content in zip(df_sorted['section_id'], df_sorted['content']):
+        content_by_id.setdefault(known_id, known_content)
+
+    temp_small_sections = []
+    for _, item in df_sorted.iterrows():
+        all_prev_chunks_counter += 1
+        # Default: the section stands alone (no parent, or parent not found).
+        new_chunk = item['content']
+        all_new_chunks_counter += 1
+        # A parent id other than zero means the parent's text has to be
+        # prepended to the child's text.
+        if str(item['parent_id']) != '0':
+            if item['parent_id'] in content_by_id:
+                # TODO: decide whether a parent text longer than 30 should
+                # still be prepended to the child.
+                father_content = content_by_id[item['parent_id']]
+                new_chunk = father_content + " " + item['content']
+                count += 1
+                if len(new_chunk.split()) > 512:
+                    long_chunks_id.append(item['section_id'])
+            else:
+                # Record the dangling reference instead of reusing text from
+                # an unrelated earlier section.
+                errors.append({"section_id":item['section_id'], 'parent_id': item['parent_id']})
+
+        # Short chunks are parked; they are only used when the law produced
+        # no regular chunk at all.
+        if len(new_chunk.split()) < 30:
+            temp_small_sections.append(new_chunk)
+            all_new_chunks_counter -= 1
+            continue
+
+        new_qanon_chunks.append(new_chunk)
+
+    if len(new_qanon_chunks) == 0:
+        if len(temp_small_sections) == 2:
+            # Merge the section that is the law's title with the single other
+            # section listed under that title.
+            temp_section = temp_small_sections[0] + " " + temp_small_sections[1]
+            temp_small_sections = [temp_section]
+
+        new_qanon_chunks = temp_small_sections
+        temp_small_qanon_counter += 1
+    new_qanon_sections.append({
+        "qanon_id": qid,
+        "new_sections": new_qanon_chunks
+    })
+
+print("long_chunks: " + str(len(long_chunks_id)))
+print("all_prev_chunks_counter: " + str(all_prev_chunks_counter))
+print("all_new_chunks_counter: " + str(all_new_chunks_counter))
+print("temp_small_qanon_counter: " + str(temp_small_qanon_counter))
+print()
+with open('./data/joint_qanon_170k_new.json', "w", encoding='utf-8') as outputfile:
+    outputfile.write(json.dumps(new_qanon_sections, ensure_ascii=False, indent = 4))
+
+with open('./data/errors.json', "w", encoding='utf-8') as outputfile:
+    outputfile.write(json.dumps(errors, ensure_ascii=False, indent = 4))
+
+print(len(new_qanon_sections))
+print(f'join count {count}')
+end_time = time.time()
+print(f"elapsed time: {end_time-start_time}")
+print("end")
diff --git a/relation/pos_tagger.model b/relation/pos_tagger.model
new file mode 100644
index 0000000..f4d0461
Binary files /dev/null and b/relation/pos_tagger.model differ
diff --git a/relation/sections2qanon.py b/relation/sections2qanon.py
index f212620..b483207 100644
--- a/relation/sections2qanon.py
+++ b/relation/sections2qanon.py
@@ -1,12 +1,23 @@
 import json
 from tqdm import tqdm
-import numpy as np
+import pandas as pd
 import time
 
 
 print('start')
 start_time = time.time()
 
+# Load the complete section table once; every lookup below reuses it.
+with open('./data/sections_all.json', "r", encoding='utf-8') as all_data_file:
+    all_data_df = pd.DataFrame(json.load(all_data_file))
+
+# Keep each needed column as a list of plain Python scalars so the values
+# written to the output stay JSON-serializable.
+SECTION_FIELDS = ('q-id', 'child-order', 'content', 'level', 'parent-id', 'number-text')
+section_columns = {name: all_data_df[name].tolist() for name in SECTION_FIELDS}
+# Row position per section id; a later duplicate overwrites an earlier one,
+# so the last matching row is the one used.
+row_by_section_id = {sid: row for row, sid in enumerate(all_data_df['id'].tolist())}
 
 inputfile = open('./data/main_sections_170k_metadata.json', "r", encoding='utf-8')
 
@@ -16,18 +27,29 @@ inputfile.close()
 
 
 dict = {}
+count = 0
 for item in tqdm(data):
-    key = item['qanon_id']
-    child_order = item['child_order']
-    content = item['content']
-    level =item['other_info']['level']
+    section_id = item['id']
+    row = row_by_section_id.get(section_id)
+    if row is None:
+        # An unknown id yields no row at all, so it is detected explicitly.
+        print(f"missing section: {section_id}")
+        continue
+
+    key = section_columns['q-id'][row]
+    child_order = section_columns['child-order'][row]
+    content = section_columns['content'][row]
+    level = section_columns['level'][row]
+    parent_id = section_columns['parent-id'][row]
+    number_text = section_columns['number-text'][row]
 
     if not key in dict:
         dict[key] = []
-    dict[key].append({'content':content, 'child_order':child_order, 'level': level})
-
+    dict[key].append({'section_id':section_id,'content':content, 'child_order':child_order, 'level': level, 'number-text': number_text, 'parent_id':parent_id})
+    print(f"Section ---> {count}")
+    count += 1
 
-outputfile = open('./data/main_qanon_170k_metadata.json', "w", encoding='utf-8')
+outputfile = open('./data/main_qanon_170k_new.json', "w", encoding='utf-8')
 outputfile.write(json.dumps(dict, ensure_ascii=False, indent = 4))
 outputfile.close()
 end_time = time.time()