add rag data creator
parent 406792d66d
commit 0eb3adde1a
BIN  relation/chunker.model  Normal file (binary file not shown)
@@ -19,8 +19,8 @@ inputfile.close()
 
 dict = {}
 datalength = len(data)
-tagger = POSTagger(model='./res/pos_tagger.model')
-chunker = Chunker(model='./res/chunker.model')
+tagger = POSTagger(model='pos_tagger.model')
+chunker = Chunker(model='chunker.model')
 count = 0
 for key in tqdm(data):
     print('progress: ' + str(((count + 1)/datalength)*100))
@@ -52,7 +52,12 @@ for key in tqdm(data):
         else:
             item['join'] = False
+s = 10
+ccount = 0
 for key in tqdm(data):
+    ccount += 1
+    print('progress: ' + str(((ccount + 1)/(len(data)))*100))
+    print('count: ' + str(ccount + 1)+"/"+str(len(data)) )
 
     items = data[key]
     index = 1
     while index < len(items)-1:
@@ -90,6 +95,7 @@ for key in tqdm(data):
             item2['content'] = item1['content'] + '. ' + item2['content']
+            item1['content'] = ''
 
         index += 1
 
 outputfile = open('./data/joint_qanon_170k_metadata.json', "w", encoding='utf-8')
 outputfile.write(json.dumps(data, ensure_ascii=False, indent = 4))
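Note: the hunks above only change where the hazm models are loaded from (./res/ → the script's working directory). For orientation, a minimal sketch of how a hazm tagger/chunker pipeline is typically driven — the sample sentence and the tree2conlltags call are illustrative, not part of this commit:

    from hazm import POSTagger, Chunker, word_tokenize
    from nltk.chunk import tree2conlltags

    tagger = POSTagger(model='pos_tagger.model')   # path now relative to the working directory
    chunker = Chunker(model='chunker.model')

    tokens = word_tokenize('این یک جمله آزمایشی است.')  # illustrative Persian sentence
    tagged = tagger.tag(tokens)        # [(token, POS tag), ...]
    tree = chunker.parse(tagged)       # nltk Tree of shallow chunks
    print(tree2conlltags(tree))        # [(token, POS, IOB chunk label), ...]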
86  relation/join_qanon_rag.py  Normal file
@@ -0,0 +1,86 @@
import json
from tqdm import tqdm
import numpy as np
import time
from hazm import *
from nltk.chunk import tree2conlltags


print('start')
start_time = time.time()

inputfile = open('./data/main_qanon_170k_metadata.json', "r", encoding='utf-8')
data = json.load(inputfile)
inputfile.close()

dict = {}
datalength = len(data)
tagger = POSTagger(model='pos_tagger.model')
chunker = Chunker(model='chunker.model')
count = 0
new_qanon_sections = []
long_chunks = 0
for qid in tqdm(data):
    if(qid == "qq114127"):
        pass  # no-op guard; likely kept as a debugging breakpoint anchor
    print('progress: ' + str(((count + 1)/datalength)*100))
    q_sections = data[qid]
    new_qanon_chunks = []
    pivot_index = 0
    for index, section in enumerate(q_sections):
        content = section['content']
        child_order = section['child_order']
        level = section['level']
        if index == 0:
            pivot_index = index
            pivot_level = level

        # get the level of the section that precedes the current one
        if (index == 0):
            prev_section_level = 0
        else:
            prev_section_level = q_sections[index-1]['level']

        # case: the current section is the last section of the current law
        if (index+1 == len(q_sections)):
            if(level <= prev_section_level):  # the last section was not joined in the previous step, so it is added to the law's chunks on its own
                new_chunk = content
            else:  # the section was already appended to its parent in the previous step
                continue
        else:
            next_section_level = q_sections[index+1]['level']
            if (int(pivot_level)+1) == (int(next_section_level)):  # the next section is a child of the current one, so join the two; this can duplicate text, but that is acceptable here
                new_chunk = content + " " + q_sections[index+1]['content']
                count += 1
                if len(new_chunk.split()) > 512:
                    print("long chunk !!!")
                    long_chunks += 1
            elif(int(pivot_level) + 1 > (int(next_section_level))):  # the next section sits at the same level as the current one (or above it)
                pivot_index = (int(index+1))  # the next section becomes the pivot
                pivot_level = (int(next_section_level))
                new_chunk = content
            else:  # (int(pivot_level) + 1 < int(next_section_level)), for example: 2 < 3
                pivot_index = (int(level))  # the current section becomes the pivot
                new_chunk = content
        new_qanon_chunks.append(new_chunk)

    new_qanon_sections.append({
        "qanon_id": qid,
        "new_sections": new_qanon_chunks
    })

print("long_chunks: " + str(long_chunks))
print()
outputfile = open('./data/new_joint_qanon_170k2.json', "w", encoding='utf-8')
outputfile.write(json.dumps(new_qanon_sections, ensure_ascii=False, indent = 4))
outputfile.close()

print(len(new_qanon_sections))
print(f'join count {count}')
end_time = time.time()
print(f"elapsed time: {end_time-start_time}")
print("end")
103  relation/join_qanon_rag_2.py  Normal file
@@ -0,0 +1,103 @@
import json
from tqdm import tqdm
import pandas as pd
import time
from hazm import *
from nltk.chunk import tree2conlltags


print('start')
start_time = time.time()

inputfile = open('./data/main_qanon_170k_new.json', "r", encoding='utf-8')
data = json.load(inputfile)
inputfile.close()

dict = {}
datalength = len(data)
tagger = POSTagger(model='pos_tagger.model')
chunker = Chunker(model='chunker.model')
count = 0
new_qanon_sections = []
long_chunks_id = []
all_new_chunks_counter = 0
all_prev_chunks_counter = 0
errors = []
temp_small_qanon_counter = 0
for qid in tqdm(data):
    if(qid == "qq114127"):
        pass  # no-op guard; likely kept as a debugging breakpoint anchor
    print('progress: ' + str(((count + 1)/datalength)*100))
    q_sections = data[qid]
    new_qanon_chunks = []

    sections_df = pd.DataFrame(q_sections)
    df_sorted = sections_df.sort_values("child_order", ascending=True)
    index = 0
    new_q_sections = []
    temp_small_sections = []
    for caption, item in df_sorted.iterrows():
        all_prev_chunks_counter += 1
        # a parent_id other than '0' means this section has a parent whose text must be prepended to the child's text
        if item['parent_id'] != '0':
            try:
                father_content = df_sorted[df_sorted['section_id'] == item['parent_id']]._values[0][1]
            except:
                errors.append({"section_id": item['section_id'], 'parent_id': item['parent_id']})
                new_chunk = item['content']
                all_new_chunks_counter += 1

            if len(father_content) > 30:
                pass  # open question: does the parent text still need to be prepended to the child when it is longer than 30?
            new_chunk = father_content + " " + item['content']
            all_new_chunks_counter += 1
            if len(new_chunk.split()) > 512:
                long_chunks_id.append(item['section_id'])
        else:  # the section has no parent
            new_chunk = item['content']
            all_new_chunks_counter += 1

        if len(new_chunk.split()) < 30:
            temp_small_sections.append(new_chunk)
            all_new_chunks_counter -= 1
            continue

        new_qanon_chunks.append(new_chunk)

    if len(new_qanon_chunks) == 0:
        if len(temp_small_sections) == 2:
            # merge the section that equals the law's title with the single other section under that title
            # temp_small_sections.pop(0)
            temp_section = temp_small_sections[0] + " " + temp_small_sections[1]
            temp_small_sections = []
            temp_small_sections.append(temp_section)

        new_qanon_chunks = temp_small_sections
        temp_small_qanon_counter += 1
        temp_small_sections = []
    new_qanon_sections.append({
        "qanon_id": qid,
        "new_sections": new_qanon_chunks
    })

print("long_chunks: " + str(len(long_chunks_id)))
print("all_prev_chunks_counter: " + str(all_prev_chunks_counter))
print("all_new_chunks_counter: " + str(all_new_chunks_counter))
print("temp_small_qanon_counter: " + str(temp_small_qanon_counter))
print()
outputfile = open('./data/joint_qanon_170k_new.json', "w", encoding='utf-8')
outputfile.write(json.dumps(new_qanon_sections, ensure_ascii=False, indent = 4))
outputfile.close()

outputfile = open('./data/errors.json', "w", encoding='utf-8')
outputfile.write(json.dumps(errors, ensure_ascii=False, indent = 4))
outputfile.close()

print(len(new_qanon_sections))
print(f'join count {count}')
end_time = time.time()
print(f"elapsed time: {end_time-start_time}")
print("end")
BIN  relation/pos_tagger.model  Normal file (binary file not shown)
@@ -1,12 +1,16 @@
 import json
 from tqdm import tqdm
 import numpy as np
+import pandas as pd
 import time
 
 print('start')
 start_time = time.time()
 
 
+all_data_file = open('./data/sections_all.json', "r", encoding='utf-8')
+all_data = json.load(all_data_file)
+all_data_file.close()
+all_data_df = pd.DataFrame(all_data)
 
 inputfile = open('./data/main_sections_170k_metadata.json', "r", encoding='utf-8')
@@ -16,18 +20,28 @@ inputfile.close()
 
 dict = {}
 
 count = 0
 for item in tqdm(data):
-    key = item['qanon_id']
-    child_order = item['child_order']
-    content = item['content']
-    level = item['other_info']['level']
+    section_id = item['id']
+    try:
+        section = all_data_df[all_data_df['id'] == section_id]
+    except:
+        print("missing section: {section_id}".format(section_id=section_id))
+        continue
+
+    key = list(section['q-id']).pop()
+    child_order = list(section['child-order']).pop()
+    content = list(section['content']).pop()
+    level = list(section['level']).pop()
+    parent_id = list(section['parent-id']).pop()
+    number_text = list(section['number-text']).pop()
     if not key in dict:
         dict[key] = []
-    dict[key].append({'content':content, 'child_order':child_order, 'level': level})
+    dict[key].append({'section_id':section_id, 'content':content, 'child_order':child_order, 'level': level, 'number-text': number_text, 'parent_id':parent_id})
+    print(f"Section ---> {count}")
     count += 1
 
 
-outputfile = open('./data/main_qanon_170k_metadata.json', "w", encoding='utf-8')
+outputfile = open('./data/main_qanon_170k_new.json', "w", encoding='utf-8')
 outputfile.write(json.dumps(dict, ensure_ascii=False, indent = 4))
 outputfile.close()
 end_time = time.time()
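The reshaped main_qanon_170k_new.json now carries one list of enriched section records per law. A minimal read-back sketch, assuming only the fields the loop above actually writes:

    import json

    with open('./data/main_qanon_170k_new.json', encoding='utf-8') as f:
        qanons = json.load(f)

    first_id = next(iter(qanons))          # any law id (q-id) key
    for sec in qanons[first_id][:3]:       # first few section records of that law
        print(sec['section_id'], sec['parent_id'], sec['level'], sec['number-text'])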