data

2025-01-22 18:27:08 +03:30 · 2025-01-22 18:27:08 +03:30 · 797c6ba902
commit 797c6ba902
parent 0eb3adde1a
5 changed files with 2421710 additions and 11 deletions
--- a/relation/data/errors.json
+++ b/relation/data/errors.json
@ -0,0 +1,78 @@
+[
+    {
+        "section_id": "qs252042",
+        "parent_id": "qs252041"
+    },
+    {
+        "section_id": "qs252043",
+        "parent_id": "qs252041"
+    },
+    {
+        "section_id": "qs252044",
+        "parent_id": "qs252041"
+    },
+    {
+        "section_id": "qs252045",
+        "parent_id": "qs252041"
+    },
+    {
+        "section_id": "qs252046",
+        "parent_id": "qs252041"
+    },
+    {
+        "section_id": "qs252047",
+        "parent_id": "qs252041"
+    },
+    {
+        "section_id": "qs252048",
+        "parent_id": "qs252041"
+    },
+    {
+        "section_id": "qs252049",
+        "parent_id": "qs252041"
+    },
+    {
+        "section_id": "qs252051",
+        "parent_id": "qs252050"
+    },
+    {
+        "section_id": "qs252052",
+        "parent_id": "qs252050"
+    },
+    {
+        "section_id": "qs252053",
+        "parent_id": "qs252050"
+    },
+    {
+        "section_id": "qs252054",
+        "parent_id": "qs252050"
+    },
+    {
+        "section_id": "qs252055",
+        "parent_id": "qs252050"
+    },
+    {
+        "section_id": "qs252056",
+        "parent_id": "qs252050"
+    },
+    {
+        "section_id": "qs252057",
+        "parent_id": "qs252050"
+    },
+    {
+        "section_id": "qs252058",
+        "parent_id": "qs252050"
+    },
+    {
+        "section_id": "qs252059",
+        "parent_id": "qs252050"
+    },
+    {
+        "section_id": "qs252061",
+        "parent_id": "qs252060"
+    },
+    {
+        "section_id": "qs252062",
+        "parent_id": "qs252060"
+    }
+]
--- a/relation/data/joint_qanon_170k_newface.json
+++ b/relation/data/joint_qanon_170k_newface.json
--- a/relation/data/long_ids.txt
+++ b/relation/data/long_ids.txt
--- a/relation/data/main_qanon_170k_new.json
+++ b/relation/data/main_qanon_170k_new.json
--- a/relation/join_qanon_rag_2.py
+++ b/relation/join_qanon_rag_2.py
@ -27,8 +27,8 @@ all_prev_chunks_counter = 0
 errors = []
 temp_small_qanon_counter = 0
 for qid in tqdm(data):
-    if(qid == "qq114127"):
-        pass
+    # if(qid == "qq114127"):
+    #     pass
    print('progress: ' + str(((count + 1)/datalength)*100))
    q_sections = data[qid]
    new_qanon_chunks = []
@ -41,6 +41,11 @@ for qid in tqdm(data):
    for caption, item in df_sorted.iterrows():
        all_prev_chunks_counter += 1
        # If the parent id is not zero, this article has a parent, and the parent's text must be added to the child's text
+        section_id = item['section_id']
+        parent_id = item['parent_id']
+        child_order = item['child_order']
+        level = item['level']
+        number_text = item['number-text']
        if item['parent_id'] != '0':
            try:
                father_content = df_sorted[df_sorted['section_id'] == item['parent_id']]._values[0][1]
@ -53,27 +58,49 @@ for qid in tqdm(data):
                pass # Open question: if the parent's text is longer than 30, does it still need to be added to the child's text?
            new_chunk = father_content + " " + item['content']
            all_new_chunks_counter += 1
-            if len(new_chunk.split()) >512:
-                long_chunks_id.append(item['section_id'])
+            # 927 sections are long!!!
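+            if len(new_chunk.split()) >512:# Intended: if the merged section is longer than 512 tokens, keep only the child section's text and drop the parent. NOTE(review): this branch only records the section id; new_chunk is not trimmed here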
+                # Per Mr. Deldar's remarks, in such cases one can simply average the child and parent embeddings, so there is no need to store and send the full merged text of the two
+                
+                # For such cases one could consider only the embeddings of the named entities, the subject, and the title of the law that this section falls under
+                    long_chunks_id.append(item['section_id'])
        else:# In this case, the article has no parent
            new_chunk = item['content']
            all_new_chunks_counter += 1
-        
+            
+        section_dict = {
+                    'id': section_id,
+                    'child_order' : child_order,
+                    'content': new_chunk,
+                    'parent_id': parent_id,
+                    'level' : level,
+                    'number_text' : number_text
+                }
        if len(new_chunk.split()) < 30:
-            temp_small_sections.append(new_chunk)
+            
+            temp_small_sections.append(section_dict)
            all_new_chunks_counter -= 1
            continue

        
-        new_qanon_chunks.append(new_chunk)    
+        new_qanon_chunks.append(section_dict)    
    
    if len(new_qanon_chunks) == 0:
        if len(temp_small_sections) == 2:
            # Merge the section that is the law's title with the single other section that falls under that title
            # temp_small_sections.pop(0)
-            temp_section = temp_small_sections[0] + " " + temp_small_sections[1]
+            temp_section = temp_small_sections[0]
+            temp_section_content = temp_section['content'] + " " + temp_small_sections[1]['content']
+            section_dict = {
+                    'id': temp_section['id'],
+                    'child_order' : temp_section['child_order'],
+                    'content': temp_section_content,
+                    'parent_id': temp_section['parent_id'],
+                    'level' : temp_section['level'],
+                    'number_text' : temp_section['number_text']
+                }
            temp_small_sections = []
-            temp_small_sections.append(temp_section)
+            temp_small_sections.append(section_dict)
            
        new_qanon_chunks = temp_small_sections
        temp_small_qanon_counter += 1
@ -83,12 +110,11 @@ for qid in tqdm(data):
        "new_sections": new_qanon_chunks
    })  
        
-print("long_chunks: " + str(len(long_chunks_id)))
 print("all_prev_chunks_counter: " + str(all_prev_chunks_counter))
 print("all_new_chunks_counter: " + str(all_new_chunks_counter))
 print("temp_small_qanon_counter: " + str(temp_small_qanon_counter))
 print()
-outputfile = open('./data/joint_qanon_170k_new.json', "w", encoding='utf-8')
+outputfile = open('./data/joint_qanon_170k_newface.json', "w", encoding='utf-8')
 outputfile.write(json.dumps(new_qanon_sections, ensure_ascii=False, indent = 4))
 outputfile.close()

@ -99,5 +125,16 @@ outputfile.close()
 print(len(new_qanon_sections))
 print(f'join count {count}')
 end_time = time.time()
+print('====================================')
+print('====================================')
+print("long_chunks: " + str(len(long_chunks_id)))
+long_chunks_ids_text = ''
+for idd in long_chunks_id:
+    long_chunks_ids_text += idd + '\n'
+outputfile = open('./data/long_ids.txt', "w", encoding='utf-8')
+outputfile.write(long_chunks_ids_text)
+outputfile.close()
+print('====================================')
+print('====================================')
 print(f"elapsed time:   {end_time-start_time}")
 print("end")