# RD_relation/relation/join_qanon_rag_2.py

import json
from tqdm import tqdm
import pandas as pd
import time
from hazm import *
from nltk.chunk import tree2conlltags


print('start')
start_time = time.time()

# Load the whole corpus up front. The context manager guarantees the handle
# is closed even when the JSON fails to parse.
with open('./data/main_qanon_170k_new2.json', "r", encoding='utf-8') as inputfile:
    data = json.load(inputfile)

# The previous `dict = {}` assignment was dropped: it was never read and it
# shadowed the builtin `dict` for the rest of the module.
datalength = len(data)
# NOTE(review): tagger and chunker are not referenced anywhere else in the
# visible script; they are kept only because loading them is an existing
# side effect. Confirm they can be removed.
tagger = POSTagger(model='pos_tagger.model')
chunker = Chunker(model='chunker.model')
count = 0
new_qanon_sections = []       # one {"qanon_id", "new_sections"} entry per law
long_chunks_id = []           # ids of joined sections longer than 512 words
all_new_chunks_counter = 0    # joined sections that passed the minimum length
all_prev_chunks_counter = 0   # every input section visited
errors = []                   # sections whose parent_id matched no section
temp_small_qanon_counter = 0  # laws for which no section passed the minimum length
# Word-count thresholds (whitespace-split tokens) applied to joined sections.
MIN_CHUNK_WORDS = 30
MAX_CHUNK_WORDS = 512

for qid in tqdm(data):
    # Each law is processed on its own: its sections are ordered by
    # ``child_order`` and every child section gets its parent's text prepended.
    df_sorted = pd.DataFrame(data[qid]).sort_values("child_order", ascending=True)
    new_qanon_chunks = []
    temp_small_sections = []

    for _, item in df_sorted.iterrows():
        all_prev_chunks_counter += 1
        section_id = item['section_id']
        print(f'section_id: {section_id}')
        parent_id = item['parent_id']
        content = item['content']
        is_long = False
        # Default: the section stands alone (no parent, or parent not found).
        new_chunk = content

        # A parent id other than '0' means this section has a parent whose
        # text must be prepended to the child's text.
        if parent_id != '0':
            parent_rows = df_sorted[df_sorted['section_id'] == parent_id]
            if parent_rows.empty:
                # Orphan section: record it and keep only its own text.
                # The old code fell through here and joined with a stale or
                # undefined parent text, and counted the section twice.
                errors.append({"section_id": section_id, 'parent_id': parent_id})
            else:
                # NOTE(review): the parent text is read positionally from
                # column 1, exactly as before; presumably that column is
                # 'content' -- confirm and switch to label-based access.
                # Open question from the original author: should a parent
                # text longer than 30 still be prepended to the child?
                father_content = parent_rows.iloc[0, 1]
                new_chunk = father_content + " " + content
                # The original note proposed keeping only the child's text
                # when the joined section exceeds 512 tokens; the code only
                # flags the section and keeps the joined text. Other ideas
                # noted by the author: average the child and parent
                # embeddings instead of storing the merged text, or embed
                # only the named entities, the subject and the title of the
                # enclosing law.
                if len(new_chunk.split()) > MAX_CHUNK_WORDS:
                    long_chunks_id.append(section_id)
                    is_long = True

        section_dict = {
            'id': section_id,
            'child_order': item['child_order'],
            'parent_id': parent_id,
            'level': item['level'],
            'number_text': item['number-text'],
            'is-long': is_long,
            'content': new_chunk,
        }

        # Short sections are parked; they are only emitted when the law has
        # no section reaching the minimum length (see below).
        if len(new_chunk.split()) < MIN_CHUNK_WORDS:
            temp_small_sections.append(section_dict)
            continue

        # Only sections that pass the minimum length are counted as new chunks.
        all_new_chunks_counter += 1
        new_qanon_chunks.append(section_dict)

    if not new_qanon_chunks:
        if len(temp_small_sections) == 2:
            # Per the original author's note: merge the section that equals
            # the law title with the single other section under that title.
            # The merged entry keeps the first section's metadata.
            first_section, second_section = temp_small_sections
            merged_content = first_section['content'] + " " + second_section['content']
            temp_small_sections = [{**first_section, 'content': merged_content}]

        new_qanon_chunks = temp_small_sections
        temp_small_qanon_counter += 1

    new_qanon_sections.append({
        "qanon_id": qid,
        "new_sections": new_qanon_chunks
    })
        
# Summary counters for the run.
print("all_prev_chunks_counter: " + str(all_prev_chunks_counter))
print("all_new_chunks_counter: " + str(all_new_chunks_counter))
print("temp_small_qanon_counter: " + str(temp_small_qanon_counter))
print()

# Context managers close each output file even if serialisation or the write
# fails; json.dump streams to the file instead of building one huge string.
with open('./data/joint_qanon_170k_newface.json', "w", encoding='utf-8') as outputfile:
    json.dump(new_qanon_sections, outputfile, ensure_ascii=False, indent=4)

with open('./data/errors.json', "w", encoding='utf-8') as outputfile:
    json.dump(errors, outputfile, ensure_ascii=False, indent=4)

print(len(new_qanon_sections))
print(f'join count {count}')
# The elapsed time is measured here, before the long-ids file is written.
end_time = time.time()
print('====================================')
print('====================================')
print("long_chunks: " + str(len(long_chunks_id)))
# One id per line, each terminated by a newline.
long_chunks_ids_text = ''.join(idd + '\n' for idd in long_chunks_id)
with open('./data/long_ids.txt', "w", encoding='utf-8') as outputfile:
    outputfile.write(long_chunks_ids_text)
print('====================================')
print('====================================')
print(f"elapsed time:   {end_time-start_time}")
print("end")
add rag data creator 2025-01-19 16:42:36 +00:00			`import json`
			`from tqdm import tqdm`
			`import pandas as pd`
			`import time`
			`from hazm import *`
			`from nltk.chunk import tree2conlltags`


			`print('start')`
			`start_time = time.time()`

embeddings 2025-01-23 15:06:04 +00:00			`inputfile = open('./data/main_qanon_170k_new2.json', "r", encoding='utf-8')`
add rag data creator 2025-01-19 16:42:36 +00:00
			`data = json.load(inputfile)`

			`inputfile.close()`

			`dict = {}`
			`datalength = len(data)`
			`tagger = POSTagger(model='pos_tagger.model')`
			`chunker = Chunker(model='chunker.model')`
			`count = 0`
			`new_qanon_sections = []`
			`long_chunks_id = []`
			`all_new_chunks_counter = 0`
			`all_prev_chunks_counter = 0`
			`errors = []`
			`temp_small_qanon_counter = 0`
			`for qid in tqdm(data):`
data 2025-01-22 14:57:08 +00:00			`# if(qid == "qq114127"):`
			`# pass`
embeddings 2025-01-23 15:06:04 +00:00			`# print('progress: ' + str(((count + 1)/datalength)*100))`
add rag data creator 2025-01-19 16:42:36 +00:00			`q_sections = data[qid]`
			`new_qanon_chunks = []`

			`sections_df = pd.DataFrame(q_sections)`
			`df_sorted = sections_df.sort_values("child_order", ascending=True)`
			`index = 0`
			`new_q_sections = []`
			`temp_small_sections = []`
			`for caption, item in df_sorted.iterrows():`
			`all_prev_chunks_counter += 1`
			`# اگر شناسه والد برابر با صفر نبود به این معناست که این ماده، والدی دارد که باید متن والد به متن فرزند اضافه شود`
data 2025-01-22 14:57:08 +00:00			`section_id = item['section_id']`
embeddings 2025-01-23 15:06:04 +00:00			`print(f'section_id: {section_id}')`
data 2025-01-22 14:57:08 +00:00			`parent_id = item['parent_id']`
			`child_order = item['child_order']`
			`level = item['level']`
			`number_text = item['number-text']`
sections to qanon 2025-01-22 17:21:09 +00:00			`is_long = False`
add rag data creator 2025-01-19 16:42:36 +00:00			`if item['parent_id'] != '0':`
			`try:`
			`father_content = df_sorted[df_sorted['section_id'] == item['parent_id']]._values[0][1]`
			`except:`
			`errors.append({"section_id":item['section_id'], 'parent_id': item['parent_id']})`
			`new_chunk = item['content']`
			`all_new_chunks_counter += 1`

			`if len(father_content) > 30:`
			`pass # آیا متن پدر اگر بزرگتر از 30 باشد نیاز به اضافه شدن به متن فرزند دارد؟`
			`new_chunk = father_content + " " + item['content']`
			`all_new_chunks_counter += 1`
data 2025-01-22 14:57:08 +00:00			`# 927 sections are long!!!`
			`if len(new_chunk.split()) >512:# اگر طول سکشن جدید بزرگتر از 512 توکن است، فقط متن سکشن فرزند را نگهدارد و بی خیال والد شود`
			`# با توجه به صحبت آقای دلدار در کل می توان در چنین مواردی میانگین امبدینگ فرزند و والد را بدست آورد و نیازی به ذخیره و ارسال متن کامل تلفیق شده از این دو تا نیز نیست`

			`# می توان برای چنین مواردی فقط امبدینگ موجودیت های نامدار، موضوع و عنوان قانونی که این سکشن ذیل آن است را در نظر گرفت`
			`long_chunks_id.append(item['section_id'])`
sections to qanon 2025-01-22 17:21:09 +00:00			`is_long = True`
add rag data creator 2025-01-19 16:42:36 +00:00			`else:# در این حالت، ماده دارای والد نیست`
			`new_chunk = item['content']`
			`all_new_chunks_counter += 1`
data 2025-01-22 14:57:08 +00:00
			`section_dict = {`
			`'id': section_id,`
			`'child_order' : child_order,`
			`'parent_id': parent_id,`
			`'level' : level,`
sections to qanon 2025-01-22 17:21:09 +00:00			`'number_text' : number_text,`
			`'is-long' : is_long,`
			`'content': new_chunk,`
data 2025-01-22 14:57:08 +00:00			`}`
add rag data creator 2025-01-19 16:42:36 +00:00			`if len(new_chunk.split()) < 30:`
data 2025-01-22 14:57:08 +00:00
			`temp_small_sections.append(section_dict)`
add rag data creator 2025-01-19 16:42:36 +00:00			`all_new_chunks_counter -= 1`
			`continue`


data 2025-01-22 14:57:08 +00:00			`new_qanon_chunks.append(section_dict)`
add rag data creator 2025-01-19 16:42:36 +00:00
			`if len(new_qanon_chunks) == 0:`
			`if len(temp_small_sections) == 2:`
			`# ادغام سکشنی که برابر با عنوان قانون است با تک سکشن دیگر که ذیل آن عنوان است`
			`# temp_small_sections.pop(0)`
data 2025-01-22 14:57:08 +00:00			`temp_section = temp_small_sections[0]`
			`temp_section_content = temp_section['content'] + " " + temp_small_sections[1]['content']`
			`section_dict = {`
			`'id': temp_section['id'],`
			`'child_order' : temp_section['child_order'],`
			`'parent_id': temp_section['parent_id'],`
			`'level' : temp_section['level'],`
embeddings 2025-01-23 15:06:04 +00:00			`'number_text' : temp_section['number_text'],`
			`'is-long' : temp_section['is-long'],`
			`'content': temp_section_content,`
data 2025-01-22 14:57:08 +00:00			`}`
add rag data creator 2025-01-19 16:42:36 +00:00			`temp_small_sections = []`
data 2025-01-22 14:57:08 +00:00			`temp_small_sections.append(section_dict)`
add rag data creator 2025-01-19 16:42:36 +00:00
			`new_qanon_chunks = temp_small_sections`
			`temp_small_qanon_counter += 1`
			`temp_small_sections = []`
			`new_qanon_sections.append({`
			`"qanon_id": qid,`
			`"new_sections": new_qanon_chunks`
			`})`

			`print("all_prev_chunks_counter: " + str(all_prev_chunks_counter))`
			`print("all_new_chunks_counter: " + str(all_new_chunks_counter))`
			`print("temp_small_qanon_counter: " + str(temp_small_qanon_counter))`
			`print()`
data 2025-01-22 14:57:08 +00:00			`outputfile = open('./data/joint_qanon_170k_newface.json', "w", encoding='utf-8')`
add rag data creator 2025-01-19 16:42:36 +00:00			`outputfile.write(json.dumps(new_qanon_sections, ensure_ascii=False, indent = 4))`
			`outputfile.close()`

			`outputfile = open('./data/errors.json', "w", encoding='utf-8')`
			`outputfile.write(json.dumps(errors, ensure_ascii=False, indent = 4))`
			`outputfile.close()`

			`print(len(new_qanon_sections))`
			`print(f'join count {count}')`
			`end_time = time.time()`
data 2025-01-22 14:57:08 +00:00			`print('====================================')`
			`print('====================================')`
			`print("long_chunks: " + str(len(long_chunks_id)))`
			`long_chunks_ids_text = ''`
			`for idd in long_chunks_id:`
			`long_chunks_ids_text += idd + '\n'`
			`outputfile = open('./data/long_ids.txt', "w", encoding='utf-8')`
			`outputfile.write(long_chunks_ids_text)`
			`outputfile.close()`
			`print('====================================')`
			`print('====================================')`
add rag data creator 2025-01-19 16:42:36 +00:00			`print(f"elapsed time: {end_time-start_time}")`
			`print("end")`