add rag data creator

This commit is contained in:
ajokar 2025-01-19 20:12:36 +03:30
parent 406792d66d
commit 0eb3adde1a
6 changed files with 219 additions and 10 deletions

BIN
relation/chunker.model Normal file

Binary file not shown.

View File

@ -19,8 +19,8 @@ inputfile.close()
dict = {} dict = {}
datalength = len(data) datalength = len(data)
tagger = POSTagger(model='./res/pos_tagger.model') tagger = POSTagger(model='pos_tagger.model')
chunker = Chunker(model='./res/chunker.model') chunker = Chunker(model='chunker.model')
count = 0 count = 0
for key in tqdm(data): for key in tqdm(data):
print('progress: ' + str(((count + 1)/datalength)*100)) print('progress: ' + str(((count + 1)/datalength)*100))
@ -52,7 +52,12 @@ for key in tqdm(data):
else: else:
item['join'] = False item['join'] = False
s = 10 s = 10
ccount = 0
for key in tqdm(data): for key in tqdm(data):
ccount += 1
print('progress: ' + str(((ccount + 1)/(len(data)))*100))
print('count: ' + str(ccount + 1)+"/"+str(len(data)) )
items = data[key] items = data[key]
index = 1 index = 1
while index < len(items)-1: while index < len(items)-1:
@ -90,6 +95,7 @@ for key in tqdm(data):
item2['content'] = item1['content'] + '. ' + item2['content'] item2['content'] = item1['content'] + '. ' + item2['content']
item1['content'] = '' item1['content'] = ''
index += 1
outputfile = open('./data/joint_qanon_170k_metadata.json', "w", encoding='utf-8') outputfile = open('./data/joint_qanon_170k_metadata.json', "w", encoding='utf-8')
outputfile.write(json.dumps(data, ensure_ascii=False, indent = 4)) outputfile.write(json.dumps(data, ensure_ascii=False, indent = 4))

View File

@ -0,0 +1,86 @@
"""Build RAG chunks from qanon (law) sections by joining parents to children.

Reads ./data/main_qanon_170k_metadata.json (mapping: law id -> ordered list of
section dicts with 'content', 'child_order', 'level'), glues each section to
its immediate child based on hierarchy level, and writes the resulting chunk
lists to ./data/new_joint_qanon_170k2.json.

NOTE(review): comments translated from Persian; indentation reconstructed from
a diff view that stripped it — confirm structure against the repository copy.
"""
import json
from tqdm import tqdm
import numpy as np
import time
from hazm import *
from nltk.chunk import tree2conlltags
print('start')
start_time = time.time()
# Input: mapping of qanon (law) id -> ordered list of section dicts.
inputfile = open('./data/main_qanon_170k_metadata.json', "r", encoding='utf-8')
data = json.load(inputfile)
inputfile.close()
dict = {}  # NOTE(review): unused, and shadows the builtin `dict`
datalength = len(data)
# NOTE(review): tagger/chunker are loaded but never used in this script;
# loading fails if the model files are absent — confirm they are needed here.
tagger = POSTagger(model='pos_tagger.model')
chunker = Chunker(model='chunker.model')
count = 0                # number of parent+child join operations performed
new_qanon_sections = []  # output: one {"qanon_id", "new_sections"} per law
long_chunks = 0          # chunks exceeding 512 whitespace-separated tokens
for qid in tqdm(data):
    if(qid == "qq114127"):
        pass  # no-op; presumably a leftover debugging anchor for this law id
    # NOTE(review): `count` tallies joins, not processed laws, so this
    # percentage is not a true progress figure — confirm intent.
    print('progress: ' + str(((count + 1)/datalength)*100))
    q_sections = data[qid]
    new_qanon_chunks = []  # chunks produced for the current law
    pivot_index = 0
    for index, section in enumerate(q_sections):
        content = section['content']
        child_order = section['child_order']  # NOTE(review): read but unused
        level =section['level']
        # The first section of a law starts out as the pivot.
        if index == 0:
            pivot_index = index
            pivot_level = level
        # Level of the section immediately before the current one.
        if (index == 0):
            prev_section_level = 0
        else:
            prev_section_level = q_sections[index-1]['level']
        # Case: the current section is the last section of the current law.
        if (index+1 == len(q_sections)):
            if(level <= prev_section_level):
                # Last section of the law, and it was not merged into a parent
                # in the previous step, so it must be added on its own to the
                # current law's chunk set.
                new_chunk = content
            else:
                # The section was already appended to its parent section in
                # the previous iteration, so skip it.
                continue
        else:
            next_section_level = q_sections[index+1]['level']
            if (int(pivot_level)+1) == (int(next_section_level)):
                # The next section is a direct child of the pivot: glue the
                # two sections together. This approach can duplicate text,
                # which is acceptable here.
                new_chunk = content + " " + q_sections[index+1]['content']
                count += 1
                if len(new_chunk.split()) > 512:
                    print("long chunk !!!")
                    long_chunks += 1
            elif(int(pivot_level) +1 > (int(next_section_level))):
                # The next section sits at (or above) the pivot's level in the
                # hierarchy: the next section becomes the new pivot.
                pivot_index = (int(index+1))
                pivot_level = (int(next_section_level))
                new_chunk = content
            else: #(int(pivot_index) +1 < (int(next_section_level))) for example: 2<3
                # The current section becomes the pivot.
                # NOTE(review): this stores `level` into pivot_index (an index
                # variable) and never updates pivot_level — looks like a bug,
                # though pivot_index is never read afterwards. TODO confirm.
                pivot_index = (int(level))
                new_chunk = content
        new_qanon_chunks.append(new_chunk)
    new_qanon_sections.append({
        "qanon_id": qid,
        "new_sections": new_qanon_chunks
    })
print("long_chunks: " + str(long_chunks))
print()
outputfile = open('./data/new_joint_qanon_170k2.json', "w", encoding='utf-8')
outputfile.write(json.dumps(new_qanon_sections, ensure_ascii=False, indent = 4))
outputfile.close()
print(len(new_qanon_sections))
print(f'join count {count}')
end_time = time.time()
print(f"elapsed time: {end_time-start_time}")
print("end")

View File

@ -0,0 +1,103 @@
"""Build RAG chunks by prepending each section's parent text to the section.

Reads ./data/main_qanon_170k_new.json (mapping: law id -> list of section
dicts with 'section_id', 'parent_id', 'content', 'child_order', ...), sorts
each law's sections by child_order, concatenates parent content onto children,
holds back very short chunks, and writes results to
./data/joint_qanon_170k_new.json plus failed parent lookups to
./data/errors.json.

NOTE(review): comments translated from Persian; indentation reconstructed from
a diff view that stripped it — confirm structure against the repository copy.
"""
import json
from tqdm import tqdm
import pandas as pd
import time
from hazm import *
from nltk.chunk import tree2conlltags
print('start')
start_time = time.time()
inputfile = open('./data/main_qanon_170k_new.json', "r", encoding='utf-8')
data = json.load(inputfile)
inputfile.close()
dict = {}  # NOTE(review): unused, and shadows the builtin `dict`
datalength = len(data)
# NOTE(review): tagger/chunker are loaded but never used in this script.
tagger = POSTagger(model='pos_tagger.model')
chunker = Chunker(model='chunker.model')
count = 0  # NOTE(review): never incremented here, so 'join count' prints 0
new_qanon_sections = []       # output: one {"qanon_id", "new_sections"} per law
long_chunks_id = []           # section ids whose joined chunk exceeds 512 tokens
all_new_chunks_counter = 0    # chunks produced (short ones subtracted back out)
all_prev_chunks_counter = 0   # input sections seen
errors = []                   # parent lookups that failed
temp_small_qanon_counter = 0  # laws whose chunks all fell under the size floor
for qid in tqdm(data):
    if(qid == "qq114127"):
        pass  # no-op; presumably a leftover debugging anchor for this law id
    # NOTE(review): `count` stays 0, so this is not a real progress figure.
    print('progress: ' + str(((count + 1)/datalength)*100))
    q_sections = data[qid]
    new_qanon_chunks = []
    sections_df = pd.DataFrame(q_sections)
    df_sorted = sections_df.sort_values("child_order", ascending=True)
    index = 0                 # NOTE(review): unused
    new_q_sections = []       # NOTE(review): unused
    temp_small_sections = []  # chunks shorter than 30 tokens, held aside
    for caption, item in df_sorted.iterrows():
        all_prev_chunks_counter += 1
        # A parent_id other than '0' means this section has a parent whose
        # text must be prepended to the child's text.
        if item['parent_id'] != '0':
            try:
                # NOTE(review): _values[0][1] assumes 'content' is the second
                # column of the sorted frame — confirm column order.
                father_content = df_sorted[df_sorted['section_id'] == item['parent_id']]._values[0][1]
            except:
                # NOTE(review): bare except; after recording the error,
                # execution falls through below and reuses a stale (or, on the
                # first failure, undefined) father_content — looks like a bug,
                # TODO confirm intended control flow.
                errors.append({"section_id":item['section_id'], 'parent_id': item['parent_id']})
                new_chunk = item['content']
                all_new_chunks_counter += 1
            if len(father_content) > 30:
                # Open question kept from the original: should the parent text
                # still be added to the child when it is longer than 30?
                pass
            new_chunk = father_content + " " + item['content']
            all_new_chunks_counter += 1
            if len(new_chunk.split()) >512:
                long_chunks_id.append(item['section_id'])
        else:
            # The section has no parent; keep its own content as the chunk.
            new_chunk = item['content']
            all_new_chunks_counter += 1
        if len(new_chunk.split()) < 30:
            # Too short to stand alone: hold it aside for possible merging.
            temp_small_sections.append(new_chunk)
            all_new_chunks_counter -= 1
            continue
        new_qanon_chunks.append(new_chunk)
    if len(new_qanon_chunks) == 0:
        if len(temp_small_sections) == 2:
            # Merge the section equal to the law's title with the single
            # other section that sits beneath that title.
            # temp_small_sections.pop(0)
            temp_section = temp_small_sections[0] + " " + temp_small_sections[1]
            temp_small_sections = []
            temp_small_sections.append(temp_section)
        new_qanon_chunks = temp_small_sections
        temp_small_qanon_counter += 1
    temp_small_sections = []
    new_qanon_sections.append({
        "qanon_id": qid,
        "new_sections": new_qanon_chunks
    })
print("long_chunks: " + str(len(long_chunks_id)))
print("all_prev_chunks_counter: " + str(all_prev_chunks_counter))
print("all_new_chunks_counter: " + str(all_new_chunks_counter))
print("temp_small_qanon_counter: " + str(temp_small_qanon_counter))
print()
outputfile = open('./data/joint_qanon_170k_new.json', "w", encoding='utf-8')
outputfile.write(json.dumps(new_qanon_sections, ensure_ascii=False, indent = 4))
outputfile.close()
outputfile = open('./data/errors.json', "w", encoding='utf-8')
outputfile.write(json.dumps(errors, ensure_ascii=False, indent = 4))
outputfile.close()
print(len(new_qanon_sections))
print(f'join count {count}')
end_time = time.time()
print(f"elapsed time: {end_time-start_time}")
print("end")

BIN
relation/pos_tagger.model Normal file

Binary file not shown.

View File

@ -1,12 +1,16 @@
import json import json
from tqdm import tqdm from tqdm import tqdm
import numpy as np import pandas as pd
import time import time
print('start') print('start')
start_time = time.time() start_time = time.time()
all_data_file = open('./data/sections_all.json', "r", encoding='utf-8')
all_data = json.load(all_data_file)
all_data_file.close()
all_data_df = pd.DataFrame(all_data)
inputfile = open('./data/main_sections_170k_metadata.json', "r", encoding='utf-8') inputfile = open('./data/main_sections_170k_metadata.json', "r", encoding='utf-8')
@ -16,18 +20,28 @@ inputfile.close()
dict = {} dict = {}
count = 0
for item in tqdm(data): for item in tqdm(data):
key = item['qanon_id'] section_id = item['id']
child_order = item['child_order'] try: section = all_data_df[all_data_df['id'] == section_id]
content = item['content'] except:
level =item['other_info']['level'] print("missing section: {section_id}".format(section_id))
continue
key = list(section['q-id']).pop()
child_order = list(section['child-order']).pop()
content = list(section['content']).pop()
level = list(section['level']).pop()
parent_id = list(section['parent-id']).pop()
number_text = list(section['number-text']).pop()
if not key in dict: if not key in dict:
dict[key] = [] dict[key] = []
dict[key].append({'content':content, 'child_order':child_order, 'level': level}) dict[key].append({'section_id':section_id,'content':content, 'child_order':child_order, 'level': level, 'number-text': number_text, 'parent_id':parent_id})
print(f"Section ---> {count}")
count += 1
outputfile = open('./data/main_qanon_170k_metadata.json', "w", encoding='utf-8') outputfile = open('./data/main_qanon_170k_new.json', "w", encoding='utf-8')
outputfile.write(json.dumps(dict, ensure_ascii=False, indent = 4)) outputfile.write(json.dumps(dict, ensure_ascii=False, indent = 4))
outputfile.close() outputfile.close()
end_time = time.time() end_time = time.time()