data
This commit is contained in:
parent
0eb3adde1a
commit
797c6ba902
78
relation/data/errors.json
Normal file
78
relation/data/errors.json
Normal file
|
@ -0,0 +1,78 @@
|
|||
[
|
||||
{
|
||||
"section_id": "qs252042",
|
||||
"parent_id": "qs252041"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252043",
|
||||
"parent_id": "qs252041"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252044",
|
||||
"parent_id": "qs252041"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252045",
|
||||
"parent_id": "qs252041"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252046",
|
||||
"parent_id": "qs252041"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252047",
|
||||
"parent_id": "qs252041"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252048",
|
||||
"parent_id": "qs252041"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252049",
|
||||
"parent_id": "qs252041"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252051",
|
||||
"parent_id": "qs252050"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252052",
|
||||
"parent_id": "qs252050"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252053",
|
||||
"parent_id": "qs252050"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252054",
|
||||
"parent_id": "qs252050"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252055",
|
||||
"parent_id": "qs252050"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252056",
|
||||
"parent_id": "qs252050"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252057",
|
||||
"parent_id": "qs252050"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252058",
|
||||
"parent_id": "qs252050"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252059",
|
||||
"parent_id": "qs252050"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252061",
|
||||
"parent_id": "qs252060"
|
||||
},
|
||||
{
|
||||
"section_id": "qs252062",
|
||||
"parent_id": "qs252060"
|
||||
}
|
||||
]
|
1043294
relation/data/joint_qanon_170k_newface.json
Normal file
1043294
relation/data/joint_qanon_170k_newface.json
Normal file
File diff suppressed because one or more lines are too long
0
relation/data/long_ids.txt
Normal file
0
relation/data/long_ids.txt
Normal file
1378290
relation/data/main_qanon_170k_new.json
Normal file
1378290
relation/data/main_qanon_170k_new.json
Normal file
File diff suppressed because one or more lines are too long
|
@ -27,8 +27,8 @@ all_prev_chunks_counter = 0
|
|||
errors = []
|
||||
temp_small_qanon_counter = 0
|
||||
for qid in tqdm(data):
|
||||
if(qid == "qq114127"):
|
||||
pass
|
||||
# if(qid == "qq114127"):
|
||||
# pass
|
||||
print('progress: ' + str(((count + 1)/datalength)*100))
|
||||
q_sections = data[qid]
|
||||
new_qanon_chunks = []
|
||||
|
@ -41,6 +41,11 @@ for qid in tqdm(data):
|
|||
for caption, item in df_sorted.iterrows():
|
||||
all_prev_chunks_counter += 1
|
||||
# اگر شناسه والد برابر با صفر نبود به این معناست که این ماده، والدی دارد که باید متن والد به متن فرزند اضافه شود
|
||||
section_id = item['section_id']
|
||||
parent_id = item['parent_id']
|
||||
child_order = item['child_order']
|
||||
level = item['level']
|
||||
number_text = item['number-text']
|
||||
if item['parent_id'] != '0':
|
||||
try:
|
||||
father_content = df_sorted[df_sorted['section_id'] == item['parent_id']]._values[0][1]
|
||||
|
@ -53,27 +58,49 @@ for qid in tqdm(data):
|
|||
pass # آیا متن پدر اگر بزرگتر از 30 باشد نیاز به اضافه شدن به متن فرزند دارد؟
|
||||
new_chunk = father_content + " " + item['content']
|
||||
all_new_chunks_counter += 1
|
||||
if len(new_chunk.split()) >512:
|
||||
long_chunks_id.append(item['section_id'])
|
||||
# 927 sections are long!!!
|
||||
if len(new_chunk.split()) >512:# اگر طول سکشن جدید بزرگتر از 512 توکن است، فقط متن سکشن فرزند را نگهدارد و بی خیال والد شود
|
||||
# با توجه به صحبت آقای دلدار در کل می توان در چنین مواردی میانگین امبدینگ فرزند و والد را بدست آورد و نیازی به ذخیره و ارسال متن کامل تلفیق شده از این دو تا نیز نیست
|
||||
|
||||
# می توان برای چنین مواردی فقط امبدینگ موجودیت های نامدار، موضوع و عنوان قانونی که این سکشن ذیل آن است را در نظر گرفت
|
||||
long_chunks_id.append(item['section_id'])
|
||||
else:# در این حالت، ماده دارای والد نیست
|
||||
new_chunk = item['content']
|
||||
all_new_chunks_counter += 1
|
||||
|
||||
|
||||
section_dict = {
|
||||
'id': section_id,
|
||||
'child_order' : child_order,
|
||||
'content': new_chunk,
|
||||
'parent_id': parent_id,
|
||||
'level' : level,
|
||||
'number_text' : number_text
|
||||
}
|
||||
if len(new_chunk.split()) < 30:
|
||||
temp_small_sections.append(new_chunk)
|
||||
|
||||
temp_small_sections.append(section_dict)
|
||||
all_new_chunks_counter -= 1
|
||||
continue
|
||||
|
||||
|
||||
new_qanon_chunks.append(new_chunk)
|
||||
new_qanon_chunks.append(section_dict)
|
||||
|
||||
if len(new_qanon_chunks) == 0:
|
||||
if len(temp_small_sections) == 2:
|
||||
# ادغام سکشنی که برابر با عنوان قانون است با تک سکشن دیگر که ذیل آن عنوان است
|
||||
# temp_small_sections.pop(0)
|
||||
temp_section = temp_small_sections[0] + " " + temp_small_sections[1]
|
||||
temp_section = temp_small_sections[0]
|
||||
temp_section_content = temp_section['content'] + " " + temp_small_sections[1]['content']
|
||||
section_dict = {
|
||||
'id': temp_section['id'],
|
||||
'child_order' : temp_section['child_order'],
|
||||
'content': temp_section_content,
|
||||
'parent_id': temp_section['parent_id'],
|
||||
'level' : temp_section['level'],
|
||||
'number_text' : temp_section['number_text']
|
||||
}
|
||||
temp_small_sections = []
|
||||
temp_small_sections.append(temp_section)
|
||||
temp_small_sections.append(section_dict)
|
||||
|
||||
new_qanon_chunks = temp_small_sections
|
||||
temp_small_qanon_counter += 1
|
||||
|
@ -83,12 +110,11 @@ for qid in tqdm(data):
|
|||
"new_sections": new_qanon_chunks
|
||||
})
|
||||
|
||||
print("long_chunks: " + str(len(long_chunks_id)))
|
||||
print("all_prev_chunks_counter: " + str(all_prev_chunks_counter))
|
||||
print("all_new_chunks_counter: " + str(all_new_chunks_counter))
|
||||
print("temp_small_qanon_counter: " + str(temp_small_qanon_counter))
|
||||
print()
|
||||
outputfile = open('./data/joint_qanon_170k_new.json', "w", encoding='utf-8')
|
||||
outputfile = open('./data/joint_qanon_170k_newface.json', "w", encoding='utf-8')
|
||||
outputfile.write(json.dumps(new_qanon_sections, ensure_ascii=False, indent = 4))
|
||||
outputfile.close()
|
||||
|
||||
|
@ -99,5 +125,16 @@ outputfile.close()
|
|||
print(len(new_qanon_sections))
|
||||
print(f'join count {count}')
|
||||
end_time = time.time()
|
||||
print('====================================')
|
||||
print('====================================')
|
||||
print("long_chunks: " + str(len(long_chunks_id)))
|
||||
long_chunks_ids_text = ''
|
||||
for idd in long_chunks_id:
|
||||
long_chunks_ids_text += idd + '\n'
|
||||
outputfile = open('./data/long_ids.txt', "w", encoding='utf-8')
|
||||
outputfile.write(long_chunks_ids_text)
|
||||
outputfile.close()
|
||||
print('====================================')
|
||||
print('====================================')
|
||||
print(f"elapsed time: {end_time-start_time}")
|
||||
print("end")
|
Loading…
Reference in New Issue
Block a user