nahj_rag/convert_parts_to_context.py
2026-04-30 19:16:50 +03:30

119 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# بسم الله
"""Merge Nahj al-Balagha parts into one record per context.

This script takes the output of ``convert_sentence_to_part`` as its input and
regroups the parts so that each saying, sermon or letter becomes a single
record holding a ``paragraphs`` key with all of its parts.
"""
import json
from itertools import groupby

nahj_parts_file = "./nahj_data/all_nahj_parts.json"
output_file_path = "./nahj_data/all_nahj_CONTEXT.json"


def _paragraph_from_part(part: dict) -> dict:
    """Return the paragraph-level fields of one part record."""
    return {
        "paragraph_id": part['part_id'],
        "number": part['number'],
        "paragraph_order": part['part_order'],
        "large_title": part['large_title'],
        "text": part['part_text'],
        "arabic_text": part['arabic_text'],
    }


def _context_key(part: dict) -> tuple:
    """Return the (number, type) pair that marks which context a part is in."""
    return part['number'], part['type']


def build_contexts(parts: list) -> list:
    """Group consecutive parts sharing ``number`` and ``type`` into contexts.

    Args:
        parts: Part records in file order. Only adjacent parts are merged, so
            the same (number, type) pair appearing again later starts a new
            context.

    Returns:
        One dict per run of parts. Context-level fields (``id``, ``url``,
        ``interpretation_link``, ``title``, ``large_title``, ``type``) are
        taken from the first part of the run. An empty input gives an empty
        list instead of raising ``IndexError``.
    """
    contexts = []
    for _, run in groupby(parts, key=_context_key):
        members = list(run)
        head = members[0]  # the first part supplies the context-level fields
        contexts.append({
            "id": head['context_id'],
            "url": head['url'],
            "interpretation_link": head['interpretation_link'],
            "title": head['title'],
            "large_title": head['large_title'],
            "type": head['type'],
            "paragraphs": [_paragraph_from_part(member) for member in members],
        })
    return contexts


# --- 1. Load the data from JSON ---
with open(nahj_parts_file, 'r', encoding='utf-8') as f:
    all_parts = json.load(f)

# --- 2. Group the parts into contexts ---
final_list = build_contexts(all_parts)

# --- 3. Write the result; ensure_ascii=False keeps the text unescaped ---
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(final_list, f, ensure_ascii=False, indent=2)