nahj_rag/convert_sentence_to_part.py
2026-04-30 19:16:50 +03:30

106 lines
4.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# بسم الله
import json
# This script takes the Nahj al-Balagha JSON dictionary, which holds sentences
# (split on periods), and converts it into parts (paragraphs).
# The JSON file it takes as input is the output of unify_embedder in the
# nahj directory.

nahj_sentence_file = "./nahj_data/nahj_vector_bge_m3.json"
output_file_path = "./nahj_data/all_nahj_parts.json"

# Consecutive sentences that agree on all of these fields belong to one part.
_GROUP_KEYS = ("part_orders", "number", "type")


def _new_part(sentence):
    """Build the output record for a part from the first sentence of its group.

    All metadata of the part is taken from this first sentence; note that the
    input keys ``part_orders``, ``Interpretation_link`` and ``sentence`` are
    emitted as ``part_order``, ``interpretation_link`` and ``part_text``.

    Raises:
        KeyError: if the sentence record lacks one of the expected fields.
    """
    return {
        "id": sentence["id"],
        "context_id": sentence["context_id"],
        "part_id": sentence["part_id"],
        "number": sentence["number"],
        "part_order": sentence["part_orders"],
        "url": sentence["url"],
        "interpretation_link": sentence["Interpretation_link"],
        "title": sentence["title"],
        "large_title": sentence["large_title"],
        "part_text": sentence["sentence"],
        "arabic_text": sentence["arabic_text"],
        "type": sentence["type"],
    }


def _join_sentence(part_text, sentence_text):
    """Append ``sentence_text`` to ``part_text``, separated by a period and space.

    The accumulated text is stripped first; a period is only inserted when the
    accumulated text does not already end with one.
    """
    part_text = part_text.strip()
    if not part_text:
        # Nothing accumulated yet (empty/whitespace-only sentence): avoid
        # indexing into an empty string and avoid a leading ". ".
        return sentence_text
    separator = " " if part_text.endswith(".") else ". "
    return part_text + separator + sentence_text


def sentences_to_parts(all_sentences):
    """Merge consecutive sentences of the same part into one record per part.

    Args:
        all_sentences: mapping of sentence key -> sentence record (dict), in
            document order. Each record must provide the fields read by
            ``_new_part``.

    Returns:
        A list of part dicts, in order of first appearance. Each part carries
        the metadata of its first sentence and the joined text of all its
        sentences under ``part_text``. An empty mapping yields an empty list.
    """
    parts = []
    current = None
    current_key = None
    for sentence in all_sentences.values():
        key = tuple(sentence[name] for name in _GROUP_KEYS)
        if current is not None and key == current_key:
            current["part_text"] = _join_sentence(
                current["part_text"], sentence["sentence"]
            )
        else:
            # The first record, or a change in any grouping field, starts a
            # new part seeded from this sentence.
            current = _new_part(sentence)
            current_key = key
            parts.append(current)
    return parts


if __name__ == "__main__":
    # --- 1. Load the data from JSON ---
    with open(nahj_sentence_file, "r", encoding="utf-8") as f:
        all_sentences = json.load(f)

    # --- 2. Group the sentences into parts ---
    all_parts = sentences_to_parts(all_sentences)

    # --- 3. Write the parts out as JSON (non-ASCII text kept readable) ---
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(all_parts, f, ensure_ascii=False, indent=2)