# nahj_rag/convert_sentence_to_part.py

# بسم الله


import json

'''
 This script converts the Nahj al-Balagha JSON dictionary, which contains
 sentences (split on periods), into parts (paragraphs).
 The JSON file it takes as input is the output of unify_embedder in the nahj directory.
'''


# Input: sentence-level records keyed "sentence-N" (JSON object).
nahj_sentence_file = "./nahj_data/nahj_vector_bge_m3.json"
# Output: list of paragraph-level ("part") records.
output_file_path = "./nahj_data/all_nahj_parts.json"


def _new_part(sentence: dict) -> dict:
    """Build a part record seeded from the first sentence of a group.

    The key order matches the layout written to the output JSON file.
    Note the renames: input ``part_orders`` -> output ``part_order``,
    ``Interpretation_link`` -> ``interpretation_link`` and
    ``sentence`` -> ``part_text``.
    """
    return {
        "id": sentence["id"],
        "context_id": sentence["context_id"],
        "part_id": sentence["part_id"],
        "number": sentence["number"],
        "part_order": sentence["part_orders"],
        "url": sentence["url"],
        "interpretation_link": sentence["Interpretation_link"],
        "title": sentence["title"],
        "large_title": sentence["large_title"],
        "part_text": sentence["sentence"],
        "arabic_text": sentence["arabic_text"],
        "type": sentence["type"],
    }


def _join_sentence(part_text: str, sentence_text: str) -> str:
    """Append *sentence_text* to *part_text*, separated by a period and a space.

    The accumulated text is stripped first; a period is inserted only when the
    accumulated text does not already end with one. If the accumulated text is
    empty after stripping, the new sentence is returned as-is (indexing the
    last character of an empty string used to raise IndexError here).
    """
    part_text = part_text.strip()
    if not part_text:
        return sentence_text
    separator = " " if part_text.endswith(".") else ". "
    return part_text + separator + sentence_text


def merge_sentences_into_parts(all_sentences: dict) -> list:
    """Merge consecutive sentences that belong to the same part.

    Args:
        all_sentences: mapping of sentence key -> sentence record. Records are
            visited in the mapping's iteration order; each record must provide
            the keys read by ``_new_part``.

    Returns:
        A list of part records, in order of first appearance. Consecutive
        sentences sharing the same ``part_orders``, ``number`` and ``type``
        are collapsed into one record whose metadata comes from the first
        sentence of the run and whose ``part_text`` is the joined sentences.
        An empty mapping yields an empty list.
    """
    all_parts = []
    current_part = None
    current_key = None

    for sentence in all_sentences.values():
        key = (sentence["part_orders"], sentence["number"], sentence["type"])

        if current_part is not None and key == current_key:
            # Same part as the previous sentence: extend its text.
            current_part["part_text"] = _join_sentence(
                current_part["part_text"], sentence["sentence"]
            )
            continue

        # A new part starts here; later sentences mutate this record in place.
        current_part = _new_part(sentence)
        current_key = key
        all_parts.append(current_part)

    return all_parts


def main() -> None:
    """Load the sentence file, merge sentences into parts, and write the result."""
    # --- 1. Load the data from JSON ---
    with open(nahj_sentence_file, 'r', encoding='utf-8') as f:
        all_sentences = json.load(f)

    # --- 2. Group sentences into parts ---
    all_parts = merge_sentences_into_parts(all_sentences)

    # --- 3. Write the parts back out as JSON ---
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(all_parts, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    main()