106 lines
4.1 KiB
Python
106 lines
4.1 KiB
Python
# بسم الله
|
||
|
||
|
||
|
||
import json
|
||
|
||
'''
|
||
این سورس دیکشنری جیسونی نهج البلاغه که شامل سنتنس ها (جدا شده بر اساس نقطه) هستش رو
|
||
به پارت ها تبدیل میکنه (پاراگراف)
|
||
فایل جیسونی که به عنوان ورودی میگیره ، خروجی unify_embedder دایرکتوری nahj هستش
|
||
'''
|
||
|
||
|
||
|
||
# Input: JSON object mapping keys such as "sentence-1" to sentence records.
nahj_sentence_file = "./nahj_data/nahj_vector_bge_m3.json"
# Output: JSON list with one record per merged part.
output_file_path = "./nahj_data/all_nahj_parts.json"

# (output key, sentence-record key) pairs, in the order they are written to
# each part. Two names differ between input and output: "part_orders" becomes
# "part_order" and "Interpretation_link" becomes "interpretation_link"; the
# sentence text is stored under "part_text".
_PART_FIELD_SOURCES = (
    ("id", "id"),
    ("context_id", "context_id"),
    ("part_id", "part_id"),
    ("number", "number"),
    ("part_order", "part_orders"),
    ("url", "url"),
    ("interpretation_link", "Interpretation_link"),
    ("title", "title"),
    ("large_title", "large_title"),
    ("part_text", "sentence"),
    ("arabic_text", "arabic_text"),
    ("type", "type"),
)


def _start_part(sentence):
    """Return a new part record seeded from the first sentence of the part."""
    return {out_key: sentence[src_key] for out_key, src_key in _PART_FIELD_SOURCES}


def _append_sentence(part_text, next_sentence):
    """Return ``part_text`` with ``next_sentence`` appended after a period.

    The accumulated text is stripped first. If it already ends with "." only
    a space is inserted, otherwise ". " is. An empty accumulated text simply
    becomes ``next_sentence`` (indexing ``[-1]`` on it used to raise
    ``IndexError``).
    """
    head = part_text.strip()
    if not head:
        return next_sentence
    separator = " " if head.endswith(".") else ". "
    return head + separator + next_sentence


def merge_sentences_into_parts(all_sentences):
    """Merge consecutive sentences that belong to the same part.

    Args:
        all_sentences: mapping of sentence key to sentence record. Each
            record must provide the keys "id", "context_id", "part_id",
            "number", "part_orders", "url", "Interpretation_link", "title",
            "large_title", "sentence", "arabic_text" and "type". Records are
            processed in the mapping's iteration order.

    Returns:
        A list of part dicts. Consecutive sentences sharing the same
        "part_orders", "number" and "type" are joined into one "part_text";
        every other field is taken from the first sentence of the part.
        An empty mapping yields an empty list.

    Raises:
        KeyError: if a sentence record lacks one of the required keys.
    """
    parts = []
    current = None
    # Seed from whichever record comes first instead of a hard-coded
    # "sentence-1" key, so no record is skipped or merged twice.
    for sentence in all_sentences.values():
        same_part = (
            current is not None
            and sentence["part_orders"] == current["part_order"]
            and sentence["number"] == current["number"]
            and sentence["type"] == current["type"]
        )
        if same_part:
            current["part_text"] = _append_sentence(
                current["part_text"], sentence["sentence"]
            )
        else:
            # The part is appended as soon as it starts and extended in
            # place, so the final part needs no special handling after the
            # loop.
            current = _start_part(sentence)
            parts.append(current)
    return parts


def main():
    """Read the sentence JSON, merge it into parts and write the parts JSON."""
    # --- 1. Load the data from JSON ---
    with open(nahj_sentence_file, 'r', encoding='utf-8') as f:
        all_sentences = json.load(f)

    all_parts = merge_sentences_into_parts(all_sentences)

    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(all_parts, f, ensure_ascii=False, indent=2)


# Run the conversion only when executed as a script, so that importing the
# module for merge_sentences_into_parts does not touch the filesystem.
if __name__ == "__main__":
    main()
|
||
|
||
|
||
|