# rag_qavanin_api/split_content_to_sentences.py

# In the name of God
import json
from normalizer import cleaning
from elastic_helper import ElasticHelper
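
# Note: ElasticHelper is a project-local helper. Judging from its use below,
# iterateJsonFile(path, True) is assumed to yield dicts shaped like:
#   {'id': ..., 'source': {'content': ..., 'qanon_title': ...,
#                          'other_info': {'full_path': 'part1>part2>...'}}}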
def full_path_text_maker(full_path):
    """
    Take a section's path and rebuild a text from its parts, ordered from the
    most specific to the most general, and return it.

    Args:
        full_path (list): list of elements identifying this section's tree path

    Returns:
        full_path_text (str): text rebuilt from the section's path
    """
    # Join the path parts from leaf to root with "از" ("of"),
    # e.g. ["part1", "part2"] -> "part2 از part1".
    full_path_text = ' از '.join(f'{path_item}' for path_item in reversed(full_path))
    return full_path_text.strip()
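
# A minimal usage sketch (the sample path parts are illustrative, not taken
# from the real data):
#
#   >>> full_path_text_maker(['قانون مدنی', 'باب اول', 'ماده ۱'])
#   'ماده ۱ از باب اول از قانون مدنی'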

if __name__ == "__main__":
    eh_obj = ElasticHelper()
    path = r".\data\mj_qa_section-v02.zip"
    sections_elastic = eh_obj.iterateJsonFile(path, True)

    all_count = 0
    dont_cares = []
    ALL_SECTIONS = []
    n = 0
    for index, item in enumerate(sections_elastic):
        source = item['source']
        section_id = item['id']  # renamed from `id` to avoid shadowing the builtin
        section_path = source['other_info']['full_path']

        # Skip sections whose final path part is a chapter heading ('فصل'),
        # closing text ('موخره'), signature block ('امضاء'), or bare title
        # ('عنوان'); they carry no body content worth indexing.
        filtered_keys = ['فصل', 'موخره', 'امضاء', 'عنوان']
        last_path_part = section_path.split('>')[-1]
        if any(key in last_path_part for key in filtered_keys):
            dont_cares.append(section_id)
            continue

        qanon_title = source['qanon_title']
        full_path_text = full_path_text_maker(section_path.split('>'))
        # Persian prefix, roughly: "The content of <path> of <law title> is: "
        section_prefix = f"محتوای {full_path_text} {cleaning(qanon_title)} عبارت است از: "
        try:
            content = cleaning(source['content'])
            # Naive sentence split on '.'; empty pieces are skipped below.
            sentences = content.split(".")
            # # Skip very short sections that have effectively no content:
            # if len(content.split()) <= 10:
            #     continue
        except Exception as error:
            print(error)
            continue
        for sentence in sentences:
            if sentence.strip() == "":
                continue
            all_count += 1
            sentence_id = f"sn{n}"
            n += 1
            data = {
                'id': section_id,
                'sentence_id': sentence_id,
                'fullpath': section_path,
                'qanon-title': qanon_title,
                'section-prefix': section_prefix,
                'sentence-content': sentence,
            }
            ALL_SECTIONS.append(data)
    with open('ALL_SECTIONS.json', 'w', encoding='utf-8') as f:
        json.dump(ALL_SECTIONS, f, indent=4, ensure_ascii=False)

    print(f'all_count: {all_count}')
    print(f'dont_cares: {len(dont_cares)}')
    print(f'ALL_SECTIONS without dont-cares: {len(ALL_SECTIONS)}')
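
# A minimal sketch of consuming the output file (assumes ALL_SECTIONS.json is
# next to this script; the prompt assembly shown is illustrative only):
#
#   import json
#   with open('ALL_SECTIONS.json', encoding='utf-8') as f:
#       records = json.load(f)
#   first = records[0]
#   print(first['section-prefix'] + first['sentence-content'])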