105 lines
3.4 KiB
Python
105 lines
3.4 KiB
Python
# بسم الله
|
|
|
|
import json
|
|
from normalizer import cleaning
|
|
# try:
|
|
from elastic_helper import ElasticHelper
|
|
# except Exception as error:
|
|
# eee = error
|
|
# pass
|
|
|
|
|
|
|
|
|
|
def full_path_text_maker(full_path):
    """Rebuild a human-readable text from a section's tree path.

    Walks the path elements from the most specific to the most general
    (i.e. in reverse order) and joins them with the Persian connector
    " از " ("of"), e.g. ['a', 'b', 'c'] -> 'c از b از a'.

    Args:
        full_path (list): elements describing the section's tree path,
            ordered from the most general to the most specific.

    Returns:
        str: the reconstructed path text (empty string for an empty path).
    """
    # str.join replaces the original quadratic `+=` loop; the f-string-style
    # str() conversion and the final strip() mirror the original behavior.
    return " از ".join(str(part) for part in reversed(full_path)).strip()
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    eh_obj = ElasticHelper()
    # Raw string: the original ".\data\..." literal relied on "\d" and "\m"
    # being unrecognized escapes (DeprecationWarning, SyntaxError in future
    # Python). The runtime value is unchanged.
    path = r".\data\mj_qa_section-v02.zip"
    sections_elastic = eh_obj.iterateJsonFile(path, True)

    # Sections whose final path segment contains any of these markers
    # (chapter, postscript, signature, title) carry no real content.
    filtered_keys = ['فصل', 'موخره', 'امضاء', 'عنوان']

    all_count = 0       # total number of non-empty sentences emitted
    dont_cares = []     # ids of sections skipped by the filter above
    ALL_SECTIONS = []   # collected sentence records for the output file
    n = 0               # running counter used to build sentence ids

    for index, item in enumerate(sections_elastic):
        source = item['source']
        section_id = item['id']  # renamed from `id` to avoid shadowing the builtin
        section_path = source['other_info']['full_path']

        # Skip non-content sections. When the path has no '>' separator,
        # split() yields the whole path as the single (= last) part, so this
        # single check covers both branches of the original duplicated logic.
        last_part = section_path.split('>')[-1]
        if any(key in last_part for key in filtered_keys):
            dont_cares.append(section_id)
            continue

        qanon_title = source['qanon_title']
        full_path_text = full_path_text_maker(section_path.split('>'))
        section_prefix = f"محتوای {full_path_text} {cleaning(qanon_title)} عبارت است از: "

        try:
            # cleaning() (or a missing 'content' key) may raise; a bad section
            # is reported and skipped rather than aborting the whole run.
            content = cleaning(source['content'])
        except Exception as error:
            print(error)
            continue
        sentences = content.split(".")

        for sentence in sentences:
            if sentence == "":
                continue
            all_count += 1
            sentence_id = f"sn{n}"
            n += 1

            ALL_SECTIONS.append({
                'id': section_id,
                "sentence_id": sentence_id,
                'fullpath': section_path,
                'qanon-title': qanon_title,
                'section-prefix': section_prefix,
                'sentence-content': sentence,
            })

    with open('ALL_SECTIONS.json', 'w', encoding='utf-8') as f:
        json.dump(ALL_SECTIONS, f, indent=4, ensure_ascii=False)

    print(f'all_count: {all_count}')
    print(f'dont_cares: {len(dont_cares)}')
    print(f'ALL_SECTIONS without dont-cares: {len(ALL_SECTIONS)}')