# In the name of God
import json

from normalizer import cleaning
from elastic_helper import ElasticHelper


def full_path_text_maker(full_path):
    """
    Takes the tree path of a section and rebuilds a single text from its
    parts, ordered from the most specific element to the most general one,
    and returns it.

    Args:
        full_path (list): list of the elements that make up the tree path of this section

    Returns:
        full_path_text (str): the text rebuilt from the section's path
    """
    # Join the path elements in reverse order, separated by "از" ("of"),
    # so the result reads part-to-whole.
    full_path_text = " از ".join(reversed(full_path))
    return full_path_text.strip()


if __name__ == "__main__":
    eh_obj = ElasticHelper()
    path = "./data/mj_qa_section-v02.zip"
    sections_elastic = eh_obj.iterateJsonFile(path, True)

    all_count = 0
    dont_cares = []
    ALL_SECTIONS = []
    n = 0

    # Section types to skip: chapter, closing text, signature, title.
    filtered_keys = ['فصل', 'موخره', 'امضاء', 'عنوان']

    for item in sections_elastic:
        source = item['source']
        section_id = item['id']
        section_path = source['other_info']['full_path']

        # Skip sections whose last path element (or, for flat paths, the
        # whole path) contains one of the filtered keys.
        flag = False
        if '>' in section_path:
            path_parts = section_path.split('>')
            for key in filtered_keys:
                if key in path_parts[-1]:
                    dont_cares.append(section_id)
                    flag = True
                    break
        else:
            for key in filtered_keys:
                if key in section_path:
                    dont_cares.append(section_id)
                    flag = True
                    break
        if flag:
            continue

        qanon_title = source['qanon_title']
        full_path_text = full_path_text_maker(section_path.split('>'))
        section_prefix = f"محتوای {full_path_text} {cleaning(qanon_title)} عبارت است از: "

        try:
            content = cleaning(source['content'])
            sentences = content.split(".")
            # # Drop very short sections that effectively have no content.
            # if len(content.split()) <= 10:
            #     continue
        except Exception as error:
            print(error)
            continue

        for sentence in sentences:
            if sentence == "":
                continue
            all_count += 1
            sentence_id = f"sn{n}"
            n += 1
            data = {
                'id': section_id,
                'sentence_id': sentence_id,
                'fullpath': section_path,
                'qanon-title': qanon_title,
                'section-prefix': section_prefix,
                'sentence-content': sentence,
            }
            ALL_SECTIONS.append(data)

    with open('ALL_SECTIONS.json', 'w', encoding='utf-8') as f:
        json.dump(ALL_SECTIONS, f, indent=4, ensure_ascii=False)

    print(f'all_count: {all_count}')
    print(f'dont_cares: {len(dont_cares)}')
    print(f'ALL_SECTIONS without dont-cares: {len(ALL_SECTIONS)}')
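
# Illustrative example of what full_path_text_maker produces. The path
# elements below are hypothetical, not taken from the real dataset:
#
#   full_path_text_maker(['قانون بودجه', 'ماده 1', 'تبصره 2'])
#   -> 'تبصره 2 از ماده 1 از قانون بودجه'
#
# i.e. the list is stored whole-to-part, and the output reads part-to-whole
# ("clause 2 of article 1 of the budget law").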