# rag_qavanin_api/split_content_to_sentences.py

# In the name of God
import json
from normalizer import cleaning
from elastic_helper import ElasticHelper
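
# Note: ElasticHelper is a project-local helper. Judging from its use below,
# iterateJsonFile(path, True) is assumed to yield dicts shaped like:
#   {'id': ..., 'source': {'content': ..., 'qanon_title': ...,
#                          'other_info': {'full_path': 'part1>part2>...'}}}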
def full_path_text_maker(full_path):
    """
    Take a section's path and rebuild a text from its parts, ordered from the
    most specific to the most general, and return it.

    Args:
        full_path (list): list of elements identifying this section's tree path

    Returns:
        full_path_text (str): text rebuilt from the section's path
    """
    # Join the path parts from leaf to root with "از" ("of"),
    # e.g. ["part1", "part2"] -> "part2 از part1".
    full_path_text = ' از '.join(f'{path_item}' for path_item in reversed(full_path))
    return full_path_text.strip()
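
# A minimal usage sketch (the sample path parts are illustrative, not taken
# from the real data):
#
#   >>> full_path_text_maker(['قانون مدنی', 'باب اول', 'ماده ۱'])
#   'ماده ۱ از باب اول از قانون مدنی'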

if __name__ == "__main__":
    eh_obj = ElasticHelper()
    path = r".\data\mj_qa_section-v02.zip"
    sections_elastic = eh_obj.iterateJsonFile(path, True)

    all_count = 0
    dont_cares = []
    ALL_SECTIONS = []
    n = 0
    for index, item in enumerate(sections_elastic):
        source = item['source']
        section_id = item['id']  # renamed from `id` to avoid shadowing the builtin
        section_path = source['other_info']['full_path']

        # Skip sections whose final path part is a chapter heading ('فصل'),
        # closing text ('موخره'), signature block ('امضاء'), or bare title
        # ('عنوان'); they carry no body content worth indexing.
        filtered_keys = ['فصل', 'موخره', 'امضاء', 'عنوان']
        last_path_part = section_path.split('>')[-1]
        if any(key in last_path_part for key in filtered_keys):
            dont_cares.append(section_id)
            continue

        qanon_title = source['qanon_title']
        full_path_text = full_path_text_maker(section_path.split('>'))
        # Persian prefix, roughly: "The content of <path> of <law title> is: "
        section_prefix = f"محتوای {full_path_text} {cleaning(qanon_title)} عبارت است از: "
        try:
            content = cleaning(source['content'])
            # Naive sentence split on '.'; empty pieces are skipped below.
            sentences = content.split(".")
            # # Skip very short sections that have effectively no content:
            # if len(content.split()) <= 10:
            #     continue
        except Exception as error:
            print(error)
            continue
        for sentence in sentences:
            if sentence.strip() == "":
                continue
            all_count += 1
            sentence_id = f"sn{n}"
            n += 1
            data = {
                'id': section_id,
                'sentence_id': sentence_id,
                'fullpath': section_path,
                'qanon-title': qanon_title,
                'section-prefix': section_prefix,
                'sentence-content': sentence,
            }
            ALL_SECTIONS.append(data)
    with open('ALL_SECTIONS.json', 'w', encoding='utf-8') as f:
        json.dump(ALL_SECTIONS, f, indent=4, ensure_ascii=False)

    print(f'all_count: {all_count}')
    print(f'dont_cares: {len(dont_cares)}')
    print(f'ALL_SECTIONS without dont-cares: {len(ALL_SECTIONS)}')
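
# A minimal sketch of consuming the output file (assumes ALL_SECTIONS.json is
# next to this script; the prompt assembly shown is illustrative only):
#
#   import json
#   with open('ALL_SECTIONS.json', encoding='utf-8') as f:
#       records = json.load(f)
#   first = records[0]
#   print(first['section-prefix'] + first['sentence-content'])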