# Flair_NER/remove_all_o_sentences.py

def _read_sentences(input_file):
    """Return the sentences of *input_file* as lists of stripped token lines.

    A blank line or a line consisting solely of ``...`` ends the current
    sentence; consecutive separators do not produce empty sentences.
    """
    sentences = []
    current_sentence = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            stripped_line = raw_line.strip()
            if stripped_line and stripped_line != '...':
                current_sentence.append(stripped_line)
            elif current_sentence:
                sentences.append(current_sentence)
                current_sentence = []
    # The file may end without a trailing separator.
    if current_sentence:
        sentences.append(current_sentence)
    return sentences


def _is_all_o(sentence):
    """Return True when every tag (second column) in *sentence* is ``O``.

    Lines with fewer than two columns are ignored, so a sentence without any
    parseable tag also counts as all-``O``.
    """
    tags = [parts[1] for parts in map(str.split, sentence) if len(parts) >= 2]
    return all(tag == 'O' for tag in tags)


def _extract_entities(sentences, label):
    """Collect the ``B-<label>``/``I-<label>`` spans found in *sentences*.

    Returns one space-joined string per span, in order of appearance and
    including duplicates. An ``I-`` tag with no open span is ignored, and a
    span never continues across a sentence boundary.
    """
    begin_tag = f'B-{label}'
    inside_tag = f'I-{label}'
    entities = []
    for sent in sentences:
        current = []
        for token_line in sent:
            parts = token_line.split()
            if len(parts) < 2:
                continue
            token, tag = parts[0], parts[1]
            if tag == begin_tag:
                # A new B- tag closes any span that is still open.
                if current:
                    entities.append(' '.join(current))
                current = [token]
            elif tag == inside_tag:
                if current:
                    current.append(token)
            elif current:
                entities.append(' '.join(current))
                current = []
        # Close a span left open at the end of the sentence.
        if current:
            entities.append(' '.join(current))
    return entities


def analyze_and_filter_dataset(input_file, output_file, report_file):
    """Drop all-``O`` sentences from an NER dataset and write a summary report.

    The input is read as UTF-8 text with one token per line in
    whitespace-separated columns: the first column is the token and the
    second is its tag. Blank lines and ``...`` lines separate sentences.

    Sentences whose tags are all ``O`` are removed. The remaining sentences
    are written to *output_file*, each followed by a blank line. A report
    with sentence, token and ORG/REF entity counts plus the unique entities
    (sorted, so the report is identical from run to run) is printed and
    written to *report_file*.

    Args:
        input_file: Path of the dataset to read.
        output_file: Path where the filtered dataset is written.
        report_file: Path where the report is written.

    Returns:
        None.
    """
    sentences = _read_sentences(input_file)

    # Sentence statistics.
    total_sentences = len(sentences)
    filtered_sentences = [sent for sent in sentences if not _is_all_o(sent)]
    kept_sentences = len(filtered_sentences)
    all_o_count = total_sentences - kept_sentences

    # Token count: every line of a kept sentence counts as one token.
    total_tokens = sum(len(sent) for sent in filtered_sentences)

    # Entity extraction.
    org_entities = _extract_entities(filtered_sentences, 'ORG')
    ref_entities = _extract_entities(filtered_sentences, 'REF')

    # Sort the unique entities: bare set iteration order varies between runs
    # because of string hash randomization.
    unique_org_entities = sorted(set(org_entities))
    unique_ref_entities = sorted(set(ref_entities))

    # Build the report.
    report = [
        "📊 گزارش تحلیل دیتاست NER",
        "=" * 60,
        f"✅ تعداد کل جملات در دیتاست: {total_sentences}",
        f"❌ تعداد جملات حذف‌شده (همه تگ O): {all_o_count}",
        f"✅ تعداد جملات باقی‌مانده (دارای موجودیت): {kept_sentences}",
        f"🔤 تعداد کل توکن‌ها در جملات باقی‌مانده: {total_tokens}",
        f"🏢 تعداد موجودیت ORG (سازمان): {len(org_entities)}",
        f"📜 تعداد موجودیت REF (ارجاع به قانون/ماده): {len(ref_entities)}",
        f"🔍 تعداد موجودیت منحصربه‌فرد ORG: {len(unique_org_entities)}",
        f"🔍 تعداد موجودیت منحصربه‌فرد REF: {len(unique_ref_entities)}",
    ]

    report.append("\n📋 همه موجودیت‌های منحصربه‌فرد ORG:")
    if unique_org_entities:
        report.extend(f"  • {ent}" for ent in unique_org_entities)
    else:
        report.append("  • هیچ موجودیت ORG‌ای یافت نشد.")

    report.append("\n📋 همه موجودیت‌های منحصربه‌فرد REF:")
    if unique_ref_entities:
        report.extend(f"  • {ent}" for ent in unique_ref_entities)
    else:
        report.append("  • هیچ موجودیت REF‌ای یافت نشد.")

    report.append("\n✅ پردازش کامل شد.")
    report.append(f"💾 خروجی فیلترشده در '{output_file}' ذخیره شد.")
    report.append(f"📄 گزارش کامل در '{report_file}' ذخیره شد.")

    # Print the report to the console.
    for line in report:
        print(line)

    # Save the report.
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    # Save the filtered dataset, one blank line after each sentence.
    with open(output_file, 'w', encoding='utf-8') as f:
        for sent in filtered_sentences:
            for line in sent:
                f.write(line + '\n')
            f.write('\n')

# Run the filter on the default dataset paths only when this file is executed
# as a script, so that importing the module does not trigger any file I/O.
if __name__ == "__main__":
    analyze_and_filter_dataset(
        input_file='data/DATASET140402_ref_org.txt',
        output_file='DATASET140402_ref_org_filtered.txt',
        report_file='گزارش_تحلیل_دیتاست.txt',
    )