# nahj_rag/detect_how_to_represent.py



import csv
import re

# Input CSV file path
path = "./data/khamenei-new.csv"

# Output text file path (one unique word per line)
OUTPUT_TXT = "./data/unique_words.txt"


# Conjunctions and other high-frequency words to drop (extend as needed)
STOPWORDS = {
    "و", "یا", "به", "از", "در",
    "با", "برای", "که", "این", "آن",
    "است", "بود", "شد", "خب", "ولی",
    "لکن", "لاکن", "های", "ها", "می",
    "را", "بر", "هم", "تا", "اگر",
    "اما", "پس", "نه", "هر", "همه",
    "هیچ",
}


# One or more characters that cannot be part of a kept word: anything that is
# not a Unicode word character (punctuation, symbols, whitespace, format
# characters such as the zero-width non-joiner), any decimal digit (ASCII or
# Persian/Arabic), and the underscore, which \w would otherwise let through.
_NON_WORD_RUN = re.compile(r"[\W\d_]+")


def extract_words(text: str) -> list[str]:
    """Split *text* into candidate vocabulary words.

    Punctuation, digits and underscores are treated as separators. Tokens
    shorter than two characters and tokens listed in ``STOPWORDS`` are
    dropped. Order of appearance is preserved and duplicates are kept;
    de-duplication is left to the caller.

    Args:
        text: Raw text to tokenize. Any falsy value (``""`` or ``None``)
            yields an empty list.

    Returns:
        The surviving tokens, in the order they occur in *text*.
    """
    if not text:
        return []

    # Collapse every separator run into a single space so that str.split()
    # produces clean tokens with no surrounding whitespace.
    cleaned = _NON_WORD_RUN.sub(" ", text)

    # The length test runs first so very short fragments are rejected
    # before the stop-word lookup.
    return [
        token
        for token in cleaned.split()
        if len(token) >= 2 and token not in STOPWORDS
    ]


unique_words = set()

# Read the CSV and collect every word produced by extract_words().
# "utf-8-sig" decodes plain UTF-8 too, but also strips a leading byte-order
# mark so the first header name is not read with a "\ufeff" prefix.
with open(path, newline="", encoding="utf-8-sig") as f:
    reader = csv.DictReader(f)

    # fieldnames is None when the file has no header row (e.g. it is empty);
    # check for that first so the ValueError below is raised rather than a
    # TypeError from the membership test.
    if not reader.fieldnames or "text" not in reader.fieldnames:
        raise ValueError("CSV file does not contain a 'text' column")

    for row in reader:
        words = extract_words(row["text"])
        unique_words.update(words)


# Sort the unique words so the output file is deterministic
sorted_words = sorted(unique_words)

# Write the vocabulary to the text file, one word per line
with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
    f.writelines(f"{word}\n" for word in sorted_words)


print(f"✅ Total unique words: {len(sorted_words)}")
print(f"📄 Saved to: {OUTPUT_TXT}")