nahj_rag/detect_how_to_represent.py
2026-02-17 16:52:37 +00:00

68 lines
1.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import csv
import re
# Input CSV file; it is expected to provide a "text" column.
path = "./data/khamenei-new.csv"

# Output text file: one unique word per line.
OUTPUT_TXT = "./data/unique_words.txt"

# Conjunctions and other high-frequency words to ignore (extend as needed).
STOPWORDS = {
    "و",
    "یا",
    "به",
    "از",
    "در",
    "با",
    "برای",
    "که",
    "این",
    "آن",
    "است",
    "بود",
    "شد",
    "خب",
    "ولی",
    "لکن",
    "لاکن",
    "های",
    "ها",
    "می",
    "را",
    "بر",
    "هم",
    "تا",
    "اگر",
    "اما",
    "پس",
    "نه",
    "هر",
    "همه",
    "هیچ",
}
def extract_words(
    text: str,
    *,
    stopwords: "set[str] | None" = None,
    min_length: int = 2,
) -> list[str]:
    """Split *text* into words, dropping punctuation, digits and stop words.

    Every character that is neither a word character nor whitespace, and
    every digit (ASCII or otherwise, as matched by ``\\d``), is treated as a
    separator. The remaining text is split on whitespace.

    NOTE: underscores count as word characters and are kept. The zero-width
    non-joiner (U+200C) is neither a word character nor whitespace, so it
    acts as a separator and a compound written with it is split into parts.

    Args:
        text: Raw text. A falsy value (``""`` or ``None``) yields ``[]``.
        stopwords: Words to discard. Defaults to the module-level
            ``STOPWORDS`` set when ``None``.
        min_length: Minimum number of characters a word must have to be
            kept. Defaults to 2.

    Returns:
        The surviving words in their original order; duplicates are kept.
    """
    if not text:
        return []
    if stopwords is None:
        stopwords = STOPWORDS

    # One pass: punctuation/symbols and digits both become separators.
    cleaned = re.sub(r"[^\w\s]|\d", " ", text)

    # str.split() already discards surrounding whitespace, so no strip needed.
    return [
        token
        for token in cleaned.split()
        if len(token) >= min_length and token not in stopwords
    ]
# Every distinct word found across all rows of the CSV.
unique_words = set()

# Read the CSV. "utf-8-sig" strips a leading byte-order mark if one is
# present (otherwise it decodes exactly like "utf-8"), so the first header
# is read as "text" rather than "\ufefftext".
with open(path, newline="", encoding="utf-8-sig") as f:
    reader = csv.DictReader(f)
    # fieldnames is None when the file is empty; fall back to an empty tuple
    # so the intended ValueError is raised instead of a TypeError.
    if "text" not in (reader.fieldnames or ()):
        raise ValueError("CSV file does not contain a 'text' column")
    for row in reader:
        words = extract_words(row["text"])
        unique_words.update(words)

# Sort the unique words so the output is deterministic.
sorted_words = sorted(unique_words)

# Write the words to the text file, one per line.
with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
    f.writelines(f"{word}\n" for word in sorted_words)

print(f"✅ Total unique words: {len(sorted_words)}")
print(f"📄 Saved to: {OUTPUT_TXT}")