"""Build a sorted list of the unique words found in the ``text`` column of a CSV file."""

import csv
import re
import unicodedata

# Path of the input CSV file
path = './data/khamenei-new.csv'

# Path of the output text file
OUTPUT_TXT = "./data/unique_words.txt"

# Conjunctions and other high-frequency words to drop (extend as needed)
STOPWORDS = {
    "و", "یا", "به", "از", "در", "با", "برای", "که", "این", "آن",
    "است", "بود", "شد", "خب", "ولی", "لکن", "لاکن", "های", "ها", "می",
    "را", "بر", "هم", "تا", "اگر", "اما", "پس", "نه", "هر", "همه", "هیچ",
}

# Any single character that is neither a word character nor whitespace.
_NON_WORD_RE = re.compile(r"[^\w\s]")
# Runs of decimal digits (``\d`` is Unicode-aware, so Persian digits match too).
_DIGITS_RE = re.compile(r"\d+")


def _replace_symbol(match: "re.Match[str]") -> str:
    """Return the replacement for one non-word, non-space character.

    Combining marks (Unicode categories Mn/Mc/Me, e.g. Arabic-script vowel
    marks and shadda) are deleted so the letters around them stay joined.
    Everything else (punctuation, symbols, format characters) becomes a space
    and therefore acts as a token separator.
    """
    if unicodedata.category(match.group()).startswith("M"):
        return ""
    return " "


def extract_words(text: str) -> list[str]:
    """Split *text* into candidate words.

    Punctuation (Persian and English) and digits are turned into separators,
    combining marks are stripped, and tokens that are shorter than two
    characters or listed in ``STOPWORDS`` are discarded.

    Args:
        text: Raw text; ``None`` or an empty string yields an empty list.

    Returns:
        The surviving tokens in their original order (duplicates included).
    """
    if not text:
        return []

    # NOTE: the zero-width non-joiner (U+200C) is neither ``\w`` nor ``\s``,
    # so it is replaced by a space and splits compounds at that point --
    # presumably why affixes such as "ها" and "می" are listed in STOPWORDS.
    # NOTE: "_" is a ``\w`` character and is therefore kept inside tokens.
    text = _NON_WORD_RE.sub(_replace_symbol, text)
    text = _DIGITS_RE.sub(" ", text)

    # str.split() with no argument already strips surrounding whitespace.
    return [w for w in text.split() if len(w) >= 2 and w not in STOPWORDS]


def collect_unique_words(csv_path: str, column: str = "text") -> set[str]:
    """Read *csv_path* and return the set of words found in *column*.

    Args:
        csv_path: Path of a UTF-8 encoded CSV file with a header row.
        column: Name of the column holding the text to tokenize.

    Returns:
        The set of distinct words produced by ``extract_words`` over all rows.

    Raises:
        ValueError: If the file has no header row or lacks *column*.
    """
    found: set[str] = set()
    # "utf-8-sig" reads plain UTF-8 as well, but also drops a leading BOM that
    # would otherwise be glued to the first header name.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        # fieldnames is None when the file is empty (no header row at all).
        if not reader.fieldnames or column not in reader.fieldnames:
            raise ValueError(f"CSV file does not contain a '{column}' column")
        for row in reader:
            found.update(extract_words(row[column]))
    return found


def write_words(words: list[str], out_path: str) -> None:
    """Write *words* to *out_path*, one word per line, UTF-8 encoded."""
    with open(out_path, "w", encoding="utf-8") as f:
        f.writelines(f"{word}\n" for word in words)


def main() -> None:
    """Extract the unique words from the input CSV, sort them, and save them."""
    sorted_words = sorted(collect_unique_words(path))
    write_words(sorted_words, OUTPUT_TXT)

    print(f"✅ Total unique words: {len(sorted_words)}")
    print(f"📄 Saved to: {OUTPUT_TXT}")


# Guarded so that importing this module (e.g. to reuse extract_words) does not
# trigger any file I/O; running the file as a script behaves as before.
if __name__ == "__main__":
    main()