68 lines
1.6 KiB
Python
68 lines
1.6 KiB
Python
|
||
|
||
import csv
|
||
import re
|
||
|
||
# Location of the input CSV file.
path = './data/khamenei-new.csv'

# Location of the output text file.
OUTPUT_TXT = "./data/unique_words.txt"

# Conjunctions and other high-frequency words that are ignored during
# extraction (extend as needed).
STOPWORDS = {
    "و",
    "یا",
    "به",
    "از",
    "در",
    "با",
    "برای",
    "که",
    "این",
    "آن",
    "است",
    "بود",
    "شد",
    "خب",
    "ولی",
    "لکن",
    "لاکن",
    "های",
    "ها",
    "می",
    "را",
    "بر",
    "هم",
    "تا",
    "اگر",
    "اما",
    "پس",
    "نه",
    "هر",
    "همه",
    "هیچ",
}
|
||
|
||
|
||
def extract_words(text: str) -> list[str]:
    """Split *text* into candidate words, dropping noise tokens.

    Every character that is neither a word character nor whitespace is
    replaced by a space, then every run of digits is replaced by a space, and
    the result is split on whitespace. Tokens shorter than two characters and
    tokens listed in ``STOPWORDS`` are discarded.

    Returns the surviving tokens in their original order, duplicates
    included. A falsy *text* (``None`` or ``""``) yields an empty list.
    """
    if not text:
        return []

    # Punctuation (Persian and English alike) becomes a separator.
    # NOTE: the zero-width non-joiner (U+200C) is not a word character either,
    # so compound forms joined by it are split into separate tokens here.
    without_punctuation = re.sub(r"[^\w\s]", " ", text)

    # Digit runs become separators too, so "abc123def" yields two tokens.
    without_digits = re.sub(r"\d+", " ", without_punctuation)

    # The length check runs first; only tokens of two or more characters are
    # looked up in the stop-word set.
    return [
        token
        for token in without_digits.split()
        if len(token) >= 2 and token not in STOPWORDS
    ]
|
||
|
||
|
||
# Every distinct word found in the CSV's "text" column.
unique_words = set()

# Read the CSV.
# "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order mark,
# which would otherwise be glued onto the first column name.
with open(path, newline="", encoding="utf-8-sig") as f:
    reader = csv.DictReader(f)

    # fieldnames is None for an empty file; report that as a missing column
    # instead of letting the membership test raise TypeError.
    if not reader.fieldnames or "text" not in reader.fieldnames:
        raise ValueError("CSV file does not contain a 'text' column")

    for row in reader:
        unique_words.update(extract_words(row["text"]))

# Sort the unique words (default string ordering).
sorted_words = sorted(unique_words)

# Write the words to the output text file, one per line.
with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
    f.writelines(f"{word}\n" for word in sorted_words)

print(f"✅ Total unique words: {len(sorted_words)}")
print(f"📄 Saved to: {OUTPUT_TXT}")
|