Advanced crawler for Nahj speeches
This commit is contained in:
parent 9ee755bb39
commit 3d78d3623d

crawler.py | 249 lines changed
@@ -1,8 +1,14 @@
-import requests
-from bs4 import BeautifulSoup
+import os
 import json
 import time
-import os
+import requests
+from bs4 import BeautifulSoup

+BASE_URL = "http://nahj.makarem.ir"
+DATA_DIR = "./output"
+TITLES_FILE = os.path.join(DATA_DIR, "speech-titles.json")
+FAILED_FILE = os.path.join(DATA_DIR, "failed-speech-pages.txt")
+OUTPUT_FILE = os.path.join(DATA_DIR, "nahj_speeches.json")
+
 def crawl_wisdoms():
     data = []
@@ -67,40 +73,122 @@ def crawl_wisdoms():



-OUTPUT_FILE = "./output/speechs.json"
-FAILED_FILE = "./data/failed-speech-pages.txt"
-
-
-def crawl_speech_page(page):
-    """Crawl one speech by page number and return its list of parts"""
-    url = f"http://nahj.makarem.ir/speech/{page}"
-    response = requests.get(url, timeout=10)
-    if response.status_code != 200:
-        raise Exception(f"status code {response.status_code}")
-
-    soup = BeautifulSoup(response.text, "html.parser")
-
-    # Speech title
-    title_tag = soup.find("h2", class_="text-center phrase-title")
-    speech_title = title_tag.get_text(strip=True) if title_tag else ""
-
-    # List of parts
-    parts = []
-    for idx, part_div in enumerate(soup.find_all("div", class_="phrase-text-container"), start=1):
-        # Arabic text
-        arabic_tag = part_div.find("p", class_="arabic-text show-off")
-        arabic_text = arabic_tag.get_text(" ", strip=True) if arabic_tag else ""
-
-        # Persian translation
-        persian_tag = part_div.find("p", class_="translate-text")
-        persian_translate = persian_tag.get_text(" ", strip=True) if persian_tag else ""
-
-        # Interpretation link
-        interpretation_link = ""
-        ul_tag = part_div.find("ul", class_="tools")
-        if ul_tag:
-            first_li = ul_tag.find("li")
-            if first_li:
+def fetch_page(url):
+    """Fetch a page, with retry and error handling"""
+    try:
+        response = requests.get(url, timeout=15)
+        response.raise_for_status()
+        return response.text
+    except Exception as e:
+        print(f"[ERROR] {url} => {e}")
+        return None
+
+
+def crawl_titles():
+    """Crawl the speech titles from the index pages"""
+    all_titles = []
+    speech_counter = 1
+
+    for page in range(1, 12):  # pages 1 to 11
+        url = f"{BASE_URL}/speech/page/{page}"
+        print(f"[INFO] Crawling titles page {page} => {url}")
+        html = fetch_page(url)
+        if not html:
+            continue
+
+        soup = BeautifulSoup(html, "html.parser")
+        cards = soup.select("h5.card-title a")
+        for a in cards:
+            title = a.get_text(strip=True)
+            href = a.get("href")
+            if not href.startswith("http"):
+                href = BASE_URL + href
+            all_titles.append({
+                "speech_number": speech_counter,
+                "speech_title_large": title,
+                "url": href
+            })
+            speech_counter += 1
+
+        time.sleep(1)
+
+    # Save to file
+    os.makedirs(DATA_DIR, exist_ok=True)
+    with open(TITLES_FILE, "w", encoding="utf-8") as f:
+        json.dump(all_titles, f, ensure_ascii=False, indent=2)
+
+    print(f"[DONE] Saved {len(all_titles)} speech titles to {TITLES_FILE}")
+    return all_titles
+
+
+def load_failed():
+    """Load the previously failed pages"""
+    if os.path.exists(FAILED_FILE):
+        with open(FAILED_FILE, "r", encoding="utf-8") as f:
+            return set(line.strip() for line in f if line.strip())
+    return set()
+
+
+def save_failed(failed_urls):
+    """Save the failed pages to a file (cumulative)"""
+    # previous_failed = load_failed()
+    # updated_failed = previous_failed.union(failed_urls)
+    with open(FAILED_FILE, "w", encoding="utf-8") as f:
+        for url in sorted(failed_urls):
+            f.write(url + "\n")
+
+
+def load_existing_results():
+    """Load the previously saved data"""
+    if os.path.exists(OUTPUT_FILE):
+        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
+            return json.load(f)
+    return []
+
+
+def crawl_speech_parts(speeches):
+    """Crawl all speech parts, retrying until everything has succeeded"""
+    all_results = load_existing_results()
+    done_urls = {rec["url"] for rec in all_results}
+    failed = load_failed()
+
+    while True:
+        # Pick only the speeches that have not been crawled yet
+        pending = [s for s in speeches if s["url"] not in done_urls]
+
+        # If nothing is left, stop
+        if not pending:
+            print("[SUCCESS] All speeches crawled successfully.")
+            if os.path.exists(FAILED_FILE):
+                os.remove(FAILED_FILE)  # everything succeeded -> clear the failures file
+            break
+
+        print(f"[INFO] {len(pending)} speeches remaining...")
+
+        new_failed = set()
+
+        for speech in pending:
+            speech_number = speech["speech_number"]
+            speech_title_large = speech["speech_title_large"]
+            url = speech["url"]
+
+            print(f"[INFO] Crawling speech {speech_number}: {speech_title_large}")
+            html = fetch_page(url)
+            if not html:
+                new_failed.add(url)
+                continue
+
+            soup = BeautifulSoup(html, "html.parser")
+            try:
+                speech_title = soup.find("h2", class_="text-center phrase-title").get_text()
+                parts = soup.find_all("div", class_="phrase-text-container")
+
+                for idx, part in enumerate(parts, start=1):
+                    arabic_text = part.find("p", class_="arabic-text show-off").get_text()
+                    persian_translate = part.find("p", class_="translate-text").get_text()
+                    # Interpretation link
+                    interpretation_link = ""
+                    ul_tag = part.find("ul", class_="tools")
+                    if ul_tag:
+                        first_li = ul_tag.find("li")
+                        if first_li:
@@ -108,92 +196,51 @@ def crawl_speech_page(page):
-                a_tag = first_li.find("a")
-                if a_tag and a_tag.has_attr("href"):
-                    interpretation_link = a_tag["href"]
-
-        parts.append({
-            "speech_title": speech_title,
-            "part_order": idx,
-            "url": url,
-            "arabic_text": arabic_text,
-            "persian_translate": persian_translate,
-            "Interpretation_link": interpretation_link
-        })
-
-    return parts
-
-
-def save_failed(pages):
-    with open(FAILED_FILE, "w", encoding="utf-8") as f:
-        for p in pages:
-            f.write(f"{p}\n")
-
-
-def load_failed():
-    if not os.path.exists(FAILED_FILE):
-        return []
-    with open(FAILED_FILE, "r", encoding="utf-8") as f:
-        return [int(line.strip()) for line in f if line.strip().isdigit()]
-
-
-def load_existing_data():
-    if not os.path.exists(OUTPUT_FILE):
-        return []
-    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
-        return json.load(f)
-
-
-def save_data(data):
-    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
-        json.dump(data, f, ensure_ascii=False, indent=4)
-
-
-def crawl_all_speeches(start=1, end=758):
-    all_data = load_existing_data()
-    failed = []
-
-    for page in range(start, end + 1):
-        try:
-            print(f"Trying speech {page} ...")
-            parts = crawl_speech_page(page)
-            all_data.extend(parts)
-            time.sleep(1)
-        except Exception as e:
-            print(f"❌ Failed speech {page} | error: {e}")
-            failed.append(page)
-
-    save_data(all_data)
-    save_failed(failed)
-
-
-def retry_until_success(start=1, end=758):
-    """Repeat until no page remains failed"""
-    crawl_all_speeches(start, end)
-
-    while True:
-        failed_pages = load_failed()
-        if not failed_pages:
-            print("✅ All speeches crawled successfully.")
-            break
-
-        print(f"🔄 Retrying {len(failed_pages)} failed pages ...")
-        failed = []
-        all_data = load_existing_data()
-
-        for page in failed_pages:
-            try:
-                print(f"Retry speech {page} ...")
-                parts = crawl_speech_page(page)
-                all_data.extend(parts)
-                time.sleep(1)
-            except Exception as e:
-                print(f"❌ Still failed {page} | error: {e}")
-                failed.append(page)
-
-        save_data(all_data)
-        save_failed(failed)
-
-        if not failed:
-            print("✅ Finished. No failed pages remain.")
-            break
+                        a_tag = first_li.find("a")
+                        if a_tag and a_tag.has_attr("href"):
+                            interpretation_link = a_tag["href"]
+
+                    record = {
+                        "speech_title_large": speech_title_large.strip(),
+                        "speech_title_page": speech_title.strip(),
+                        "speech_number": speech_number,
+                        "part_order": idx,
+                        "url": url,
+                        "arabic_text": arabic_text.strip(),
+                        "persian_translate": persian_translate.strip(),
+                        "Interpretation_link": BASE_URL + interpretation_link
+                    }
+                    all_results.append(record)
+
+                done_urls.add(url)
+                print(f"[OK] Crawled {url}")
+
+            except Exception as e:
+                print(f"[ERROR parsing] {url} => {e}")
+                new_failed.add(url)
+
+            time.sleep(1)
+
+            # Save the data
+            with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+                json.dump(all_results, f, ensure_ascii=False, indent=2)
+
+        # Update the failures list
+        if new_failed:
+            save_failed(new_failed)
+            print(f"[WARN] {len(new_failed)} speeches failed in this round.")
+        else:
+            if os.path.exists(FAILED_FILE):
+                os.remove(FAILED_FILE)
+
+
+def main():
+    # If the titles file does not exist, crawl it
+    if not os.path.exists(TITLES_FILE):
+        speeches = crawl_titles()
+    else:
+        with open(TITLES_FILE, "r", encoding="utf-8") as f:
+            speeches = json.load(f)
+
+    # Start crawling the parts with continuous retry
+    crawl_speech_parts(speeches)


 if __name__ == "__main__":
-    retry_until_success(1, 758)
+    main()
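Note on retries: the docstring of fetch_page mentions retry, but the function itself makes a single attempt; retrying is handled one level up, where crawl_speech_parts keeps looping over the URLs that are still pending or failed. If per-request retries were also wanted, a minimal sketch could look like the following (fetch_page_with_retry, max_attempts, and backoff are illustrative names, not part of this commit):

    def fetch_page_with_retry(url, max_attempts=3, backoff=2):
        """Hypothetical helper: retry a failed request a few times before giving up."""
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(url, timeout=15)
                response.raise_for_status()
                return response.text
            except Exception as e:
                print(f"[WARN] attempt {attempt}/{max_attempts} failed for {url} => {e}")
                time.sleep(backoff * attempt)  # simple linear backoff between attempts
        return None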
output/nahj_speeches.json | 7562 lines (new file)
File diff suppressed because it is too large.

output/speech-titles.json | 1207 lines (new file)
File diff suppressed because it is too large.
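The two suppressed files are the crawler's outputs: output/speech-titles.json (the index written by crawl_titles) and output/nahj_speeches.json (one record per speech part, using the keys shown in the diff above). A short consumer sketch, assuming those paths and record keys:

    import json
    from collections import defaultdict

    with open("./output/nahj_speeches.json", encoding="utf-8") as f:
        records = json.load(f)

    # Group the flat part records back into speeches, ordered by part_order.
    parts_by_speech = defaultdict(list)
    for rec in records:
        parts_by_speech[rec["speech_number"]].append(rec)

    for number, parts in sorted(parts_by_speech.items())[:3]:
        parts.sort(key=lambda r: r["part_order"])
        print(number, parts[0]["speech_title_large"], f"({len(parts)} parts)")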