advanced crawler for nahj speeches
This commit is contained in:
parent 9ee755bb39
commit 3d78d3623d

crawler.py | 257
@@ -1,8 +1,14 @@
 import requests
 from bs4 import BeautifulSoup
 import os
 import json
 import time
 
+BASE_URL = "http://nahj.makarem.ir"
+DATA_DIR = "./output"
+TITLES_FILE = os.path.join(DATA_DIR, "speech-titles.json")
+FAILED_FILE = os.path.join(DATA_DIR, "failed-speech-pages.txt")
+OUTPUT_FILE = os.path.join(DATA_DIR, "nahj_speeches.json")
+
 def crawl_wisdoms():
     data = []
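The new constants place every artifact under ./output, matching the two JSON files added elsewhere in this commit. As a minimal illustrative sketch (not part of the commit), a helper that prepares that directory and reports which resume files already exist could look like this, assuming the constants above:

    def report_state():
        # Hypothetical helper: create the output directory and show which
        # resume files (titles, results, failures) are already present.
        os.makedirs(DATA_DIR, exist_ok=True)
        for path in (TITLES_FILE, OUTPUT_FILE, FAILED_FILE):
            status = "found" if os.path.exists(path) else "missing"
            print(f"{status}: {path}")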
@@ -67,133 +73,174 @@ def crawl_wisdoms():
 
 
+def fetch_page(url):
+    """Fetch a page, handling errors; retries are driven by the callers."""
+    try:
+        response = requests.get(url, timeout=15)
+        response.raise_for_status()
+        return response.text
+    except Exception as e:
+        print(f"[ERROR] {url} => {e}")
+        return None
+
+
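fetch_page returns None on any error and leaves retrying to its callers (crawl_titles and crawl_speech_parts below). A minimal sketch of a bounded-retry wrapper, not part of this commit and with assumed attempt/delay values:

    def fetch_page_with_retries(url, attempts=3, delay=2):
        # Hypothetical helper: call fetch_page up to `attempts` times,
        # pausing a little longer after each failure.
        for attempt in range(1, attempts + 1):
            html = fetch_page(url)
            if html is not None:
                return html
            time.sleep(delay * attempt)  # simple linear backoff
        return None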
-OUTPUT_FILE = "./output/speechs.json"
-FAILED_FILE = "./data/failed-speech-pages.txt"
-
-
-def crawl_speech_page(page):
-    """Crawl a single speech by page number and return the list of parts."""
-    url = f"http://nahj.makarem.ir/speech/{page}"
-    response = requests.get(url, timeout=10)
-    if response.status_code != 200:
-        raise Exception(f"status code {response.status_code}")
-    time.sleep(1)
-
-    soup = BeautifulSoup(response.text, "html.parser")
-
-    # Speech title
-    title_tag = soup.find("h2", class_="text-center phrase-title")
-    speech_title = title_tag.get_text(strip=True) if title_tag else ""
-
-    # List of parts
-    parts = []
-    for idx, part_div in enumerate(soup.find_all("div", class_="phrase-text-container"), start=1):
-        # Arabic text
-        arabic_tag = part_div.find("p", class_="arabic-text show-off")
-        arabic_text = arabic_tag.get_text(" ", strip=True) if arabic_tag else ""
-
-        # Persian translation
-        persian_tag = part_div.find("p", class_="translate-text")
-        persian_translate = persian_tag.get_text(" ", strip=True) if persian_tag else ""
-
-        # Interpretation link
-        interpretation_link = ""
-        ul_tag = part_div.find("ul", class_="tools")
-        if ul_tag:
-            first_li = ul_tag.find("li")
-            if first_li:
-                a_tag = first_li.find("a")
-                if a_tag and a_tag.has_attr("href"):
-                    interpretation_link = a_tag["href"]
-
-        parts.append({
-            "speech_title": speech_title,
-            "part_order": idx,
-            "url": url,
-            "arabic_text": arabic_text,
-            "persian_translate": persian_translate,
-            "Interpretation_link": interpretation_link
-        })
-
-    return parts
-
-
-def save_failed(pages):
-    with open(FAILED_FILE, "w", encoding="utf-8") as f:
-        for p in pages:
-            f.write(f"{p}\n")
+def crawl_titles():
+    """Crawl the speech titles from the index pages."""
+    all_titles = []
+    speech_counter = 1
+
+    for page in range(1, 12):  # pages 1 to 11
+        url = f"{BASE_URL}/speech/page/{page}"
+        print(f"[INFO] Crawling titles page {page} => {url}")
+        html = fetch_page(url)
+        if not html:
+            continue
+
+        soup = BeautifulSoup(html, "html.parser")
+        cards = soup.select("h5.card-title a")
+        for a in cards:
+            title = a.get_text(strip=True)
+            href = a.get("href")
+            if not href.startswith("http"):
+                href = BASE_URL + href
+            all_titles.append({
+                "speech_number": speech_counter,
+                "speech_title_large": title,
+                "url": href
+            })
+            speech_counter += 1
+
+    # Save to file
+    os.makedirs(DATA_DIR, exist_ok=True)
+    with open(TITLES_FILE, "w", encoding="utf-8") as f:
+        json.dump(all_titles, f, ensure_ascii=False, indent=2)
+
+    print(f"[DONE] Saved {len(all_titles)} speech titles to {TITLES_FILE}")
+    return all_titles
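Each record that crawl_titles writes to speech-titles.json carries exactly the three keys (speech_number, speech_title_large, url) that crawl_speech_parts relies on later. A small sketch for sanity-checking that file, not part of the commit:

    def preview_titles(path=TITLES_FILE, n=3):
        # Hypothetical helper: print the first few title records.
        with open(path, "r", encoding="utf-8") as f:
            titles = json.load(f)
        print(f"{len(titles)} titles loaded from {path}")
        for rec in titles[:n]:
            print(rec["speech_number"], rec["speech_title_large"], rec["url"])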
-def load_failed():
-    if not os.path.exists(FAILED_FILE):
-        return []
-    with open(FAILED_FILE, "r", encoding="utf-8") as f:
-        return [int(line.strip()) for line in f if line.strip().isdigit()]
-
-
-def load_existing_data():
-    if not os.path.exists(OUTPUT_FILE):
-        return []
-    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
-        return json.load(f)
-
-
-def save_data(data):
-    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
-        json.dump(data, f, ensure_ascii=False, indent=4)
+def load_failed():
+    """Load the previously failed pages."""
+    if os.path.exists(FAILED_FILE):
+        with open(FAILED_FILE, "r", encoding="utf-8") as f:
+            return set(line.strip() for line in f if line.strip())
+    return set()
+
+
+def load_existing_results():
+    """Load the previously saved results."""
+    if os.path.exists(OUTPUT_FILE):
+        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
+            return json.load(f)
+    return []
+
+
+def save_failed(failed_urls):
+    """Save the failed pages to a file (cumulatively)."""
+    # previous_failed = load_failed()
+    # updated_failed = previous_failed.union(failed_urls)
+    with open(FAILED_FILE, "w", encoding="utf-8") as f:
+        for url in sorted(failed_urls):
+            f.write(url + "\n")
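As written, save_failed overwrites FAILED_FILE with only the current round's failures; the two commented-out lines hint at a cumulative variant. A sketch of that accumulating version, assuming the same one-URL-per-line format (not part of the commit):

    def save_failed_cumulative(failed_urls):
        # Hypothetical variant (not in crawler.py): merge this round's failures
        # with those already on disk before rewriting the file.
        updated = load_failed().union(failed_urls)
        with open(FAILED_FILE, "w", encoding="utf-8") as f:
            for url in sorted(updated):
                f.write(url + "\n")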
-def crawl_all_speeches(start=1, end=758):
-    all_data = load_existing_data()
-    failed = []
-
-    for page in range(start, end + 1):
-        try:
-            print(f"Trying speech {page} ...")
-            parts = crawl_speech_page(page)
-            all_data.extend(parts)
-            time.sleep(1)
-        except Exception as e:
-            print(f"❌ Failed speech {page} | error: {e}")
-            failed.append(page)
-
-    save_data(all_data)
-    save_failed(failed)
-
-
-def retry_until_success(start=1, end=758):
-    """Repeat until no page remains failed."""
-    crawl_all_speeches(start, end)
-
-    while True:
-        failed_pages = load_failed()
-        if not failed_pages:
-            print("✅ All speeches crawled successfully.")
-            break
-
-        print(f"🔄 Retrying {len(failed_pages)} failed pages ...")
-        failed = []
-        all_data = load_existing_data()
-
-        for page in failed_pages:
-            try:
-                print(f"Retry speech {page} ...")
-                parts = crawl_speech_page(page)
-                all_data.extend(parts)
-                time.sleep(1)
-            except Exception as e:
-                print(f"❌ Still failed {page} | error: {e}")
-                failed.append(page)
-
-        save_data(all_data)
-        save_failed(failed)
-
-        if not failed:
-            print("✅ Finished. No failed pages remain.")
-            break
+def crawl_speech_parts(speeches):
+    """Crawl every part of every speech, retrying until fully successful."""
+    all_results = load_existing_results()
+    done_urls = {rec["url"] for rec in all_results}
+    failed = load_failed()
+
+    while True:
+        # Select only the speeches that have not been crawled yet
+        pending = [s for s in speeches if s["url"] not in done_urls]
+
+        # If nothing is left, we are done
+        if not pending:
+            print("[SUCCESS] All speeches crawled successfully.")
+            if os.path.exists(FAILED_FILE):
+                os.remove(FAILED_FILE)  # everything succeeded -> remove the failures file
+            break
+
+        print(f"[INFO] {len(pending)} speeches remaining...")
+
+        new_failed = set()
+
+        for speech in pending:
+            speech_number = speech["speech_number"]
+            speech_title_large = speech["speech_title_large"]
+            url = speech["url"]
+
+            print(f"[INFO] Crawling speech {speech_number}: {speech_title_large}")
+            html = fetch_page(url)
+            if not html:
+                new_failed.add(url)
+                continue
+
+            soup = BeautifulSoup(html, "html.parser")
+            try:
+                speech_title = soup.find("h2", class_="text-center phrase-title").get_text()
+                parts = soup.find_all("div", class_="phrase-text-container")
+
+                for idx, part in enumerate(parts, start=1):
+                    arabic_text = part.find("p", class_="arabic-text show-off").get_text()
+                    persian_translate = part.find("p", class_="translate-text").get_text()
+                    # Interpretation link
+                    interpretation_link = ""
+                    ul_tag = part.find("ul", class_="tools")
+                    if ul_tag:
+                        first_li = ul_tag.find("li")
+                        if first_li:
+                            a_tag = first_li.find("a")
+                            if a_tag and a_tag.has_attr("href"):
+                                interpretation_link = a_tag["href"]
+
+                    record = {
+                        "speech_title_large": speech_title_large.strip(),
+                        "speech_title_page": speech_title.strip(),
+                        "speech_number": speech_number,
+                        "part_order": idx,
+                        "url": url,
+                        "arabic_text": arabic_text.strip(),
+                        "persian_translate": persian_translate.strip(),
+                        "Interpretation_link": BASE_URL + interpretation_link
+                    }
+                    all_results.append(record)
+
+                done_urls.add(url)
+                print(f"[OK] Crawled {url}")
+
+            except Exception as e:
+                print(f"[ERROR parsing] {url} => {e}")
+                new_failed.add(url)
+
+            time.sleep(1)
+
+        # Save the data
+        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+            json.dump(all_results, f, ensure_ascii=False, indent=2)
+
+        # Update the list of failures
+        if new_failed:
+            save_failed(new_failed)
+            print(f"[WARN] {len(new_failed)} speeches failed in this round.")
+        else:
+            if os.path.exists(FAILED_FILE):
+                os.remove(FAILED_FILE)
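crawl_speech_parts appends one record per speech part and rewrites nahj_speeches.json after every round. A short sketch for summarising that output once it exists, not part of the commit:

    from collections import Counter

    def summarize_output(path=OUTPUT_FILE):
        # Hypothetical helper: count how many parts were collected per speech.
        with open(path, "r", encoding="utf-8") as f:
            records = json.load(f)
        parts_per_speech = Counter(rec["speech_number"] for rec in records)
        print(f"{len(records)} parts across {len(parts_per_speech)} speeches")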
+def main():
+    # If the titles file does not exist yet, crawl the titles
+    if not os.path.exists(TITLES_FILE):
+        speeches = crawl_titles()
+    else:
+        with open(TITLES_FILE, "r", encoding="utf-8") as f:
+            speeches = json.load(f)
+
+    # Start crawling the speech parts, with continuous retry
+    crawl_speech_parts(speeches)
+
+
 if __name__ == "__main__":
-    retry_until_success(1, 758)
+    main()
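The new entry point is resumable: main() reuses speech-titles.json and nahj_speeches.json when they already exist, so an interrupted run can simply be restarted and only the missing speeches are fetched again. Assuming the module is importable as crawler, the same flow can also be driven programmatically:

    import crawler

    # Re-running is safe: titles and already-crawled speeches are reloaded
    # from ./output, and only pending URLs are fetched.
    crawler.main()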
output/nahj_speeches.json | 7562 (Normal file)
File diff suppressed because it is too large.

output/speech-titles.json | 1207 (Normal file)
File diff suppressed because it is too large.