# nahj/crawler.py

import json
import os
import time

import requests
from bs4 import BeautifulSoup

def crawl_wisdoms():
    """Crawl wisdom pages and save the results to ./data/wisdom_data.json."""
    data = []
    base_url = "http://nahj.makarem.ir/wisdom/{}"
    for page in range(958, 1450):
        url = base_url.format(page)
        try:
            print(f"Trying wisdom page {page} ...")
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                print(f"Page {page} returned status {response.status_code} ...")
                with open("./data/failed-pages.txt", "a+", encoding="utf-8") as f:
                    f.write(f"{page}\n")
                time.sleep(2)
                continue
            soup = BeautifulSoup(response.text, "html.parser")
            # Wisdom title
            title_tag = soup.find("h2", class_="card-title py-4")
            title = title_tag.get_text(strip=True) if title_tag else ""
            # Arabic text
            arabic_tag = soup.find("p", class_="card-text arabic-text")
            arabic_text = arabic_tag.get_text(" ", strip=True) if arabic_tag else ""
            # Persian translation
            persian_tag = soup.find("p", class_="card-text translate-text")
            persian_translate = persian_tag.get_text(" ", strip=True) if persian_tag else ""
            # Persian interpretation
            interpretation_tag = soup.find("div", style=lambda s: s and "font-size:14pt;" in s)
            interpretation = interpretation_tag.get_text(" ", strip=True) if interpretation_tag else ""
            # Store the record only if at least one field was found
            if any([title, arabic_text, persian_translate, interpretation]):
                # Pages whose title lacks the word "حکمت" are sub-parts of a wisdom
                is_subpart = "حکمت" not in title
                data.append({
                    "title": title,
                    "url": url,
                    "arabic_text": arabic_text,
                    "persian_translate": persian_translate,
                    "Interpretation": interpretation,
                    "is-subpart": is_subpart,
                })
            time.sleep(1)  # Avoid putting too much load on the server
        except Exception as e:
            print(f"Error crawling page {page}: {e}")
            continue
    # Save results as JSON
    output_file = "./data/wisdom_data.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
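
# crawl_wisdoms() only records failed pages in ./data/failed-pages.txt; it never
# retries them. The helper below is a minimal sketch (an assumption, not part of
# the original flow): it re-requests each recorded page and rewrites the file
# with the pages that still fail, so a later run can focus on them.
def retry_failed_wisdom_pages():
    """Hypothetical helper: re-request pages listed in ./data/failed-pages.txt."""
    failed_file = "./data/failed-pages.txt"
    if not os.path.exists(failed_file):
        return
    with open(failed_file, "r", encoding="utf-8") as f:
        pages = [int(line.strip()) for line in f if line.strip().isdigit()]
    still_failing = []
    for page in pages:
        try:
            response = requests.get(f"http://nahj.makarem.ir/wisdom/{page}", timeout=10)
            if response.status_code != 200:
                still_failing.append(page)
        except Exception:
            still_failing.append(page)
        time.sleep(1)  # Same politeness delay as the main crawler
    with open(failed_file, "w", encoding="utf-8") as f:
        for page in still_failing:
            f.write(f"{page}\n")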

OUTPUT_FILE = "./output/speechs.json"
FAILED_FILE = "./data/failed-speech-pages.txt"

def crawl_speech_page(page):
    """Crawl one sermon by page number and return the list of its parts."""
    url = f"http://nahj.makarem.ir/speech/{page}"
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        raise Exception(f"status code {response.status_code}")
    soup = BeautifulSoup(response.text, "html.parser")
    # Sermon title
    title_tag = soup.find("h2", class_="text-center phrase-title")
    speech_title = title_tag.get_text(strip=True) if title_tag else ""
    # Collect the sermon's parts in page order
    parts = []
    for idx, part_div in enumerate(soup.find_all("div", class_="phrase-text-container"), start=1):
        # Arabic text
        arabic_tag = part_div.find("p", class_="arabic-text show-off")
        arabic_text = arabic_tag.get_text(" ", strip=True) if arabic_tag else ""
        # Persian translation
        persian_tag = part_div.find("p", class_="translate-text")
        persian_translate = persian_tag.get_text(" ", strip=True) if persian_tag else ""
        # Interpretation link (first anchor in the tools list, if present)
        interpretation_link = ""
        ul_tag = part_div.find("ul", class_="tools")
        if ul_tag:
            first_li = ul_tag.find("li")
            if first_li:
                a_tag = first_li.find("a")
                if a_tag and a_tag.has_attr("href"):
                    interpretation_link = a_tag["href"]
        parts.append({
            "speech_title": speech_title,
            "part_order": idx,
            "url": url,
            "arabic_text": arabic_text,
            "persian_translate": persian_translate,
            "Interpretation_link": interpretation_link,
        })
    return parts
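
# Example usage (a sketch; assumes speech page 1 exists on the server):
#
#     parts = crawl_speech_page(1)
#     for part in parts:
#         print(part["part_order"], part["arabic_text"][:40])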

def save_failed(pages):
    """Overwrite FAILED_FILE with the given page numbers, one per line."""
    with open(FAILED_FILE, "w", encoding="utf-8") as f:
        for p in pages:
            f.write(f"{p}\n")


def load_failed():
    """Return the page numbers recorded in FAILED_FILE (empty list if absent)."""
    if not os.path.exists(FAILED_FILE):
        return []
    with open(FAILED_FILE, "r", encoding="utf-8") as f:
        return [int(line.strip()) for line in f if line.strip().isdigit()]


def load_existing_data():
    """Return previously crawled parts from OUTPUT_FILE (empty list if absent)."""
    if not os.path.exists(OUTPUT_FILE):
        return []
    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
        return json.load(f)


def save_data(data):
    """Write all crawled parts to OUTPUT_FILE as pretty-printed JSON."""
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
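
# save_data() rewrites OUTPUT_FILE in place, so a crash mid-write can corrupt the
# accumulated data. One optional hardening (an assumption, not the author's code)
# is to write to a temporary file in the same directory and atomically replace
# the target:
def save_data_atomic(data):
    """Hypothetical variant of save_data() using an atomic os.replace()."""
    import tempfile
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(OUTPUT_FILE) or ".")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        os.replace(tmp_path, OUTPUT_FILE)  # Atomic on both POSIX and Windows
    except BaseException:
        os.remove(tmp_path)  # Clean up the temp file if anything went wrong
        raise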

def crawl_all_speeches(start=1, end=758):
    """Crawl speech pages start..end, appending to any existing data."""
    all_data = load_existing_data()
    failed = []
    for page in range(start, end + 1):
        try:
            print(f"Trying speech {page} ...")
            parts = crawl_speech_page(page)
            all_data.extend(parts)
            time.sleep(1)  # Avoid putting too much load on the server
        except Exception as e:
            print(f"❌ Failed speech {page} | error: {e}")
            failed.append(page)
    save_data(all_data)
    save_failed(failed)

def retry_until_success(start=1, end=758):
    """Keep retrying until no page remains unsuccessful."""
    crawl_all_speeches(start, end)
    while True:
        failed_pages = load_failed()
        if not failed_pages:
            print("✅ All speeches crawled successfully.")
            break
        print(f"🔄 Retrying {len(failed_pages)} failed pages ...")
        failed = []
        all_data = load_existing_data()
        for page in failed_pages:
            try:
                print(f"Retry speech {page} ...")
                parts = crawl_speech_page(page)
                all_data.extend(parts)
                time.sleep(1)
            except Exception as e:
                print(f"❌ Still failed {page} | error: {e}")
                failed.append(page)
        save_data(all_data)
        save_failed(failed)
        if not failed:
            print("✅ Finished. No failed pages remain.")
            break

if __name__ == "__main__":
    retry_until_success(1, 758)