import os
import json
import time

import requests
from bs4 import BeautifulSoup

BASE_URL = "http://nahj.makarem.ir"
DATA_DIR = "./output"
TITLES_FILE = os.path.join(DATA_DIR, "speech-titles.json")
FAILED_FILE = os.path.join(DATA_DIR, "failed-speech-pages.txt")
OUTPUT_FILE = os.path.join(DATA_DIR, "nahj_speeches.json")


def crawl_wisdoms():
    data = []
    base_url = "http://nahj.makarem.ir/wisdom/{}"
    failed = []
    os.makedirs("./data", exist_ok=True)  # make sure the output directory exists

    for page in range(958, 1450):
        url = base_url.format(page)
        try:
            print(f'try page {page} crawling ...')
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                print(f'page {page} response error ...')
                with open('./data/failed-pages.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'{page}\n')
                time.sleep(2)
                continue

            soup = BeautifulSoup(response.text, "html.parser")

            # Wisdom title
            title_tag = soup.find("h2", class_="card-title py-4")
            title = title_tag.get_text(strip=True) if title_tag else ""

            # Arabic text
            arabic_tag = soup.find("p", class_="card-text arabic-text")
            arabic_text = arabic_tag.get_text(" ", strip=True) if arabic_tag else ""

            # Persian translation
            persian_tag = soup.find("p", class_="card-text translate-text")
            persian_translate = persian_tag.get_text(" ", strip=True) if persian_tag else ""

            # Persian commentary
            interpretation_tag = soup.find("div", style=lambda s: s and "font-size:14pt;" in s)
            interpretation = interpretation_tag.get_text(" ", strip=True) if interpretation_tag else ""

            # Store the record; subpart pages do not contain the word 'حکمت' in their title
            if any([title, arabic_text, persian_translate, interpretation]):
                is_subpart = 'حکمت' not in title
                data.append({
                    "title": title,
                    "url": url,
                    "arabic_text": arabic_text,
                    "persian_translate": persian_translate,
                    "Interpretation": interpretation,
                    "is-subpart": is_subpart
                })

            time.sleep(1)  # avoid overloading the server
        except Exception as e:
            print(f'error in crawling page: {page} - error: {e}')
            continue

    # Save as JSON
    output_file = "./data/wisdom_data.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def fetch_page(url):
    """Fetch a page; errors are caught and reported (retries happen in the caller)."""
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"[ERROR] {url} => {e}")
        return None


def crawl_titles():
    """Crawl the speech titles from the index pages."""
    all_titles = []
    speech_counter = 1

    for page in range(1, 12):  # pages 1 through 11
        url = f"{BASE_URL}/speech/page/{page}"
        print(f"[INFO] Crawling titles page {page} => {url}")
        html = fetch_page(url)
        if not html:
            continue

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.select("h5.card-title a")
        for a in cards:
            title = a.get_text(strip=True)
            href = a.get("href")
            if not href.startswith("http"):
                href = BASE_URL + href
            all_titles.append({
                "speech_number": speech_counter,
                "speech_title_large": title,
                "url": href
            })
            speech_counter += 1

        time.sleep(1)

    # Save to file
    os.makedirs(DATA_DIR, exist_ok=True)
    with open(TITLES_FILE, "w", encoding="utf-8") as f:
        json.dump(all_titles, f, ensure_ascii=False, indent=2)

    print(f"[DONE] Saved {len(all_titles)} speech titles to {TITLES_FILE}")
    return all_titles


def load_failed():
    """Load the URLs that failed in previous runs."""
    if os.path.exists(FAILED_FILE):
        with open(FAILED_FILE, "r", encoding="utf-8") as f:
            return set(line.strip() for line in f if line.strip())
    return set()


def save_failed(failed_urls):
    """Save this round's failed URLs to the file (the cumulative merge is left commented out)."""
    # previous_failed = load_failed()
    # updated_failed = previous_failed.union(failed_urls)
    with open(FAILED_FILE, "w", encoding="utf-8") as f:
        for url in sorted(failed_urls):
            f.write(url + "\n")


def load_existing_results():
    """Load previously saved results."""
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    return []


def crawl_speech_parts(speeches):
    """Crawl all parts of every speech, retrying until everything succeeds."""
    all_results = load_existing_results()
    done_urls = {rec["url"] for rec in all_results}
    failed = load_failed()

    while True:
        # Only keep the speeches that have not been crawled yet
        pending = [s for s in speeches if s["url"] not in done_urls]

        # Nothing left, we are done
        if not pending:
            print("[SUCCESS] All speeches crawled successfully.")
            if os.path.exists(FAILED_FILE):
                os.remove(FAILED_FILE)  # everything succeeded, remove the failure log
            break

        print(f"[INFO] {len(pending)} speeches remaining...")
        new_failed = set()

        for speech in pending:
            speech_number = speech["speech_number"]
            speech_title_large = speech["speech_title_large"]
            url = speech["url"]
            print(f"[INFO] Crawling speech {speech_number}: {speech_title_large}")

            html = fetch_page(url)
            if not html:
                new_failed.add(url)
                continue

            soup = BeautifulSoup(html, "html.parser")
            try:
                speech_title = soup.find("h2", class_="text-center phrase-title").get_text()
                parts = soup.find_all("div", class_="phrase-text-container")

                # Collect this page's records first and commit them only after the
                # whole page has parsed cleanly, so a retried page cannot leave
                # partial duplicates behind.
                page_records = []
                for idx, part in enumerate(parts, start=1):
                    arabic_text = part.find("p", class_="arabic-text show-off").get_text()
                    persian_translate = part.find("p", class_="translate-text").get_text()

                    # Commentary link (first entry of the tools list, if present)
                    interpretation_link = ""
                    ul_tag = part.find("ul", class_="tools")
                    if ul_tag:
                        first_li = ul_tag.find("li")
                        if first_li:
                            a_tag = first_li.find("a")
                            if a_tag and a_tag.has_attr("href"):
                                interpretation_link = a_tag["href"]

                    page_records.append({
                        "speech_title_large": speech_title_large.strip(),
                        "speech_title_page": speech_title.strip(),
                        "speech_number": speech_number,
                        "part_order": idx,
                        "url": url,
                        "arabic_text": arabic_text.strip(),
                        "persian_translate": persian_translate.strip(),
                        # Keep the link empty when the page offers no commentary
                        "Interpretation_link": BASE_URL + interpretation_link if interpretation_link else ""
                    })

                all_results.extend(page_records)
                done_urls.add(url)
                print(f"[OK] Crawled {url}")
            except Exception as e:
                print(f"[ERROR parsing] {url} => {e}")
                new_failed.add(url)

            time.sleep(1)

        # Persist the results gathered so far
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)

        # Update the failure list
        if new_failed:
            save_failed(new_failed)
            print(f"[WARN] {len(new_failed)} speeches failed in this round.")
        else:
            if os.path.exists(FAILED_FILE):
                os.remove(FAILED_FILE)


def main():
    # Crawl the titles if the titles file does not exist yet
    if not os.path.exists(TITLES_FILE):
        speeches = crawl_titles()
    else:
        with open(TITLES_FILE, "r", encoding="utf-8") as f:
            speeches = json.load(f)

    # Crawl the speech parts with continuous retry
    crawl_speech_parts(speeches)


if __name__ == "__main__":
    main()
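

# --- Usage sketch (assumptions: the script is saved as crawl_nahj.py and the
# site markup still matches the CSS selectors used above) ---
#
#   python crawl_nahj.py
#
# The first run writes ./output/speech-titles.json, then keeps retrying until
# ./output/nahj_speeches.json holds every speech part. A quick, hypothetical
# way to inspect the result afterwards:
#
#   import json
#   with open("./output/nahj_speeches.json", encoding="utf-8") as f:
#       records = json.load(f)
#   print(len(records), records[0]["speech_title_page"])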