import requests
from bs4 import BeautifulSoup
import json
import time
import os


def crawl_wisdoms():
    """Crawl the wisdom pages and save them to ./data/wisdom_data.json."""
    data = []
    base_url = "http://nahj.makarem.ir/wisdom/{}"
    os.makedirs("./data", exist_ok=True)
    for page in range(958, 1450):
        url = base_url.format(page)
        try:
            print(f'Trying wisdom page {page} ...')
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                print(f'page {page} returned status {response.status_code} ...')
                with open('./data/failed-pages.txt', 'a', encoding='utf-8') as f:
                    f.write(f'{page}\n')
                time.sleep(2)
                continue
            soup = BeautifulSoup(response.text, "html.parser")

            # Wisdom title
            title_tag = soup.find("h2", class_="card-title py-4")
            title = title_tag.get_text(strip=True) if title_tag else ""

            # Arabic text
            arabic_tag = soup.find("p", class_="card-text arabic-text")
            arabic_text = arabic_tag.get_text(" ", strip=True) if arabic_tag else ""

            # Persian translation
            persian_tag = soup.find("p", class_="card-text translate-text")
            persian_translate = persian_tag.get_text(" ", strip=True) if persian_tag else ""

            # Persian interpretation
            interpretation_tag = soup.find("div", style=lambda s: s and "font-size:14pt;" in s)
            interpretation = interpretation_tag.get_text(" ", strip=True) if interpretation_tag else ""

            # Keep the record only if at least one field was found
            if any([title, arabic_text, persian_translate, interpretation]):
                # Pages whose title lacks the word "حکمت" are sub-parts of a wisdom
                is_subpart = 'حکمت' not in title
                data.append({
                    "title": title,
                    "url": url,
                    "arabic_text": arabic_text,
                    "persian_translate": persian_translate,
                    "Interpretation": interpretation,
                    "is-subpart": is_subpart
                })
            time.sleep(1)  # avoid putting too much load on the server
        except Exception as e:
            print(f'error crawling page {page}: {e}')
            continue

    # Save to a JSON file
    output_file = "./data/wisdom_data.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


OUTPUT_FILE = "./output/speechs.json"
FAILED_FILE = "./data/failed-speech-pages.txt"


def crawl_speech_page(page):
    """Crawl a single sermon by page number and return its list of parts."""
    url = f"http://nahj.makarem.ir/speech/{page}"
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        raise RuntimeError(f"status code {response.status_code}")
    soup = BeautifulSoup(response.text, "html.parser")

    # Sermon title
    title_tag = soup.find("h2", class_="text-center phrase-title")
    speech_title = title_tag.get_text(strip=True) if title_tag else ""

    # List of parts
    parts = []
    for idx, part_div in enumerate(soup.find_all("div", class_="phrase-text-container"), start=1):
        # Arabic text
        arabic_tag = part_div.find("p", class_="arabic-text show-off")
        arabic_text = arabic_tag.get_text(" ", strip=True) if arabic_tag else ""

        # Persian translation
        persian_tag = part_div.find("p", class_="translate-text")
        persian_translate = persian_tag.get_text(" ", strip=True) if persian_tag else ""

        # Interpretation link: the first <a> inside the first <li> of ul.tools
        interpretation_link = ""
        ul_tag = part_div.find("ul", class_="tools")
        if ul_tag:
            first_li = ul_tag.find("li")
            if first_li:
                a_tag = first_li.find("a")
                if a_tag and a_tag.has_attr("href"):
                    interpretation_link = a_tag["href"]

        parts.append({
            "speech_title": speech_title,
            "part_order": idx,
            "url": url,
            "arabic_text": arabic_text,
            "persian_translate": persian_translate,
            "Interpretation_link": interpretation_link
        })
    return parts


def save_failed(pages):
    with open(FAILED_FILE, "w", encoding="utf-8") as f:
        for p in pages:
            f.write(f"{p}\n")


def load_failed():
    if not os.path.exists(FAILED_FILE):
        return []
    with open(FAILED_FILE, "r", encoding="utf-8") as f:
        return [int(line.strip()) for line in f if line.strip().isdigit()]


def load_existing_data():
    if not os.path.exists(OUTPUT_FILE):
        return []
    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
        return json.load(f)


def save_data(data):
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def crawl_all_speeches(start=1, end=758):
    os.makedirs("./data", exist_ok=True)
    os.makedirs("./output", exist_ok=True)
    # Appends to any previously saved data; rerunning a completed crawl
    # from scratch may therefore duplicate records.
    all_data = load_existing_data()
    failed = []
    for page in range(start, end + 1):
        try:
            print(f"Trying speech {page} ...")
            parts = crawl_speech_page(page)
            all_data.extend(parts)
            time.sleep(1)  # avoid putting too much load on the server
        except Exception as e:
            print(f"❌ Failed speech {page} | error: {e}")
            failed.append(page)
    save_data(all_data)
    save_failed(failed)


def retry_until_success(start=1, end=758):
    """Keep retrying until no page remains in the failed list."""
    crawl_all_speeches(start, end)
    while True:
        failed_pages = load_failed()
        if not failed_pages:
            print("✅ All speeches crawled successfully.")
            break
        print(f"🔄 Retrying {len(failed_pages)} failed pages ...")
        failed = []
        all_data = load_existing_data()
        for page in failed_pages:
            try:
                print(f"Retry speech {page} ...")
                parts = crawl_speech_page(page)
                all_data.extend(parts)
                time.sleep(1)
            except Exception as e:
                print(f"❌ Still failed {page} | error: {e}")
                failed.append(page)
        save_data(all_data)
        save_failed(failed)
        if not failed:
            print("✅ Finished. No failed pages remain.")
            break


if __name__ == "__main__":
    retry_until_success(1, 758)
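
# Note: crawl_wisdoms() is defined above but never invoked by the __main__
# block. A minimal sketch of running both crawlers in one pass (assuming the
# wisdom range 958-1450 hard-coded above and the speech range 1-758 are both
# wanted) would be:
#
#     if __name__ == "__main__":
#         crawl_wisdoms()
#         retry_until_success(1, 758)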