# nahj/crawler.py

import os
import json
import time
import requests
from bs4 import BeautifulSoup

BASE_URL = "http://nahj.makarem.ir"
DATA_DIR = "./output"
TITLES_FILE = os.path.join(DATA_DIR, "speech-titles.json")
FAILED_FILE = os.path.join(DATA_DIR, "failed-speech-pages.txt")
OUTPUT_FILE = os.path.join(DATA_DIR, "nahj_speeches.json")

def crawl_wisdoms():
    """Crawl the wisdom (hikmah) pages and save them to a JSON file."""
    data = []
    base_url = "http://nahj.makarem.ir/wisdom/{}"
    failed = []
    os.makedirs("./data", exist_ok=True)  # make sure the output directory exists
    for page in range(958, 1450):
        url = base_url.format(page)
        try:
            print(f'trying to crawl page {page} ...')
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                print(f'page {page} returned an error response ...')
                failed.append(page)
                with open('./data/failed-pages.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'{page}\n')
                time.sleep(2)
                continue
            soup = BeautifulSoup(response.text, "html.parser")
            # Wisdom title
            title_tag = soup.find("h2", class_="card-title py-4")
            title = title_tag.get_text(strip=True) if title_tag else ""
            # Arabic text
            arabic_tag = soup.find("p", class_="card-text arabic-text")
            arabic_text = arabic_tag.get_text(" ", strip=True) if arabic_tag else ""
            # Persian translation
            persian_tag = soup.find("p", class_="card-text translate-text")
            persian_translate = persian_tag.get_text(" ", strip=True) if persian_tag else ""
            # Persian interpretation
            interpretation_tag = soup.find("div", style=lambda s: s and "font-size:14pt;" in s)
            interpretation = interpretation_tag.get_text(" ", strip=True) if interpretation_tag else ""
            # Keep the record if at least one field was found
            if any([title, arabic_text, persian_translate, interpretation]):
                # Pages whose title lacks the word 'حکمت' are sub-parts of a wisdom
                is_subpart = 'حکمت' not in title
                data.append({
                    "title": title,
                    "url": url,
                    "arabic_text": arabic_text,
                    "persian_translate": persian_translate,
                    "Interpretation": interpretation,
                    "is-subpart": is_subpart
                })
            time.sleep(1)  # avoid putting too much load on the server
        except Exception as e:
            print(f'error while crawling page {page}: {e}')
            continue
    # Save to a JSON file
    output_file = "./data/wisdom_data.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
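
# Note: crawl_wisdoms() is standalone and is not called from main(). An
# illustrative (placeholder) record shape in wisdom_data.json, based on the
# fields collected above:
#   {
#       "title": "...",
#       "url": "http://nahj.makarem.ir/wisdom/958",
#       "arabic_text": "...",
#       "persian_translate": "...",
#       "Interpretation": "...",
#       "is-subpart": false
#   }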

def fetch_page(url):
    """Fetch a page with error handling; retries are handled by the caller."""
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"[ERROR] {url} => {e}")
        return None
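
# A minimal retry variant (sketch, not used by the main flow): it retries a
# fixed number of times with a short pause before giving up. The helper name
# and the retries/backoff parameters are illustrative assumptions, not part of
# the original crawler.
def fetch_page_with_retry(url, retries=3, backoff=2):
    """Fetch a page, retrying on failure with a fixed backoff (sketch)."""
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=15)
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(f"[RETRY {attempt}/{retries}] {url} => {e}")
            time.sleep(backoff)
    return None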

def crawl_titles():
    """Crawl the speech (khutbah) titles from the main listing pages."""
    all_titles = []
    speech_counter = 1
    for page in range(1, 12):  # listing pages 1 to 11
        url = f"{BASE_URL}/speech/page/{page}"
        print(f"[INFO] Crawling titles page {page} => {url}")
        html = fetch_page(url)
        if not html:
            continue
        soup = BeautifulSoup(html, "html.parser")
        cards = soup.select("h5.card-title a")
        for a in cards:
            title = a.get_text(strip=True)
            href = a.get("href")
            if not href.startswith("http"):
                href = BASE_URL + href
            all_titles.append({
                "speech_number": speech_counter,
                "speech_title_large": title,
                "url": href
            })
            speech_counter += 1
        time.sleep(1)
    # Save to file
    os.makedirs(DATA_DIR, exist_ok=True)
    with open(TITLES_FILE, "w", encoding="utf-8") as f:
        json.dump(all_titles, f, ensure_ascii=False, indent=2)
    print(f"[DONE] Saved {len(all_titles)} speech titles to {TITLES_FILE}")
    return all_titles
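
# Illustrative (placeholder) entry in speech-titles.json, matching the keys
# that crawl_speech_parts() expects:
#   {"speech_number": 1, "speech_title_large": "...", "url": "http://nahj.makarem.ir/speech/..."}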

def load_failed():
    """Load the URLs of previously failed pages."""
    if os.path.exists(FAILED_FILE):
        with open(FAILED_FILE, "r", encoding="utf-8") as f:
            return set(line.strip() for line in f if line.strip())
    return set()

def save_failed(failed_urls):
    """Save the failed page URLs to the failures file (overwrites previous content)."""
    # previous_failed = load_failed()
    # updated_failed = previous_failed.union(failed_urls)
    with open(FAILED_FILE, "w", encoding="utf-8") as f:
        for url in sorted(failed_urls):
            f.write(url + "\n")

def load_existing_results():
    """Load previously saved results, if any."""
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    return []

def crawl_speech_parts(speeches):
    """Crawl every part of every speech, retrying until all succeed."""
    all_results = load_existing_results()
    done_urls = {rec["url"] for rec in all_results}
    failed = load_failed()
    while True:
        # Select only the speeches that have not been crawled yet
        pending = [s for s in speeches if s["url"] not in done_urls]
        # If nothing is left, we are done
        if not pending:
            print("[SUCCESS] All speeches crawled successfully.")
            if os.path.exists(FAILED_FILE):
                os.remove(FAILED_FILE)  # everything succeeded -> remove the failures file
            break
        print(f"[INFO] {len(pending)} speeches remaining...")
        new_failed = set()
        for speech in pending:
            speech_number = speech["speech_number"]
            speech_title_large = speech["speech_title_large"]
            url = speech["url"]
            print(f"[INFO] Crawling speech {speech_number}: {speech_title_large}")
            html = fetch_page(url)
            if not html:
                new_failed.add(url)
                continue
            soup = BeautifulSoup(html, "html.parser")
            try:
                speech_title = soup.find("h2", class_="text-center phrase-title").get_text()
                parts = soup.find_all("div", class_="phrase-text-container")
                for idx, part in enumerate(parts, start=1):
                    arabic_text = part.find("p", class_="arabic-text show-off").get_text()
                    persian_translate = part.find("p", class_="translate-text").get_text()
                    # Interpretation link
                    interpretation_link = ""
                    ul_tag = part.find("ul", class_="tools")
                    if ul_tag:
                        first_li = ul_tag.find("li")
                        if first_li:
                            a_tag = first_li.find("a")
                            if a_tag and a_tag.has_attr("href"):
                                interpretation_link = a_tag["href"]
                    record = {
                        "speech_title_large": speech_title_large.strip(),
                        "speech_title_page": speech_title.strip(),
                        "speech_number": speech_number,
                        "part_order": idx,
                        "url": url,
                        "arabic_text": arabic_text.strip(),
                        "persian_translate": persian_translate.strip(),
                        "Interpretation_link": BASE_URL + interpretation_link
                    }
                    all_results.append(record)
                done_urls.add(url)
                print(f"[OK] Crawled {url}")
            except Exception as e:
                print(f"[ERROR parsing] {url} => {e}")
                new_failed.add(url)
            time.sleep(1)
        # Save the collected data
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        # Update the failures list
        if new_failed:
            save_failed(new_failed)
            print(f"[WARN] {len(new_failed)} speeches failed in this round.")
        else:
            if os.path.exists(FAILED_FILE):
                os.remove(FAILED_FILE)
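
# Quick sanity check (sketch, not part of the original flow): crawl a single
# speech first to verify the CSS selectors before a full run, e.g.:
#   speeches = crawl_titles()
#   crawl_speech_parts(speeches[:1])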

def main():
    # Crawl the titles if the titles file does not exist yet
    if not os.path.exists(TITLES_FILE):
        speeches = crawl_titles()
    else:
        with open(TITLES_FILE, "r", encoding="utf-8") as f:
            speeches = json.load(f)
    # Crawl the speech parts with continuous retries
    crawl_speech_parts(speeches)

if __name__ == "__main__":
    main()