# nahj/crawler.py

import json
import os
import time

import requests
from bs4 import BeautifulSoup

def crawl_wisdoms():
    """Crawl wisdom pages and save the results to ./data/wisdom_data.json."""
    data = []
    base_url = "http://nahj.makarem.ir/wisdom/{}"
    for page in range(958, 1450):
        url = base_url.format(page)
        try:
            print(f"Trying wisdom page {page} ...")
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                print(f"Page {page} returned status {response.status_code} ...")
                with open("./data/failed-pages.txt", "a+", encoding="utf-8") as f:
                    f.write(f"{page}\n")
                time.sleep(2)
                continue
            soup = BeautifulSoup(response.text, "html.parser")
            # Wisdom title
            title_tag = soup.find("h2", class_="card-title py-4")
            title = title_tag.get_text(strip=True) if title_tag else ""
            # Arabic text
            arabic_tag = soup.find("p", class_="card-text arabic-text")
            arabic_text = arabic_tag.get_text(" ", strip=True) if arabic_tag else ""
            # Persian translation
            persian_tag = soup.find("p", class_="card-text translate-text")
            persian_translate = persian_tag.get_text(" ", strip=True) if persian_tag else ""
            # Persian interpretation
            interpretation_tag = soup.find("div", style=lambda s: s and "font-size:14pt;" in s)
            interpretation = interpretation_tag.get_text(" ", strip=True) if interpretation_tag else ""
            # Store the record only if at least one field was found
            if any([title, arabic_text, persian_translate, interpretation]):
                # Pages whose title lacks the word "حکمت" are sub-parts of a wisdom
                is_subpart = "حکمت" not in title
                data.append({
                    "title": title,
                    "url": url,
                    "arabic_text": arabic_text,
                    "persian_translate": persian_translate,
                    "Interpretation": interpretation,
                    "is-subpart": is_subpart,
                })
            time.sleep(1)  # Avoid putting too much load on the server
        except Exception as e:
            print(f"Error crawling page {page}: {e}")
            continue
    # Save results as JSON
    output_file = "./data/wisdom_data.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
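
# crawl_wisdoms() only records failed pages in ./data/failed-pages.txt; it never
# retries them. The helper below is a minimal sketch (an assumption, not part of
# the original flow): it re-requests each recorded page and rewrites the file
# with the pages that still fail, so a later run can focus on them.
def retry_failed_wisdom_pages():
    """Hypothetical helper: re-request pages listed in ./data/failed-pages.txt."""
    failed_file = "./data/failed-pages.txt"
    if not os.path.exists(failed_file):
        return
    with open(failed_file, "r", encoding="utf-8") as f:
        pages = [int(line.strip()) for line in f if line.strip().isdigit()]
    still_failing = []
    for page in pages:
        try:
            response = requests.get(f"http://nahj.makarem.ir/wisdom/{page}", timeout=10)
            if response.status_code != 200:
                still_failing.append(page)
        except Exception:
            still_failing.append(page)
        time.sleep(1)  # Same politeness delay as the main crawler
    with open(failed_file, "w", encoding="utf-8") as f:
        for page in still_failing:
            f.write(f"{page}\n")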

OUTPUT_FILE = "./output/speechs.json"
FAILED_FILE = "./data/failed-speech-pages.txt"

def crawl_speech_page(page):
    """Crawl one sermon by page number and return the list of its parts."""
    url = f"http://nahj.makarem.ir/speech/{page}"
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        raise Exception(f"status code {response.status_code}")
    soup = BeautifulSoup(response.text, "html.parser")
    # Sermon title
    title_tag = soup.find("h2", class_="text-center phrase-title")
    speech_title = title_tag.get_text(strip=True) if title_tag else ""
    # Collect the sermon's parts in page order
    parts = []
    for idx, part_div in enumerate(soup.find_all("div", class_="phrase-text-container"), start=1):
        # Arabic text
        arabic_tag = part_div.find("p", class_="arabic-text show-off")
        arabic_text = arabic_tag.get_text(" ", strip=True) if arabic_tag else ""
        # Persian translation
        persian_tag = part_div.find("p", class_="translate-text")
        persian_translate = persian_tag.get_text(" ", strip=True) if persian_tag else ""
        # Interpretation link (first anchor in the tools list, if present)
        interpretation_link = ""
        ul_tag = part_div.find("ul", class_="tools")
        if ul_tag:
            first_li = ul_tag.find("li")
            if first_li:
                a_tag = first_li.find("a")
                if a_tag and a_tag.has_attr("href"):
                    interpretation_link = a_tag["href"]
        parts.append({
            "speech_title": speech_title,
            "part_order": idx,
            "url": url,
            "arabic_text": arabic_text,
            "persian_translate": persian_translate,
            "Interpretation_link": interpretation_link,
        })
    return parts
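
# Example usage (a sketch; assumes speech page 1 exists on the server):
#
#     parts = crawl_speech_page(1)
#     for part in parts:
#         print(part["part_order"], part["arabic_text"][:40])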

def save_failed(pages):
    """Overwrite FAILED_FILE with the given page numbers, one per line."""
    with open(FAILED_FILE, "w", encoding="utf-8") as f:
        for p in pages:
            f.write(f"{p}\n")


def load_failed():
    """Return the page numbers recorded in FAILED_FILE (empty list if absent)."""
    if not os.path.exists(FAILED_FILE):
        return []
    with open(FAILED_FILE, "r", encoding="utf-8") as f:
        return [int(line.strip()) for line in f if line.strip().isdigit()]


def load_existing_data():
    """Return previously crawled parts from OUTPUT_FILE (empty list if absent)."""
    if not os.path.exists(OUTPUT_FILE):
        return []
    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
        return json.load(f)


def save_data(data):
    """Write all crawled parts to OUTPUT_FILE as pretty-printed JSON."""
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
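
# save_data() rewrites OUTPUT_FILE in place, so a crash mid-write can corrupt the
# accumulated data. One optional hardening (an assumption, not the author's code)
# is to write to a temporary file in the same directory and atomically replace
# the target:
def save_data_atomic(data):
    """Hypothetical variant of save_data() using an atomic os.replace()."""
    import tempfile
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(OUTPUT_FILE) or ".")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        os.replace(tmp_path, OUTPUT_FILE)  # Atomic on both POSIX and Windows
    except BaseException:
        os.remove(tmp_path)  # Clean up the temp file if anything went wrong
        raise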

def crawl_all_speeches(start=1, end=758):
    """Crawl speech pages start..end, appending to any existing data."""
    all_data = load_existing_data()
    failed = []
    for page in range(start, end + 1):
        try:
            print(f"Trying speech {page} ...")
            parts = crawl_speech_page(page)
            all_data.extend(parts)
            time.sleep(1)  # Avoid putting too much load on the server
        except Exception as e:
            print(f"❌ Failed speech {page} | error: {e}")
            failed.append(page)
    save_data(all_data)
    save_failed(failed)

def retry_until_success(start=1, end=758):
    """Keep retrying until no page remains unsuccessful."""
    crawl_all_speeches(start, end)
    while True:
        failed_pages = load_failed()
        if not failed_pages:
            print("✅ All speeches crawled successfully.")
            break
        print(f"🔄 Retrying {len(failed_pages)} failed pages ...")
        failed = []
        all_data = load_existing_data()
        for page in failed_pages:
            try:
                print(f"Retry speech {page} ...")
                parts = crawl_speech_page(page)
                all_data.extend(parts)
                time.sleep(1)
            except Exception as e:
                print(f"❌ Still failed {page} | error: {e}")
                failed.append(page)
        save_data(all_data)
        save_failed(failed)
        if not failed:
            print("✅ Finished. No failed pages remain.")
            break

if __name__ == "__main__":
    retry_until_success(1, 758)