import requests
from bs4 import BeautifulSoup
import json
import time
import os


def crawl_wisdoms():
    data = []

    base_url = "http://nahj.makarem.ir/wisdom/{}"
    for page in range(958, 1450):
        url = base_url.format(page)
        try:
            print(f'trying to crawl page {page} ...')
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                print(f'page {page} response error ...')
                with open('./data/failed-pages.txt', 'a+', encoding='utf-8') as f:
                    f.write(f'{page}\n')

                time.sleep(2)
                continue

            soup = BeautifulSoup(response.text, "html.parser")

            # Wisdom title
            title_tag = soup.find("h2", class_="card-title py-4")
            title = title_tag.get_text(strip=True) if title_tag else ""

            # Arabic text
            arabic_tag = soup.find("p", class_="card-text arabic-text")
            arabic_text = arabic_tag.get_text(" ", strip=True) if arabic_tag else ""

            # Persian translation
            persian_tag = soup.find("p", class_="card-text translate-text")
            persian_translate = persian_tag.get_text(" ", strip=True) if persian_tag else ""

            # Persian interpretation
            interpretation_tag = soup.find("div", style=lambda s: s and "font-size:14pt;" in s)
            interpretation = interpretation_tag.get_text(" ", strip=True) if interpretation_tag else ""

            # Store the record if at least one field was found
            if any([title, arabic_text, persian_translate, interpretation]):
                # Pages whose title lacks the word 'حکمت' are treated as sub-parts of a wisdom
                is_subpart = 'حکمت' not in title
                data.append({
                    "title": title,
                    "url": url,
                    "arabic_text": arabic_text,
                    "persian_translate": persian_translate,
                    "Interpretation": interpretation,
                    "is-subpart": is_subpart
                })

            time.sleep(1)  # avoid putting too much load on the server
        except Exception as e:
            print(f'error crawling page {page}: {e}')
            continue

    # Save the collected records to a JSON file
    output_file = "./data/wisdom_data.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


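# Usage sketch for the wisdom crawler above (an assumption, not part of the
# original run path: the __main__ block at the bottom only runs the speech
# crawler). crawl_wisdoms() writes ./data/wisdom_data.json and appends the
# numbers of non-200 pages to ./data/failed-pages.txt, so ./data must exist
# before the call:
#
#     os.makedirs("./data", exist_ok=True)
#     crawl_wisdoms()

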
OUTPUT_FILE = "./output/speechs.json"
FAILED_FILE = "./data/failed-speech-pages.txt"


def crawl_speech_page(page):
    """Crawl a single sermon by its page number and return the list of its parts."""
    url = f"http://nahj.makarem.ir/speech/{page}"
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        raise Exception(f"status code {response.status_code}")

    soup = BeautifulSoup(response.text, "html.parser")

    # Sermon title
    title_tag = soup.find("h2", class_="text-center phrase-title")
    speech_title = title_tag.get_text(strip=True) if title_tag else ""

    # List of parts
    parts = []
    for idx, part_div in enumerate(soup.find_all("div", class_="phrase-text-container"), start=1):
        # Arabic text
        arabic_tag = part_div.find("p", class_="arabic-text show-off")
        arabic_text = arabic_tag.get_text(" ", strip=True) if arabic_tag else ""

        # Persian translation
        persian_tag = part_div.find("p", class_="translate-text")
        persian_translate = persian_tag.get_text(" ", strip=True) if persian_tag else ""

        # Interpretation link
        interpretation_link = ""
        ul_tag = part_div.find("ul", class_="tools")
        if ul_tag:
            first_li = ul_tag.find("li")
            if first_li:
                a_tag = first_li.find("a")
                if a_tag and a_tag.has_attr("href"):
                    interpretation_link = a_tag["href"]

        parts.append({
            "speech_title": speech_title,
            "part_order": idx,
            "url": url,
            "arabic_text": arabic_text,
            "persian_translate": persian_translate,
            "Interpretation_link": interpretation_link
        })

    return parts


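# Each element of the list returned by crawl_speech_page() has the shape below
# (the values shown are placeholders, not real crawled content):
#
#     {
#         "speech_title": "...",
#         "part_order": 1,
#         "url": "http://nahj.makarem.ir/speech/1",
#         "arabic_text": "...",
#         "persian_translate": "...",
#         "Interpretation_link": "..."
#     }

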
def save_failed(pages):
    with open(FAILED_FILE, "w", encoding="utf-8") as f:
        for p in pages:
            f.write(f"{p}\n")


def load_failed():
    if not os.path.exists(FAILED_FILE):
        return []
    with open(FAILED_FILE, "r", encoding="utf-8") as f:
        return [int(line.strip()) for line in f if line.strip().isdigit()]


def load_existing_data():
    if not os.path.exists(OUTPUT_FILE):
        return []
    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
        return json.load(f)


def save_data(data):
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def crawl_all_speeches(start=1, end=758):
    all_data = load_existing_data()
    failed = []

    for page in range(start, end + 1):
        try:
            print(f"Trying speech {page} ...")
            parts = crawl_speech_page(page)
            all_data.extend(parts)
            time.sleep(1)
        except Exception as e:
            print(f"❌ Failed speech {page} | error: {e}")
            failed.append(page)

    save_data(all_data)
    save_failed(failed)


def retry_until_success(start=1, end=758):
    """Repeat crawling until no failed pages remain."""
    crawl_all_speeches(start, end)

    while True:
        failed_pages = load_failed()
        if not failed_pages:
            print("✅ All speeches crawled successfully.")
            break

        print(f"🔄 Retrying {len(failed_pages)} failed pages ...")
        failed = []
        all_data = load_existing_data()

        for page in failed_pages:
            try:
                print(f"Retry speech {page} ...")
                parts = crawl_speech_page(page)
                all_data.extend(parts)
                time.sleep(1)
            except Exception as e:
                print(f"❌ Still failed {page} | error: {e}")
                failed.append(page)

        save_data(all_data)
        save_failed(failed)

        if not failed:
            print("✅ Finished. No failed pages remain.")
            break


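# retry_until_success() also accepts a narrower range for a quick smoke test
# (the 1..758 defaults above reflect the full range of speech pages this script
# assumes), e.g.:
#
#     retry_until_success(start=1, end=10)

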
if __name__ == "__main__":
    retry_until_success(1, 758)