Advanced crawler for Nahj speeches
This commit is contained in:
parent 9ee755bb39
commit 3d78d3623d

crawler.py | 249 lines changed
@@ -1,8 +1,14 @@
-import requests
-from bs4 import BeautifulSoup
+import os
 import json
 import time
-import os
+import requests
+from bs4 import BeautifulSoup

+BASE_URL = "http://nahj.makarem.ir"
+DATA_DIR = "./output"
+TITLES_FILE = os.path.join(DATA_DIR, "speech-titles.json")
+FAILED_FILE = os.path.join(DATA_DIR, "failed-speech-pages.txt")
+OUTPUT_FILE = os.path.join(DATA_DIR, "nahj_speeches.json")
+
 def crawl_wisdoms():
     data = []
@@ -67,40 +73,122 @@ def crawl_wisdoms():



-OUTPUT_FILE = "./output/speechs.json"
-FAILED_FILE = "./data/failed-speech-pages.txt"
-
-
-def crawl_speech_page(page):
-    """Crawl one speech by page number and return its list of parts"""
-    url = f"http://nahj.makarem.ir/speech/{page}"
-    response = requests.get(url, timeout=10)
-    if response.status_code != 200:
-        raise Exception(f"status code {response.status_code}")
-
-    soup = BeautifulSoup(response.text, "html.parser")
-
-    # Speech title
-    title_tag = soup.find("h2", class_="text-center phrase-title")
-    speech_title = title_tag.get_text(strip=True) if title_tag else ""
-
-    # List of parts
-    parts = []
-    for idx, part_div in enumerate(soup.find_all("div", class_="phrase-text-container"), start=1):
-        # Arabic text
-        arabic_tag = part_div.find("p", class_="arabic-text show-off")
-        arabic_text = arabic_tag.get_text(" ", strip=True) if arabic_tag else ""
-
-        # Persian translation
-        persian_tag = part_div.find("p", class_="translate-text")
-        persian_translate = persian_tag.get_text(" ", strip=True) if persian_tag else ""
-
-        # Interpretation link
-        interpretation_link = ""
-        ul_tag = part_div.find("ul", class_="tools")
-        if ul_tag:
-            first_li = ul_tag.find("li")
-            if first_li:
+def fetch_page(url):
+    """Fetch a page, with retry and error handling"""
+    try:
+        response = requests.get(url, timeout=15)
+        response.raise_for_status()
+        return response.text
+    except Exception as e:
+        print(f"[ERROR] {url} => {e}")
+        return None
+
+
+def crawl_titles():
+    """Crawl the speech titles from the index pages"""
+    all_titles = []
+    speech_counter = 1
+
+    for page in range(1, 12):  # pages 1 to 11
+        url = f"{BASE_URL}/speech/page/{page}"
+        print(f"[INFO] Crawling titles page {page} => {url}")
+        html = fetch_page(url)
+        if not html:
+            continue
+
+        soup = BeautifulSoup(html, "html.parser")
+        cards = soup.select("h5.card-title a")
+        for a in cards:
+            title = a.get_text(strip=True)
+            href = a.get("href")
+            if not href.startswith("http"):
+                href = BASE_URL + href
+            all_titles.append({
+                "speech_number": speech_counter,
+                "speech_title_large": title,
+                "url": href
+            })
+            speech_counter += 1
+
+        time.sleep(1)
+
+    # Save to file
+    os.makedirs(DATA_DIR, exist_ok=True)
+    with open(TITLES_FILE, "w", encoding="utf-8") as f:
+        json.dump(all_titles, f, ensure_ascii=False, indent=2)
+
+    print(f"[DONE] Saved {len(all_titles)} speech titles to {TITLES_FILE}")
+    return all_titles
+
+
+def load_failed():
+    """Load the previously failed pages"""
+    if os.path.exists(FAILED_FILE):
+        with open(FAILED_FILE, "r", encoding="utf-8") as f:
+            return set(line.strip() for line in f if line.strip())
+    return set()
+
+
+def save_failed(failed_urls):
+    """Save the failed pages to a file (cumulative)"""
+    # previous_failed = load_failed()
+    # updated_failed = previous_failed.union(failed_urls)
+    with open(FAILED_FILE, "w", encoding="utf-8") as f:
+        for url in sorted(failed_urls):
+            f.write(url + "\n")
+
+
+def load_existing_results():
+    """Load the previously saved data"""
+    if os.path.exists(OUTPUT_FILE):
+        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
+            return json.load(f)
+    return []
+
+
+def crawl_speech_parts(speeches):
+    """Crawl all speech parts, retrying until everything has succeeded"""
+    all_results = load_existing_results()
+    done_urls = {rec["url"] for rec in all_results}
+    failed = load_failed()
+
+    while True:
+        # Pick only the speeches that have not been crawled yet
+        pending = [s for s in speeches if s["url"] not in done_urls]
+
+        # If nothing is left, stop
+        if not pending:
+            print("[SUCCESS] All speeches crawled successfully.")
+            if os.path.exists(FAILED_FILE):
+                os.remove(FAILED_FILE)  # everything succeeded -> clear the failures file
+            break
+
+        print(f"[INFO] {len(pending)} speeches remaining...")
+
+        new_failed = set()
+
+        for speech in pending:
+            speech_number = speech["speech_number"]
+            speech_title_large = speech["speech_title_large"]
+            url = speech["url"]
+
+            print(f"[INFO] Crawling speech {speech_number}: {speech_title_large}")
+            html = fetch_page(url)
+            if not html:
+                new_failed.add(url)
+                continue
+
+            soup = BeautifulSoup(html, "html.parser")
+            try:
+                speech_title = soup.find("h2", class_="text-center phrase-title").get_text()
+                parts = soup.find_all("div", class_="phrase-text-container")
+
+                for idx, part in enumerate(parts, start=1):
+                    arabic_text = part.find("p", class_="arabic-text show-off").get_text()
+                    persian_translate = part.find("p", class_="translate-text").get_text()
+                    # Interpretation link
+                    interpretation_link = ""
+                    ul_tag = part.find("ul", class_="tools")
+                    if ul_tag:
+                        first_li = ul_tag.find("li")
+                        if first_li:
@@ -108,92 +196,51 @@ def crawl_speech_page(page):
-                a_tag = first_li.find("a")
-                if a_tag and a_tag.has_attr("href"):
-                    interpretation_link = a_tag["href"]
-
-        parts.append({
-            "speech_title": speech_title,
-            "part_order": idx,
-            "url": url,
-            "arabic_text": arabic_text,
-            "persian_translate": persian_translate,
-            "Interpretation_link": interpretation_link
-        })
-
-    return parts
-
-
-def save_failed(pages):
-    with open(FAILED_FILE, "w", encoding="utf-8") as f:
-        for p in pages:
-            f.write(f"{p}\n")
-
-
-def load_failed():
-    if not os.path.exists(FAILED_FILE):
-        return []
-    with open(FAILED_FILE, "r", encoding="utf-8") as f:
-        return [int(line.strip()) for line in f if line.strip().isdigit()]
-
-
-def load_existing_data():
-    if not os.path.exists(OUTPUT_FILE):
-        return []
-    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
-        return json.load(f)
-
-
-def save_data(data):
-    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
-        json.dump(data, f, ensure_ascii=False, indent=4)
-
-
-def crawl_all_speeches(start=1, end=758):
-    all_data = load_existing_data()
-    failed = []
-
-    for page in range(start, end + 1):
-        try:
-            print(f"Trying speech {page} ...")
-            parts = crawl_speech_page(page)
-            all_data.extend(parts)
-            time.sleep(1)
-        except Exception as e:
-            print(f"❌ Failed speech {page} | error: {e}")
-            failed.append(page)
-
-    save_data(all_data)
-    save_failed(failed)
-
-
-def retry_until_success(start=1, end=758):
-    """Repeat until no page remains failed"""
-    crawl_all_speeches(start, end)
-
-    while True:
-        failed_pages = load_failed()
-        if not failed_pages:
-            print("✅ All speeches crawled successfully.")
-            break
-
-        print(f"🔄 Retrying {len(failed_pages)} failed pages ...")
-        failed = []
-        all_data = load_existing_data()
-
-        for page in failed_pages:
-            try:
-                print(f"Retry speech {page} ...")
-                parts = crawl_speech_page(page)
-                all_data.extend(parts)
-                time.sleep(1)
-            except Exception as e:
-                print(f"❌ Still failed {page} | error: {e}")
-                failed.append(page)
-
-        save_data(all_data)
-        save_failed(failed)
-
-        if not failed:
-            print("✅ Finished. No failed pages remain.")
-            break
+                        a_tag = first_li.find("a")
+                        if a_tag and a_tag.has_attr("href"):
+                            interpretation_link = a_tag["href"]
+
+                    record = {
+                        "speech_title_large": speech_title_large.strip(),
+                        "speech_title_page": speech_title.strip(),
+                        "speech_number": speech_number,
+                        "part_order": idx,
+                        "url": url,
+                        "arabic_text": arabic_text.strip(),
+                        "persian_translate": persian_translate.strip(),
+                        "Interpretation_link": BASE_URL + interpretation_link
+                    }
+                    all_results.append(record)
+
+                done_urls.add(url)
+                print(f"[OK] Crawled {url}")
+
+            except Exception as e:
+                print(f"[ERROR parsing] {url} => {e}")
+                new_failed.add(url)
+
+            time.sleep(1)
+
+            # Save the data
+            with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+                json.dump(all_results, f, ensure_ascii=False, indent=2)
+
+        # Update the failures list
+        if new_failed:
+            save_failed(new_failed)
+            print(f"[WARN] {len(new_failed)} speeches failed in this round.")
+        else:
+            if os.path.exists(FAILED_FILE):
+                os.remove(FAILED_FILE)
+
+
+def main():
+    # If the titles file does not exist, crawl it
+    if not os.path.exists(TITLES_FILE):
+        speeches = crawl_titles()
+    else:
+        with open(TITLES_FILE, "r", encoding="utf-8") as f:
+            speeches = json.load(f)
+
+    # Start crawling the parts with continuous retry
+    crawl_speech_parts(speeches)


 if __name__ == "__main__":
-    retry_until_success(1, 758)
+    main()
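Note on retries: the docstring of fetch_page mentions retry, but the function itself makes a single attempt; retrying is handled one level up, where crawl_speech_parts keeps looping over the URLs that are still pending or failed. If per-request retries were also wanted, a minimal sketch could look like the following (fetch_page_with_retry, max_attempts, and backoff are illustrative names, not part of this commit):

    def fetch_page_with_retry(url, max_attempts=3, backoff=2):
        """Hypothetical helper: retry a failed request a few times before giving up."""
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(url, timeout=15)
                response.raise_for_status()
                return response.text
            except Exception as e:
                print(f"[WARN] attempt {attempt}/{max_attempts} failed for {url} => {e}")
                time.sleep(backoff * attempt)  # simple linear backoff between attempts
        return None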
output/nahj_speeches.json | 7562 lines (new file)
File diff suppressed because it is too large.

output/speech-titles.json | 1207 lines (new file)
File diff suppressed because it is too large.
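The two suppressed files are the crawler's outputs: output/speech-titles.json (the index written by crawl_titles) and output/nahj_speeches.json (one record per speech part, using the keys shown in the diff above). A short consumer sketch, assuming those paths and record keys:

    import json
    from collections import defaultdict

    with open("./output/nahj_speeches.json", encoding="utf-8") as f:
        records = json.load(f)

    # Group the flat part records back into speeches, ordered by part_order.
    parts_by_speech = defaultdict(list)
    for rec in records:
        parts_by_speech[rec["speech_number"]].append(rec)

    for number, parts in sorted(parts_by_speech.items())[:3]:
        parts.sort(key=lambda r: r["part_order"])
        print(number, parts[0]["speech_title_large"], f"({len(parts)} parts)")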