//! Crawler binary.
//!
//! Now an ops escape hatch sitting alongside the in-process daemon: walks
//! the source's manga listing (all pages), fetches each manga's metadata +
//! chapter list, downloads covers, reconciles chapters — and then, for any
//! chapter belonging to a bookmarked manga whose `page_count` is still 0,
//! fetches the chapter pages inline. The daemon does the same work through
//! `crawler_jobs`; the CLI is kept around for force-refetches and manual
//! backfills.
//!
//! Configuration mirrors the daemon's `CRAWLER_*` env vars (see
//! `crate::config::CrawlerConfig`) plus the CLI-only:
//! - **Start URL**: first CLI positional arg, else `$CRAWLER_START_URL`.
//! - **Skip chapters / chapter content / force re-fetch / keep browser**:
//!   `CRAWLER_SKIP_CHAPTERS`, `CRAWLER_SKIP_CHAPTER_CONTENT`,
//!   `CRAWLER_FORCE_REFETCH_CHAPTERS`, `CRAWLER_KEEP_BROWSER_OPEN`.
//! - **Limit**: `CRAWLER_LIMIT` (max manga detail fetches per run).
//!
//! See `crawler::pipeline::run_metadata_pass` for the shared metadata
//! flow.
use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; use anyhow::{anyhow, Context}; use futures_util::stream::{self, StreamExt}; use mangalord::crawler::browser::{BrowserMode, LaunchOptions}; use mangalord::crawler::browser_manager::{self, BrowserManager}; use mangalord::crawler::content::{self, SyncOutcome}; use mangalord::crawler::pipeline; use mangalord::crawler::rate_limit::HostRateLimiters; use mangalord::crawler::session; use mangalord::crawler::source::DiscoverMode; use mangalord::storage::{LocalStorage, Storage}; use sqlx::postgres::PgPoolOptions; use sqlx::PgPool; use tracing_subscriber::EnvFilter; use uuid::Uuid; #[tokio::main] async fn main() -> anyhow::Result<()> { dotenvy::dotenv().ok(); tracing_subscriber::fmt() .with_env_filter( EnvFilter::try_from_default_env().unwrap_or_else(|_| { "info,mangalord=debug,chromiumoxide::conn=off,chromiumoxide::handler=off" .into() }), ) .init(); let start_url = resolve_start_url()?; let database_url = std::env::var("DATABASE_URL") .map_err(|_| anyhow!("DATABASE_URL must be set"))?; let storage_dir: PathBuf = std::env::var("STORAGE_DIR") .unwrap_or_else(|_| "./data/storage".to_string()) .into(); let rate_ms = env_u64("CRAWLER_RATE_MS", 1000); let cdn_host = std::env::var("CRAWLER_CDN_HOST") .ok() .filter(|s| !s.trim().is_empty()); let cdn_rate_ms = env_u64("CRAWLER_CDN_RATE_MS", rate_ms); let limit = env_u64("CRAWLER_LIMIT", 0) as usize; let skip_chapters = env_bool("CRAWLER_SKIP_CHAPTERS", false); let incremental_stop_after = env_u64("CRAWLER_INCREMENTAL_STOP_AFTER", 20).max(1) as usize; let mode = parse_crawler_mode(incremental_stop_after)?; let skip_chapter_content = env_bool("CRAWLER_SKIP_CHAPTER_CONTENT", false); let chapter_workers = env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize; let force_refetch_chapters = env_bool("CRAWLER_FORCE_REFETCH_CHAPTERS", false); let phpsessid = std::env::var("CRAWLER_PHPSESSID") .ok() .filter(|s| !s.trim().is_empty()); let cookie_domain = 
std::env::var("CRAWLER_COOKIE_DOMAIN") .ok() .filter(|s| !s.trim().is_empty()) .or_else(|| session::registrable_domain(&start_url)); let user_agent = std::env::var("CRAWLER_USER_AGENT") .ok() .filter(|s| !s.trim().is_empty()); let proxy_url = std::env::var("CRAWLER_PROXY") .ok() .filter(|s| !s.trim().is_empty()); let keep_browser_open = env_bool("CRAWLER_KEEP_BROWSER_OPEN", false); let db = PgPoolOptions::new() .max_connections(5) .connect(&database_url) .await .context("connect to database")?; sqlx::migrate!("./migrations").run(&db).await?; let storage: Arc = Arc::new(LocalStorage::new(&storage_dir)); let cookie_jar = Arc::new(reqwest::cookie::Jar::default()); if let (Some(sid), Some(domain)) = (&phpsessid, &cookie_domain) { let cookie_str = format!("PHPSESSID={sid}; Domain={domain}; Path=/"); let seed_url = reqwest::Url::parse(&start_url).context("parse start URL for cookie seed")?; cookie_jar.add_cookie_str(&cookie_str, &seed_url); tracing::info!(domain, "seeded PHPSESSID into reqwest cookie jar"); } let mut http_builder = reqwest::Client::builder() .timeout(Duration::from_secs(30)) .no_proxy() .cookie_provider(cookie_jar); if let Some(ua) = &user_agent { http_builder = http_builder.user_agent(ua); } if let Some(proxy) = &proxy_url { http_builder = http_builder .proxy(reqwest::Proxy::all(proxy).with_context(|| format!("parse proxy URL: {proxy}"))?); } let http = http_builder.build().context("build http client")?; let mut options = LaunchOptions::from_env(); if let Some(proxy) = &proxy_url { options.extra_args.push(format!("--proxy-server={proxy}")); } let keep_open = match (keep_browser_open, options.mode) { (true, BrowserMode::Headed) => true, (true, BrowserMode::Headless) => { tracing::warn!( "CRAWLER_KEEP_BROWSER_OPEN ignored in headless mode (no window to inspect)" ); false } _ => false, }; tracing::info!( ?options, %start_url, rate_ms, cdn_host = ?cdn_host, cdn_rate_ms, limit, skip_chapters, skip_chapter_content, chapter_workers, force_refetch_chapters, 
phpsessid_set = phpsessid.is_some(), cookie_domain = ?cookie_domain, user_agent = ?user_agent, proxy = ?proxy_url, keep_open, ?mode, storage_dir = %storage_dir.display(), "starting crawler" ); // BrowserManager with idle_timeout = ZERO so the CLI keeps Chromium // alive for the entire run — same lifecycle as the old direct // `browser::launch()` flow. on_launch re-injects PHPSESSID + runs the // session probe; bad cookies fail fast before any real work happens. let on_launch: browser_manager::OnLaunch = match (&phpsessid, &cookie_domain) { (Some(sid), Some(domain)) => { let sid = sid.clone(); let domain = domain.clone(); let start_url_clone = start_url.clone(); Arc::new(move |browser| { let sid = sid.clone(); let domain = domain.clone(); let start_url = start_url_clone.clone(); Box::pin(async move { session::inject_phpsessid(&browser, &sid, &domain) .await .context("inject_phpsessid")?; session::verify_session(&browser, &start_url) .await .context("verify_session")?; Ok(()) }) }) } _ => browser_manager::noop_on_launch(), }; let session_ready = phpsessid.is_some() && cookie_domain.is_some(); let manager = BrowserManager::new(options, Duration::ZERO, on_launch); let result = run( Arc::clone(&manager), &db, Arc::clone(&storage), &http, &start_url, rate_ms, cdn_host.as_deref(), cdn_rate_ms, limit, skip_chapters, skip_chapter_content || !session_ready, chapter_workers, force_refetch_chapters, mode, ) .await; if keep_open { tracing::info!( "crawler finished; browser kept open. Press Ctrl+C to close and exit." 
); let _ = tokio::signal::ctrl_c().await; tracing::info!("Ctrl+C received; closing browser"); } manager.shutdown().await; result } #[allow(clippy::too_many_arguments)] async fn run( manager: Arc, db: &PgPool, storage: Arc, http: &reqwest::Client, start_url: &str, rate_ms: u64, cdn_host: Option<&str>, cdn_rate_ms: u64, limit: usize, skip_chapters: bool, skip_chapter_content: bool, chapter_workers: usize, force_refetch_chapters: bool, mode: DiscoverMode, ) -> anyhow::Result<()> { let mut rate = HostRateLimiters::new(Duration::from_millis(rate_ms)); if let Some(host) = cdn_host { rate = rate.with_override(host, Duration::from_millis(cdn_rate_ms)); } let rate = Arc::new(rate); let stats = pipeline::run_metadata_pass( manager.as_ref(), db, storage.as_ref(), http, rate.as_ref(), start_url, limit, skip_chapters, mode, ) .await?; tracing::info!(?stats, "metadata pass complete"); if !skip_chapter_content { sync_bookmarked_chapter_content( Arc::clone(&manager), db, Arc::clone(&storage), http, Arc::clone(&rate), "target", chapter_workers, force_refetch_chapters, ) .await?; } Ok(()) } /// Find every chapter whose manga is bookmarked by at least one user and /// that hasn't been content-synced yet, then fan them out across `workers` /// concurrent tasks. Same as before except the browser comes from a /// BrowserManager lease so it interleaves cleanly with the metadata pass. /// /// A `SessionExpired` result aborts the phase. 
#[allow(clippy::too_many_arguments)]
async fn sync_bookmarked_chapter_content(
    manager: Arc<BrowserManager>,
    db: &PgPool,
    storage: Arc<dyn Storage>,
    http: &reqwest::Client,
    rate: Arc<HostRateLimiters>,
    source_id: &str,
    workers: usize,
    force_refetch: bool,
) -> anyhow::Result<()> {
    // Rows are (chapter id, manga id, source URL). The DISTINCT collapses
    // the duplicates produced by a manga having several bookmarks. With
    // `force_refetch` ($2) set, chapters that already have pages are
    // selected too.
    let pending: Vec<(Uuid, Uuid, String)> = sqlx::query_as(
        r#"
        SELECT id, manga_id, source_url
        FROM (
            SELECT DISTINCT c.id, c.manga_id, c.created_at, cs.source_url
            FROM chapters c
            JOIN bookmarks b ON b.manga_id = c.manga_id
            JOIN chapter_sources cs ON cs.chapter_id = c.id
            WHERE cs.source_id = $1
              AND cs.dropped_at IS NULL
              AND (c.page_count = 0 OR $2)
        ) sub
        ORDER BY manga_id, created_at ASC
        "#,
    )
    .bind(source_id)
    .bind(force_refetch)
    .fetch_all(db)
    .await
    .context("query pending chapter content")?;

    if pending.is_empty() {
        tracing::info!("chapter content: nothing pending");
        return Ok(());
    }
    tracing::info!(count = pending.len(), workers, "chapter content phase starting");

    // Set by the first task that sees an expired session; tasks that start
    // afterwards return immediately without touching the browser.
    let session_expired = Arc::new(std::sync::atomic::AtomicBool::new(false));
    let stats = std::sync::Mutex::new(WorkerStats::default());

    stream::iter(pending.into_iter())
        .for_each_concurrent(workers.max(1), |(chapter_id, manga_id, source_url)| {
            let session_expired = Arc::clone(&session_expired);
            let storage = Arc::clone(&storage);
            let rate = Arc::clone(&rate);
            let manager = Arc::clone(&manager);
            let stats = &stats;
            async move {
                if session_expired.load(std::sync::atomic::Ordering::Relaxed) {
                    return;
                }
                let lease = match manager.acquire().await {
                    Ok(l) => l,
                    Err(e) => {
                        tracing::error!(%chapter_id, error = ?e, "browser acquire failed");
                        stats.lock().unwrap().failed += 1;
                        return;
                    }
                };
                let outcome = content::sync_chapter_content(
                    &lease,
                    db,
                    storage.as_ref(),
                    http,
                    rate.as_ref(),
                    chapter_id,
                    manga_id,
                    &source_url,
                    force_refetch,
                )
                .await;
                // Release the lease before bookkeeping.
                drop(lease);

                // The std mutex is only ever locked for a single counter
                // bump and never held across an `.await`.
                match outcome {
                    Ok(SyncOutcome::Fetched { pages }) => {
                        tracing::info!(%chapter_id, pages, "chapter content fetched");
                        stats.lock().unwrap().fetched += 1;
                    }
                    Ok(SyncOutcome::Skipped) => {
                        stats.lock().unwrap().skipped += 1;
                    }
                    Ok(SyncOutcome::SessionExpired) => {
                        tracing::error!(
                            %chapter_id,
                            "session expired mid-run — refresh CRAWLER_PHPSESSID and re-run"
                        );
                        session_expired
                            .store(true, std::sync::atomic::Ordering::Relaxed);
                    }
                    Err(e) => {
                        tracing::warn!(
                            %chapter_id,
                            error = ?e,
                            "chapter content sync failed"
                        );
                        stats.lock().unwrap().failed += 1;
                    }
                }
            }
        })
        .await;

    let total = stats.into_inner().unwrap();
    tracing::info!(
        fetched = total.fetched,
        skipped = total.skipped,
        failed = total.failed,
        "chapter content phase done"
    );

    if session_expired.load(std::sync::atomic::Ordering::Relaxed) {
        anyhow::bail!("session expired during chapter content phase");
    }
    Ok(())
}

/// Per-phase counters for the chapter content workers.
#[derive(Default, Clone, Copy)]
struct WorkerStats {
    /// Chapters whose content was fetched.
    fetched: usize,
    /// Chapters the sync reported as skipped.
    skipped: usize,
    /// Chapters that failed (browser acquire error or sync error).
    failed: usize,
}

/// Resolve the crawl start URL: first CLI positional argument, else
/// `$CRAWLER_START_URL`.
///
/// Blank (empty or whitespace-only) values are treated as absent, so a
/// blank CLI argument falls through to the environment variable instead of
/// being handed on as an unusable URL.
///
/// # Errors
///
/// Returns an error when neither source provides a non-blank value.
fn resolve_start_url() -> anyhow::Result<String> {
    std::env::args()
        .nth(1)
        .filter(|arg| !arg.trim().is_empty())
        .or_else(|| {
            std::env::var("CRAWLER_START_URL")
                .ok()
                .filter(|url| !url.trim().is_empty())
        })
        .ok_or_else(|| {
            anyhow!(
                "start URL is required — pass as first CLI arg or set $CRAWLER_START_URL"
            )
        })
}

/// Parse the CLI's `CRAWLER_MODE`. Defaults to `backfill` because the
/// binary is operator-driven (manual reseeds, force-refetches) — the
/// auto-detect logic lives in the daemon. `auto` is rejected because
/// the CLI has no DB state to consult before the run.
fn parse_crawler_mode(incremental_stop_after: usize) -> anyhow::Result<DiscoverMode> {
    parse_crawler_mode_str(
        std::env::var("CRAWLER_MODE").ok().as_deref(),
        incremental_stop_after,
    )
}

/// Pure variant of [`parse_crawler_mode`] — testable without env-var
/// mutation.
fn parse_crawler_mode_str( raw: Option<&str>, incremental_stop_after: usize, ) -> anyhow::Result { match raw.map(|s| s.trim().to_ascii_lowercase()).as_deref() { None | Some("") | Some("backfill") => Ok(DiscoverMode::Backfill), Some("incremental") => Ok(DiscoverMode::Incremental { stop_after_unchanged: incremental_stop_after, }), Some("auto") => Err(anyhow!( "CRAWLER_MODE=auto isn't supported by the CLI (use backfill or incremental); \ the daemon does auto-detection" )), Some(other) => Err(anyhow!( "CRAWLER_MODE must be one of: backfill, incremental (got {other:?})" )), } } fn env_u64(name: &str, default: u64) -> u64 { std::env::var(name) .ok() .and_then(|s| s.parse().ok()) .unwrap_or(default) } fn env_bool(name: &str, default: bool) -> bool { match std::env::var(name).ok().as_deref() { Some("1") | Some("true") | Some("TRUE") | Some("yes") => true, Some("0") | Some("false") | Some("FALSE") | Some("no") => false, _ => default, } } #[cfg(test)] mod tests { use super::*; #[test] fn cli_mode_defaults_to_backfill_when_unset_or_blank() { let none = parse_crawler_mode_str(None, 20).unwrap(); assert!(matches!(none, DiscoverMode::Backfill)); let blank = parse_crawler_mode_str(Some(""), 20).unwrap(); assert!(matches!(blank, DiscoverMode::Backfill)); } #[test] fn cli_mode_recognizes_backfill_and_incremental() { let backfill = parse_crawler_mode_str(Some("backfill"), 20).unwrap(); assert!(matches!(backfill, DiscoverMode::Backfill)); let incremental = parse_crawler_mode_str(Some("incremental"), 9).unwrap(); assert!(matches!( incremental, DiscoverMode::Incremental { stop_after_unchanged: 9 } )); } #[test] fn cli_mode_rejects_auto_explicitly() { let err = parse_crawler_mode_str(Some("auto"), 20).unwrap_err(); let msg = format!("{err}"); assert!( msg.contains("daemon"), "rejection should point operator at the daemon: {msg}" ); } #[test] fn cli_mode_rejects_unknown_value() { let err = parse_crawler_mode_str(Some("garbage"), 20).unwrap_err(); let msg = format!("{err}"); 
assert!(msg.contains("backfill")); assert!(msg.contains("incremental")); } #[test] fn cli_mode_is_case_insensitive_and_trims() { let mixed = parse_crawler_mode_str(Some(" Incremental "), 4).unwrap(); assert!(matches!( mixed, DiscoverMode::Incremental { stop_after_unchanged: 4 } )); } }