feat: in-process crawler daemon with cron and worker pool (0.28.0)

The backend now boots an internal crawler daemon that runs a daily metadata pass (CRAWLER_DAILY_AT in CRAWLER_TZ, advisory-lock guarded for multi-replica safety) and drains SyncChapterContent jobs from crawler_jobs through a worker pool. Chromium launches lazily on the first job and is torn down after CRAWLER_IDLE_TIMEOUT_S seconds of inactivity.

Modules:
- crawler::browser_manager — lazy-launch / idle-teardown wrapper around browser::Handle, with an on_launch hook that re-injects PHPSESSID on every fresh Chromium spawn.
- crawler::pipeline — run_metadata_pass (the shared discover/upsert/cover/sync-chapters loop) and the enqueue_bookmarked_pending helper used by the cron tick.
- crawler::daemon — cron task + worker pool, behind two trait seams (MetadataPass, ChapterDispatcher) so tests can inject stubs without standing up Chromium or a live source.

Behavior:
- CRAWLER_DAEMON=false skips daemon spawn entirely (default for tests).
- Catch-up tick fires on startup if the last persisted slot was missed.
- A SyncOutcome::SessionExpired sets a sticky AtomicBool; workers idle until operator restart with a refreshed PHPSESSID.
- Worker dispatch is wrapped in catch_unwind so a panicking handler marks the job failed instead of taking down the worker.
- Migration 0015 adds a small crawler_state k-v table for the last_metadata_tick_at watermark.

Dep additions: chrono-tz (IANA TZ parsing).

CLI (bin/crawler) reuses pipeline::run_metadata_pass and now holds the browser via BrowserManager so the on_launch session injection flow stays in one place.

Inline chapter-content sync semantics are unchanged — the queue is for the daemon; force-refetches and manual backfills still bypass it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 20:32:02 +02:00
parent 93c7fd63fc
commit 9fe0f26d75
14 changed files with 2162 additions and 309 deletions
--- a/backend/src/config.rs
+++ b/backend/src/config.rs
@@ -1,4 +1,10 @@
 use std::path::PathBuf;
+use std::time::Duration;
+
+use chrono::NaiveTime;
+use chrono_tz::Tz;
+
+use crate::crawler::browser::LaunchOptions;

 #[derive(Clone, Debug)]
 pub struct AuthConfig {
@@ -45,6 +51,54 @@ pub struct Config {
    pub auth: AuthConfig,
    pub upload: UploadConfig,
    pub cors_allowed_origins: Vec<String>,
+    pub crawler: CrawlerConfig,
+}
+
+/// All crawler-daemon knobs read from env. Mirrors the env vars the
+/// `bin/crawler` binary already reads, plus the new daemon-only knobs
+/// (daily_at, tz, idle_timeout, retention_days, daemon_enabled).
+///
+/// `daemon_enabled = false` skips the daemon spawn entirely — used by
+/// integration tests and dev runs that don't want background activity.
+#[derive(Clone, Debug)]
+pub struct CrawlerConfig {
+    pub daemon_enabled: bool, // CRAWLER_DAEMON; `from_env` defaults to true, `Default` to false
+    pub daily_at: NaiveTime, // CRAWLER_DAILY_AT as HH:MM (24h); default 00:00
+    pub tz: Tz, // CRAWLER_TZ, an IANA zone name; default UTC
+    pub idle_timeout: Duration, // CRAWLER_IDLE_TIMEOUT_S, in seconds; default 600
+    pub chapter_workers: usize, // CRAWLER_CHAPTER_WORKERS; default 1, raised to 1 if set lower
+    pub retention_days: u32, // CRAWLER_JOB_RETENTION_DAYS; default 7
+    pub start_url: Option<String>, // CRAWLER_START_URL; None when unset or blank
+    pub rate_ms: u64, // CRAWLER_RATE_MS, in milliseconds; default 1000
+    pub cdn_host: Option<String>, // CRAWLER_CDN_HOST; None when unset or blank
+    pub cdn_rate_ms: u64, // CRAWLER_CDN_RATE_MS; falls back to the CRAWLER_RATE_MS value
+    pub phpsessid: Option<String>, // CRAWLER_PHPSESSID; None when unset or blank
+    pub cookie_domain: Option<String>, // CRAWLER_COOKIE_DOMAIN; None when unset or blank
+    pub user_agent: Option<String>, // CRAWLER_USER_AGENT; None when unset or blank
+    pub proxy: Option<String>, // CRAWLER_PROXY; None when unset or blank
+    pub browser: LaunchOptions, // filled by `LaunchOptions::from_env()` in `from_env`
+}
+
+impl Default for CrawlerConfig {
+    fn default() -> Self {
+        Self {
+            daemon_enabled: false, // off here; `from_env` turns it on by default
+            daily_at: NaiveTime::from_hms_opt(0, 0, 0).expect("midnight is a valid time"),
+            tz: Tz::UTC,
+            idle_timeout: Duration::from_secs(600), // 10 minutes
+            chapter_workers: 1,
+            retention_days: 7,
+            start_url: None,
+            rate_ms: 1000,
+            cdn_host: None,
+            cdn_rate_ms: 1000, // same value as `rate_ms`
+            phpsessid: None,
+            cookie_domain: None,
+            user_agent: None,
+            proxy: None,
+            browser: LaunchOptions::headless(), // fixed headless, not `LaunchOptions::from_env()`
+        }
+    }
 }

 impl Config {
@@ -77,10 +131,65 @@ impl Config {
                        .collect()
                })
                .unwrap_or_default(),
+            crawler: CrawlerConfig::from_env()?,
        })
    }
 }

+impl CrawlerConfig {
+    /// Builds the crawler configuration from `CRAWLER_*` environment variables.
+    ///
+    /// # Errors
+    ///
+    /// Fails fast when `CRAWLER_DAILY_AT` is not `HH:MM` (24h) or `CRAWLER_TZ` is not a
+    /// valid IANA zone. Malformed numbers are not errors: `env_u64` uses the default.
+    pub fn from_env() -> anyhow::Result<Self> {
+        // Optional string knobs: a blank (whitespace-only) value counts as unset.
+        let non_blank = |name: &str| std::env::var(name).ok().filter(|s| !s.trim().is_empty());
+        // Trimmed first so whitespace-only means unset here too, as for the string knobs.
+        let daily_at = match std::env::var("CRAWLER_DAILY_AT").ok().as_deref().map(str::trim) {
+            None | Some("") => NaiveTime::from_hms_opt(0, 0, 0).expect("midnight is a valid time"),
+            Some(raw) => NaiveTime::parse_from_str(raw, "%H:%M").map_err(|e| {
+                anyhow::anyhow!("CRAWLER_DAILY_AT must be HH:MM (got {raw:?}): {e}")
+            })?,
+        };
+        let tz: Tz = match std::env::var("CRAWLER_TZ").ok().as_deref().map(str::trim) {
+            None | Some("") => Tz::UTC,
+            Some(raw) => raw.parse::<Tz>().map_err(|e| {
+                anyhow::anyhow!("CRAWLER_TZ must be a valid IANA TZ (got {raw:?}): {e}")
+            })?,
+        };
+        let rate_ms = env_u64("CRAWLER_RATE_MS", 1000);
+        let workers = env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1);
+        let retention = env_u64("CRAWLER_JOB_RETENTION_DAYS", 7);
+        Ok(Self {
+            daemon_enabled: env_bool("CRAWLER_DAEMON", true),
+            daily_at,
+            tz,
+            idle_timeout: Duration::from_secs(env_u64("CRAWLER_IDLE_TIMEOUT_S", 600)),
+            // Clamp before narrowing: a plain `as` cast wraps, so 2^32 days would become 0.
+            chapter_workers: workers.min(usize::MAX as u64) as usize,
+            retention_days: retention.min(u64::from(u32::MAX)) as u32,
+            start_url: non_blank("CRAWLER_START_URL"),
+            rate_ms,
+            cdn_host: non_blank("CRAWLER_CDN_HOST"),
+            cdn_rate_ms: env_u64("CRAWLER_CDN_RATE_MS", rate_ms),
+            phpsessid: non_blank("CRAWLER_PHPSESSID"),
+            cookie_domain: non_blank("CRAWLER_COOKIE_DOMAIN"),
+            user_agent: non_blank("CRAWLER_USER_AGENT"),
+            proxy: non_blank("CRAWLER_PROXY"),
+            browser: LaunchOptions::from_env(),
+        })
+    }
+}
+
+/// Reads the environment variable `name` as a `u64`.
+///
+/// Yields `default` when the variable is unset, not valid Unicode, or not a valid `u64`.
+fn env_u64(name: &str, default: u64) -> u64 {
+    std::env::var(name).map_or(default, |raw| raw.parse().unwrap_or(default))
+}
+
 fn env_bool(name: &str, default: bool) -> bool {
    match std::env::var(name).ok().as_deref() {
        Some("1") | Some("true") | Some("TRUE") | Some("yes") => true,