feat: in-process crawler daemon with cron and worker pool (0.28.0)

The backend now boots an internal crawler daemon that runs a daily
metadata pass (CRAWLER_DAILY_AT in CRAWLER_TZ, advisory-lock guarded
for multi-replica safety) and drains SyncChapterContent jobs from
crawler_jobs through a worker pool. Chromium launches lazily on first
job and is torn down after CRAWLER_IDLE_TIMEOUT_S seconds of inactivity.

Modules:
- crawler::browser_manager — lazy-launch / idle-teardown wrapper
  around browser::Handle, with an on_launch hook that re-injects
  PHPSESSID on every fresh Chromium spawn.
- crawler::pipeline — run_metadata_pass (the shared discover/upsert
  /cover/sync-chapters loop) and the enqueue_bookmarked_pending helper
  used by the cron tick.
- crawler::daemon — cron task + worker pool, behind two trait seams
  (MetadataPass, ChapterDispatcher) so tests can inject stubs without
  standing up Chromium or a live source.

Behavior:
- CRAWLER_DAEMON=false skips daemon spawn entirely (default for tests).
- Catch-up tick fires on startup if the last persisted slot was missed.
- A SyncOutcome::SessionExpired sets a sticky AtomicBool; workers
  idle until operator restart with a refreshed PHPSESSID.
- Worker dispatch wrapped in catch_unwind so a panicking handler
  marks the job failed instead of taking down the worker.
- Migration 0015 adds a small crawler_state k-v table for the
  last_metadata_tick_at watermark.

Dep additions: chrono-tz (IANA TZ parsing).

CLI (bin/crawler) reuses pipeline::run_metadata_pass and now holds
the browser via BrowserManager so the on_launch session injection
flow stays in one place. Inline chapter-content sync semantics are
unchanged — the queue is for the daemon; force-refetches and manual
backfills still bypass it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-25 20:32:02 +02:00
parent 93c7fd63fc
commit 9fe0f26d75
14 changed files with 2162 additions and 309 deletions

View File

@@ -1,4 +1,10 @@
use std::path::PathBuf;
use std::time::Duration;
use chrono::NaiveTime;
use chrono_tz::Tz;
use crate::crawler::browser::LaunchOptions;
#[derive(Clone, Debug)]
pub struct AuthConfig {
@@ -45,6 +51,54 @@ pub struct Config {
pub auth: AuthConfig,
pub upload: UploadConfig,
pub cors_allowed_origins: Vec<String>,
pub crawler: CrawlerConfig,
}
/// All crawler-daemon knobs read from env. Mirrors the env vars the
/// `bin/crawler` binary already reads, plus the new daemon-only knobs
/// (daily_at, tz, idle_timeout, retention_days, daemon_enabled).
///
/// `daemon_enabled = false` skips the daemon spawn entirely — used by
/// integration tests and dev runs that don't want background activity.
///
/// Note that the two constructors disagree on `daemon_enabled`:
/// `CrawlerConfig::from_env` defaults it to `true` when `CRAWLER_DAEMON`
/// is unset, while `Default::default()` sets it to `false`.
#[derive(Clone, Debug)]
pub struct CrawlerConfig {
/// Whether the in-process daemon is spawned (`CRAWLER_DAEMON`).
pub daemon_enabled: bool,
/// Wall-clock time of the daily pass, parsed from `CRAWLER_DAILY_AT`
/// as `HH:MM`; 00:00 when the variable is unset or empty.
pub daily_at: NaiveTime,
/// Time zone `daily_at` is expressed in (`CRAWLER_TZ`); UTC when the
/// variable is unset or empty.
pub tz: Tz,
/// Idle period built from `CRAWLER_IDLE_TIMEOUT_S` seconds (default 600).
/// NOTE(review): presumably the inactivity window after which the
/// browser is torn down — confirm against the daemon code.
pub idle_timeout: Duration,
/// Number of chapter workers (`CRAWLER_CHAPTER_WORKERS`); `from_env`
/// floors this at 1.
pub chapter_workers: usize,
/// Value of `CRAWLER_JOB_RETENTION_DAYS` (default 7).
/// NOTE(review): presumably how long finished jobs are kept — confirm.
pub retention_days: u32,
/// `CRAWLER_START_URL`; `None` when unset or whitespace-only.
pub start_url: Option<String>,
/// `CRAWLER_RATE_MS` (default 1000).
/// NOTE(review): presumably the delay between source requests in
/// milliseconds — confirm against the consumer.
pub rate_ms: u64,
/// `CRAWLER_CDN_HOST`; `None` when unset or whitespace-only.
pub cdn_host: Option<String>,
/// `CRAWLER_CDN_RATE_MS`; falls back to `CRAWLER_RATE_MS`, then 1000.
pub cdn_rate_ms: u64,
/// `CRAWLER_PHPSESSID`; `None` when unset or whitespace-only.
pub phpsessid: Option<String>,
/// `CRAWLER_COOKIE_DOMAIN`; `None` when unset or whitespace-only.
pub cookie_domain: Option<String>,
/// `CRAWLER_USER_AGENT`; `None` when unset or whitespace-only.
pub user_agent: Option<String>,
/// `CRAWLER_PROXY`; `None` when unset or whitespace-only.
pub proxy: Option<String>,
/// Browser launch options, taken from `LaunchOptions::from_env()` in
/// `from_env` and `LaunchOptions::headless()` in `Default`.
pub browser: LaunchOptions,
}
impl Default for CrawlerConfig {
fn default() -> Self {
Self {
daemon_enabled: false,
daily_at: NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
tz: Tz::UTC,
idle_timeout: Duration::from_secs(600),
chapter_workers: 1,
retention_days: 7,
start_url: None,
rate_ms: 1000,
cdn_host: None,
cdn_rate_ms: 1000,
phpsessid: None,
cookie_domain: None,
user_agent: None,
proxy: None,
browser: LaunchOptions::headless(),
}
}
}
impl Config {
@@ -77,10 +131,65 @@ impl Config {
.collect()
})
.unwrap_or_default(),
crawler: CrawlerConfig::from_env()?,
})
}
}
impl CrawlerConfig {
    /// Builds the crawler configuration from the `CRAWLER_*` environment
    /// variables.
    ///
    /// * `CRAWLER_DAILY_AT` / `CRAWLER_TZ` fall back to 00:00 and UTC when
    ///   unset or empty.
    /// * Numeric knobs are read through `env_u64` with the defaults shown
    ///   below.
    /// * Optional string knobs are `None` when the variable is unset or
    ///   whitespace-only; a non-blank value is stored as-is (not trimmed).
    ///
    /// # Errors
    ///
    /// Returns an error when `CRAWLER_DAILY_AT` is set but is not `HH:MM`,
    /// or when `CRAWLER_TZ` is set but does not parse as a `Tz`.
    pub fn from_env() -> anyhow::Result<Self> {
        // Optional string knob: unset and blank both mean "not configured".
        let non_blank =
            |name: &str| std::env::var(name).ok().filter(|s| !s.trim().is_empty());

        // Parse CRAWLER_DAILY_AT (HH:MM, 24h). Invalid → fail fast.
        let daily_at = match std::env::var("CRAWLER_DAILY_AT").ok().as_deref() {
            None | Some("") => NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
            Some(raw) => NaiveTime::parse_from_str(raw, "%H:%M").map_err(|e| {
                anyhow::anyhow!("CRAWLER_DAILY_AT must be HH:MM (got {raw:?}): {e}")
            })?,
        };
        let tz: Tz = match std::env::var("CRAWLER_TZ").ok().as_deref() {
            None | Some("") => Tz::UTC,
            Some(raw) => raw
                .parse()
                .map_err(|e| anyhow::anyhow!("CRAWLER_TZ must be a valid IANA TZ (got {raw:?}): {e}"))?,
        };

        // Clamp before casting so the ">= 1 worker" floor survives the
        // narrowing to `usize` on 32-bit targets (a bare `as` could wrap to 0).
        let chapter_workers =
            env_u64("CRAWLER_CHAPTER_WORKERS", 1).clamp(1, usize::MAX as u64) as usize;

        // Saturate instead of wrapping: with a bare `as u32`, a value such as
        // 4294967296 would silently become 0.
        let retention_days =
            env_u64("CRAWLER_JOB_RETENTION_DAYS", 7).min(u64::from(u32::MAX)) as u32;

        // Read once; the CDN rate inherits this value when it has no override.
        let rate_ms = env_u64("CRAWLER_RATE_MS", 1000);

        Ok(Self {
            daemon_enabled: env_bool("CRAWLER_DAEMON", true),
            daily_at,
            tz,
            idle_timeout: Duration::from_secs(env_u64("CRAWLER_IDLE_TIMEOUT_S", 600)),
            chapter_workers,
            retention_days,
            start_url: non_blank("CRAWLER_START_URL"),
            rate_ms,
            cdn_host: non_blank("CRAWLER_CDN_HOST"),
            cdn_rate_ms: env_u64("CRAWLER_CDN_RATE_MS", rate_ms),
            phpsessid: non_blank("CRAWLER_PHPSESSID"),
            cookie_domain: non_blank("CRAWLER_COOKIE_DOMAIN"),
            user_agent: non_blank("CRAWLER_USER_AGENT"),
            proxy: non_blank("CRAWLER_PROXY"),
            browser: LaunchOptions::from_env(),
        })
    }
}
/// Reads the environment variable `name` as a `u64`, or returns `default`.
///
/// `default` is returned when the variable is unset, is not valid Unicode,
/// or does not parse as a `u64`. Surrounding whitespace is ignored, so a
/// value such as `"600 "` (easy to produce from an env file) is honoured
/// rather than silently replaced by the default.
fn env_u64(name: &str, default: u64) -> u64 {
    std::env::var(name)
        .ok()
        // `str::parse::<u64>` rejects padding, so trim before parsing.
        .and_then(|raw| raw.trim().parse().ok())
        .unwrap_or(default)
}
fn env_bool(name: &str, default: bool) -> bool {
match std::env::var(name).ok().as_deref() {
Some("1") | Some("true") | Some("TRUE") | Some("yes") => true,