feat: in-process crawler daemon with cron and worker pool (0.28.0)
The backend now boots an internal crawler daemon that runs a daily metadata pass (CRAWLER_DAILY_AT in CRAWLER_TZ, advisory-lock guarded for multi-replica safety) and drains SyncChapterContent jobs from crawler_jobs through a worker pool. Chromium launches lazily on first job and is torn down after CRAWLER_IDLE_TIMEOUT_S seconds of inactivity.

Modules:
- crawler::browser_manager — lazy-launch / idle-teardown wrapper around browser::Handle, with an on_launch hook that re-injects PHPSESSID on every fresh Chromium spawn.
- crawler::pipeline — run_metadata_pass (the shared discover/upsert/cover/sync-chapters loop) and the enqueue_bookmarked_pending helper used by the cron tick.
- crawler::daemon — cron task + worker pool, behind two trait seams (MetadataPass, ChapterDispatcher) so tests can inject stubs without standing up Chromium or a live source.

Behavior:
- CRAWLER_DAEMON=false skips daemon spawn entirely (default for tests).
- Catch-up tick fires on startup if the last persisted slot was missed.
- A SyncOutcome::SessionExpired sets a sticky AtomicBool; workers idle until operator restart with a refreshed PHPSESSID.
- Worker dispatch wrapped in catch_unwind so a panicking handler marks the job failed instead of taking down the worker.
- Migration 0015 adds a small crawler_state k-v table for the last_metadata_tick_at watermark.

Dep additions: chrono-tz (IANA TZ parsing).

CLI (bin/crawler) reuses pipeline::run_metadata_pass and now holds the browser via BrowserManager so the on_launch session injection flow stays in one place. Inline chapter-content sync semantics are unchanged — the queue is for the daemon; force-refetches and manual backfills still bypass it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,10 @@
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
|
||||
use chrono::NaiveTime;
|
||||
use chrono_tz::Tz;
|
||||
|
||||
use crate::crawler::browser::LaunchOptions;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct AuthConfig {
|
||||
@@ -45,6 +51,54 @@ pub struct Config {
|
||||
pub auth: AuthConfig,
|
||||
pub upload: UploadConfig,
|
||||
pub cors_allowed_origins: Vec<String>,
|
||||
pub crawler: CrawlerConfig,
|
||||
}
|
||||
|
||||
/// All crawler-daemon knobs read from env. Mirrors the env vars the
|
||||
/// `bin/crawler` binary already reads, plus the new daemon-only knobs
|
||||
/// (daily_at, tz, idle_timeout, retention_days, daemon_enabled).
|
||||
///
|
||||
/// `daemon_enabled = false` skips the daemon spawn entirely — used by
|
||||
/// integration tests and dev runs that don't want background activity.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct CrawlerConfig {
|
||||
pub daemon_enabled: bool,
|
||||
pub daily_at: NaiveTime,
|
||||
pub tz: Tz,
|
||||
pub idle_timeout: Duration,
|
||||
pub chapter_workers: usize,
|
||||
pub retention_days: u32,
|
||||
pub start_url: Option<String>,
|
||||
pub rate_ms: u64,
|
||||
pub cdn_host: Option<String>,
|
||||
pub cdn_rate_ms: u64,
|
||||
pub phpsessid: Option<String>,
|
||||
pub cookie_domain: Option<String>,
|
||||
pub user_agent: Option<String>,
|
||||
pub proxy: Option<String>,
|
||||
pub browser: LaunchOptions,
|
||||
}
|
||||
|
||||
impl Default for CrawlerConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
daemon_enabled: false,
|
||||
daily_at: NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
|
||||
tz: Tz::UTC,
|
||||
idle_timeout: Duration::from_secs(600),
|
||||
chapter_workers: 1,
|
||||
retention_days: 7,
|
||||
start_url: None,
|
||||
rate_ms: 1000,
|
||||
cdn_host: None,
|
||||
cdn_rate_ms: 1000,
|
||||
phpsessid: None,
|
||||
cookie_domain: None,
|
||||
user_agent: None,
|
||||
proxy: None,
|
||||
browser: LaunchOptions::headless(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Config {
|
||||
@@ -77,10 +131,65 @@ impl Config {
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default(),
|
||||
crawler: CrawlerConfig::from_env()?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl CrawlerConfig {
|
||||
pub fn from_env() -> anyhow::Result<Self> {
|
||||
// Parse CRAWLER_DAILY_AT (HH:MM, 24h). Invalid → fail fast.
|
||||
let daily_at = match std::env::var("CRAWLER_DAILY_AT").ok().as_deref() {
|
||||
None | Some("") => NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
|
||||
Some(raw) => NaiveTime::parse_from_str(raw, "%H:%M").map_err(|e| {
|
||||
anyhow::anyhow!("CRAWLER_DAILY_AT must be HH:MM (got {raw:?}): {e}")
|
||||
})?,
|
||||
};
|
||||
let tz: Tz = match std::env::var("CRAWLER_TZ").ok().as_deref() {
|
||||
None | Some("") => Tz::UTC,
|
||||
Some(raw) => raw
|
||||
.parse()
|
||||
.map_err(|e| anyhow::anyhow!("CRAWLER_TZ must be a valid IANA TZ (got {raw:?}): {e}"))?,
|
||||
};
|
||||
Ok(Self {
|
||||
daemon_enabled: env_bool("CRAWLER_DAEMON", true),
|
||||
daily_at,
|
||||
tz,
|
||||
idle_timeout: Duration::from_secs(env_u64("CRAWLER_IDLE_TIMEOUT_S", 600)),
|
||||
chapter_workers: env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize,
|
||||
retention_days: env_u64("CRAWLER_JOB_RETENTION_DAYS", 7) as u32,
|
||||
start_url: std::env::var("CRAWLER_START_URL")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty()),
|
||||
rate_ms: env_u64("CRAWLER_RATE_MS", 1000),
|
||||
cdn_host: std::env::var("CRAWLER_CDN_HOST")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty()),
|
||||
cdn_rate_ms: env_u64("CRAWLER_CDN_RATE_MS", env_u64("CRAWLER_RATE_MS", 1000)),
|
||||
phpsessid: std::env::var("CRAWLER_PHPSESSID")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty()),
|
||||
cookie_domain: std::env::var("CRAWLER_COOKIE_DOMAIN")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty()),
|
||||
user_agent: std::env::var("CRAWLER_USER_AGENT")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty()),
|
||||
proxy: std::env::var("CRAWLER_PROXY")
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty()),
|
||||
browser: LaunchOptions::from_env(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads the environment variable `name` as a `u64`, or returns `default`.
///
/// `default` is returned when the variable is unset, is not valid Unicode,
/// or does not parse as an unsigned integer. Surrounding whitespace is
/// stripped before parsing, so a value such as `"600 "` or one with a
/// trailing newline is honoured rather than silently replaced by the default.
fn env_u64(name: &str, default: u64) -> u64 {
    std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse().ok())
        .unwrap_or(default)
}
|
||||
|
||||
fn env_bool(name: &str, default: bool) -> bool {
|
||||
match std::env::var(name).ok().as_deref() {
|
||||
Some("1") | Some("true") | Some("TRUE") | Some("yes") => true,
|
||||
|
||||
Reference in New Issue
Block a user