use std::path::PathBuf;
use std::time::Duration;

use chrono::NaiveTime;
use chrono_tz::Tz;

use crate::crawler::browser::LaunchOptions;
use crate::crawler::safety::{DownloadAllowlist, DEFAULT_MAX_IMAGE_BYTES};
use crate::crawler::source::DiscoverMode;

/// What `CRAWLER_MODE` was set to. `Auto` is the daemon's default —
/// pick Backfill until `seed_completed_at` is written, then flip to
/// Incremental. `Explicit` forces a single mode regardless.
#[derive(Clone, Copy, Debug)]
pub enum CrawlerModePref {
    Auto,
    Explicit(DiscoverMode),
}

/// Authentication / session-cookie settings.
///
/// `Config::from_env` fills these from `COOKIE_SECURE`, `COOKIE_DOMAIN`
/// and `SESSION_TTL_DAYS`.
#[derive(Clone, Debug)]
pub struct AuthConfig {
    /// Read from `COOKIE_SECURE`; defaults to `true`.
    // NOTE(review): presumably drives the cookie `Secure` attribute —
    // confirm at the use site.
    pub cookie_secure: bool,
    /// Read from `COOKIE_DOMAIN`; `None` when unset or empty.
    pub cookie_domain: Option<String>,
    /// Read from `SESSION_TTL_DAYS`; defaults to 30.
    pub session_ttl_days: i64,
}

impl Default for AuthConfig {
    fn default() -> Self {
        Self {
            cookie_secure: true,
            cookie_domain: None,
            session_ttl_days: 30,
        }
    }
}

/// Size limits applied to upload requests.
#[derive(Clone, Debug)]
pub struct UploadConfig {
    /// Total request size cap, enforced by axum's DefaultBodyLimit on the
    /// upload routes. Rejected requests get a 413.
    pub max_request_bytes: usize,
    /// Per-image-part size cap, enforced after the part is read. Lets us
    /// reject a single oversized cover/page without failing the whole
    /// request just because the total happens to fit.
    pub max_file_bytes: usize,
}

impl Default for UploadConfig {
    fn default() -> Self {
        Self {
            max_request_bytes: 200 * 1024 * 1024, // 200 MiB
            max_file_bytes: 20 * 1024 * 1024,     // 20 MiB
        }
    }
}

/// Top-level application configuration, assembled by `Config::from_env`.
#[derive(Clone, Debug)]
pub struct Config {
    /// Read from `DATABASE_URL`; required.
    pub database_url: String,
    /// Read from `BIND_ADDRESS`; defaults to `0.0.0.0:8080`.
    pub bind_address: String,
    /// Read from `STORAGE_DIR`; defaults to `./data/storage`.
    pub storage_dir: PathBuf,
    pub auth: AuthConfig,
    pub upload: UploadConfig,
    /// Comma-separated `CORS_ALLOWED_ORIGINS`, trimmed, with empty
    /// entries dropped. Empty when the variable is unset.
    pub cors_allowed_origins: Vec<String>,
    pub crawler: CrawlerConfig,
}

/// All crawler-daemon knobs read from env. Mirrors the env vars the
/// `bin/crawler` binary already reads, plus the new daemon-only knobs
/// (daily_at, tz, idle_timeout, retention_days, daemon_enabled).
///
/// `daemon_enabled = false` skips the daemon spawn entirely — used by
/// integration tests and dev runs that don't want background activity.
#[derive(Clone, Debug)]
pub struct CrawlerConfig {
    /// `CRAWLER_DAEMON`; `from_env` defaults it to `true`.
    pub daemon_enabled: bool,
    /// `CRAWLER_DAILY_AT` (`HH:MM`, 24h); defaults to 00:00.
    pub daily_at: NaiveTime,
    /// `CRAWLER_TZ` (IANA name); defaults to UTC.
    pub tz: Tz,
    /// `CRAWLER_IDLE_TIMEOUT_S`, in seconds; defaults to 600.
    pub idle_timeout: Duration,
    /// `CRAWLER_CHAPTER_WORKERS`; defaults to 1 and is never below 1.
    pub chapter_workers: usize,
    /// `CRAWLER_JOB_RETENTION_DAYS`; defaults to 7.
    pub retention_days: u32,
    /// `CRAWLER_START_URL`; `None` when unset or blank.
    pub start_url: Option<String>,
    /// `CRAWLER_RATE_MS`; defaults to 1000.
    pub rate_ms: u64,
    /// `CRAWLER_CDN_HOST`; `None` when unset or blank.
    pub cdn_host: Option<String>,
    /// `CRAWLER_CDN_RATE_MS`; falls back to the `CRAWLER_RATE_MS` value.
    pub cdn_rate_ms: u64,
    /// `CRAWLER_PHPSESSID`; `None` when unset or blank.
    pub phpsessid: Option<String>,
    /// `CRAWLER_COOKIE_DOMAIN`; `None` when unset or blank.
    pub cookie_domain: Option<String>,
    /// `CRAWLER_USER_AGENT`; `None` when unset or blank.
    pub user_agent: Option<String>,
    /// `CRAWLER_PROXY`; `None` when unset or blank.
    pub proxy: Option<String>,
    pub browser: LaunchOptions,
    /// Mode preference for the metadata pass. Daemon default is `Auto`
    /// (Backfill until `seed_completed_at` is written, then Incremental).
    pub mode: CrawlerModePref,
    /// `stop_after_unchanged` threshold supplied to Incremental in both
    /// `Auto` (post-seed) and `Explicit(Incremental)` modes.
    pub incremental_stop_after: usize,
    /// Hosts the crawler is allowed to download images / covers from.
    /// Always seeded with the host of `start_url` and (when set) the
    /// configured `cdn_host`. Additional hosts can be added via
    /// `CRAWLER_DOWNLOAD_ALLOWLIST` (comma-separated).
    pub download_allowlist: DownloadAllowlist,
    /// Hard upper bound on a single image download. Defaults to 32 MiB.
    pub max_image_bytes: usize,
}

impl Default for CrawlerConfig {
    /// Defaults that do not consult the environment. Note that
    /// `daemon_enabled` is `false` here, whereas `from_env` defaults it
    /// to `true`.
    fn default() -> Self {
        Self {
            daemon_enabled: false,
            daily_at: NaiveTime::from_hms_opt(0, 0, 0).expect("00:00:00 is a valid time"),
            tz: Tz::UTC,
            idle_timeout: Duration::from_secs(600),
            chapter_workers: 1,
            retention_days: 7,
            start_url: None,
            rate_ms: 1000,
            cdn_host: None,
            cdn_rate_ms: 1000,
            phpsessid: None,
            cookie_domain: None,
            user_agent: None,
            proxy: None,
            browser: LaunchOptions::headless(),
            mode: CrawlerModePref::Auto,
            incremental_stop_after: 20,
            download_allowlist: DownloadAllowlist::new(),
            max_image_bytes: DEFAULT_MAX_IMAGE_BYTES,
        }
    }
}

impl Config {
    /// Build the application configuration from environment variables.
    ///
    /// # Errors
    ///
    /// Fails when `DATABASE_URL` is unset, or when
    /// [`CrawlerConfig::from_env`] rejects one of the crawler settings.
    pub fn from_env() -> anyhow::Result<Self> {
        // Fall back to the `Default` impls so each default value lives in
        // exactly one place.
        let auth_defaults = AuthConfig::default();
        let upload_defaults = UploadConfig::default();

        let database_url = std::env::var("DATABASE_URL")
            .map_err(|_| anyhow::anyhow!("DATABASE_URL must be set"))?;
        let bind_address =
            std::env::var("BIND_ADDRESS").unwrap_or_else(|_| "0.0.0.0:8080".to_string());
        let storage_dir: PathBuf = std::env::var("STORAGE_DIR")
            .unwrap_or_else(|_| "./data/storage".to_string())
            .into();

        let auth = AuthConfig {
            cookie_secure: env_bool("COOKIE_SECURE", auth_defaults.cookie_secure),
            cookie_domain: std::env::var("COOKIE_DOMAIN")
                .ok()
                .filter(|s| !s.is_empty()),
            session_ttl_days: env_i64("SESSION_TTL_DAYS", auth_defaults.session_ttl_days),
        };
        let upload = UploadConfig {
            max_request_bytes: env_usize("MAX_REQUEST_BYTES", upload_defaults.max_request_bytes),
            max_file_bytes: env_usize("MAX_FILE_BYTES", upload_defaults.max_file_bytes),
        };

        // Comma-separated list; entries are trimmed and blanks dropped.
        let cors_allowed_origins = std::env::var("CORS_ALLOWED_ORIGINS")
            .map(|raw| {
                raw.split(',')
                    .map(str::trim)
                    .filter(|origin| !origin.is_empty())
                    .map(str::to_owned)
                    .collect::<Vec<String>>()
            })
            .unwrap_or_default();

        Ok(Self {
            database_url,
            bind_address,
            storage_dir,
            auth,
            upload,
            cors_allowed_origins,
            crawler: CrawlerConfig::from_env()?,
        })
    }
}

impl CrawlerConfig {
    /// Build the crawler configuration from `CRAWLER_*` environment
    /// variables.
    ///
    /// # Errors
    ///
    /// Fails when `CRAWLER_DAILY_AT` is not `HH:MM`, when `CRAWLER_TZ`
    /// does not parse as a time zone, or when `CRAWLER_MODE` is rejected
    /// by `parse_mode_env`. The numeric and boolean knobs go through the
    /// `env_*` helpers and do not produce errors here.
    pub fn from_env() -> anyhow::Result<Self> {
        // Optional string settings: unset and whitespace-only both mean
        // "not configured". The stored value is NOT trimmed.
        let non_blank = |name: &str| -> Option<String> {
            std::env::var(name).ok().filter(|v| !v.trim().is_empty())
        };

        // Parse CRAWLER_DAILY_AT (HH:MM, 24h). Invalid → fail fast.
        let daily_at = match std::env::var("CRAWLER_DAILY_AT").ok().as_deref() {
            None | Some("") => {
                NaiveTime::from_hms_opt(0, 0, 0).expect("00:00:00 is a valid time")
            }
            Some(raw) => NaiveTime::parse_from_str(raw, "%H:%M").map_err(|e| {
                anyhow::anyhow!("CRAWLER_DAILY_AT must be HH:MM (got {raw:?}): {e}")
            })?,
        };

        let tz: Tz = match std::env::var("CRAWLER_TZ").ok().as_deref() {
            None | Some("") => Tz::UTC,
            Some(raw) => raw.parse().map_err(|e| {
                anyhow::anyhow!("CRAWLER_TZ must be a valid IANA TZ (got {raw:?}): {e}")
            })?,
        };

        // Clamped to at least 1. (`u64 as usize` is lossless on 64-bit
        // targets.)
        let incremental_stop_after =
            env_u64("CRAWLER_INCREMENTAL_STOP_AFTER", 20).max(1) as usize;
        let mode = parse_mode_env(incremental_stop_after)?;

        let start_url = non_blank("CRAWLER_START_URL");
        let cdn_host = non_blank("CRAWLER_CDN_HOST");
        let download_allowlist =
            build_download_allowlist(start_url.as_deref(), cdn_host.as_deref());

        // Read once; also serves as the fallback for the CDN rate.
        let rate_ms = env_u64("CRAWLER_RATE_MS", 1000);

        // Saturate rather than wrap: a plain `as u32` would turn an
        // oversized value into an arbitrarily small retention window.
        let retention_days =
            env_u64("CRAWLER_JOB_RETENTION_DAYS", 7).min(u64::from(u32::MAX)) as u32;

        Ok(Self {
            daemon_enabled: env_bool("CRAWLER_DAEMON", true),
            daily_at,
            tz,
            idle_timeout: Duration::from_secs(env_u64("CRAWLER_IDLE_TIMEOUT_S", 600)),
            chapter_workers: env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize,
            retention_days,
            start_url,
            rate_ms,
            cdn_host,
            cdn_rate_ms: env_u64("CRAWLER_CDN_RATE_MS", rate_ms),
            phpsessid: non_blank("CRAWLER_PHPSESSID"),
            cookie_domain: non_blank("CRAWLER_COOKIE_DOMAIN"),
            user_agent: non_blank("CRAWLER_USER_AGENT"),
            proxy: non_blank("CRAWLER_PROXY"),
            browser: LaunchOptions::from_env(),
            mode,
            incremental_stop_after,
            download_allowlist,
            max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES),
        })
    }
}

/// Build the download allowlist from env.
Always includes /// `CRAWLER_START_URL`'s host (so the crawler can fetch covers from /// the catalog itself) and `CRAWLER_CDN_HOST` when set. Additional /// hosts can be supplied via `CRAWLER_DOWNLOAD_ALLOWLIST` (comma- /// separated). Empty by default — meaning the crawler refuses to /// download anything when no source is configured, which is the safe /// fail-closed posture. fn build_download_allowlist( start_url: Option<&str>, cdn_host: Option<&str>, ) -> DownloadAllowlist { let mut allow = DownloadAllowlist::new(); if let Some(url) = start_url { if let Ok(parsed) = reqwest::Url::parse(url) { if let Some(h) = parsed.host_str() { allow = allow.allow(h); } } } if let Some(host) = cdn_host { allow = allow.allow(host); } if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") { for piece in extras.split(',') { let trimmed = piece.trim(); if !trimmed.is_empty() { allow = allow.allow(trimmed); } } } allow } /// Parse `CRAWLER_MODE`. Empty/unset → `Auto`. Recognized values are /// `auto`, `backfill`, and `incremental` (case-insensitive). Anything /// else is a hard error so a typo can't silently fall through to the /// default and mask itself. fn parse_mode_env(incremental_stop_after: usize) -> anyhow::Result { parse_mode_str(std::env::var("CRAWLER_MODE").ok().as_deref(), incremental_stop_after) } /// Pure variant of [`parse_mode_env`] — testable without env-var /// mutation. Takes the raw value (or `None` if unset). 
pub(crate) fn parse_mode_str( raw: Option<&str>, incremental_stop_after: usize, ) -> anyhow::Result { match raw.map(|s| s.trim().to_ascii_lowercase()).as_deref() { None | Some("") | Some("auto") => Ok(CrawlerModePref::Auto), Some("backfill") => Ok(CrawlerModePref::Explicit(DiscoverMode::Backfill)), Some("incremental") => Ok(CrawlerModePref::Explicit(DiscoverMode::Incremental { stop_after_unchanged: incremental_stop_after, })), Some(other) => Err(anyhow::anyhow!( "CRAWLER_MODE must be one of: auto, backfill, incremental (got {other:?})" )), } } fn env_u64(name: &str, default: u64) -> u64 { std::env::var(name) .ok() .and_then(|s| s.parse().ok()) .unwrap_or(default) } fn env_bool(name: &str, default: bool) -> bool { match std::env::var(name).ok().as_deref() { Some("1") | Some("true") | Some("TRUE") | Some("yes") => true, Some("0") | Some("false") | Some("FALSE") | Some("no") => false, _ => default, } } fn env_i64(name: &str, default: i64) -> i64 { std::env::var(name) .ok() .and_then(|s| s.parse().ok()) .unwrap_or(default) } fn env_usize(name: &str, default: usize) -> usize { std::env::var(name) .ok() .and_then(|s| s.parse().ok()) .unwrap_or(default) } #[cfg(test)] mod tests { use super::*; #[test] fn parse_mode_str_defaults_to_auto_when_unset_or_blank() { let none = parse_mode_str(None, 20).unwrap(); assert!(matches!(none, CrawlerModePref::Auto)); let blank = parse_mode_str(Some(""), 20).unwrap(); assert!(matches!(blank, CrawlerModePref::Auto)); let whitespace = parse_mode_str(Some(" "), 20).unwrap(); assert!(matches!(whitespace, CrawlerModePref::Auto)); } #[test] fn parse_mode_str_recognizes_each_keyword() { let auto = parse_mode_str(Some("auto"), 20).unwrap(); assert!(matches!(auto, CrawlerModePref::Auto)); let backfill = parse_mode_str(Some("backfill"), 20).unwrap(); assert!(matches!( backfill, CrawlerModePref::Explicit(DiscoverMode::Backfill) )); let incremental = parse_mode_str(Some("incremental"), 7).unwrap(); assert!(matches!( incremental, 
CrawlerModePref::Explicit(DiscoverMode::Incremental { stop_after_unchanged: 7 }) )); } #[test] fn parse_mode_str_is_case_insensitive_and_trims_whitespace() { let mixed = parse_mode_str(Some(" Incremental "), 5).unwrap(); assert!(matches!( mixed, CrawlerModePref::Explicit(DiscoverMode::Incremental { stop_after_unchanged: 5 }) )); let upper = parse_mode_str(Some("BACKFILL"), 5).unwrap(); assert!(matches!( upper, CrawlerModePref::Explicit(DiscoverMode::Backfill) )); } #[test] fn parse_mode_str_hard_errors_on_unknown_value() { let err = parse_mode_str(Some("backfil"), 20).unwrap_err(); let msg = format!("{err}"); assert!(msg.contains("backfill"), "error should list valid values: {msg}"); assert!(msg.contains("auto")); assert!(msg.contains("incremental")); } }