use std::path::PathBuf; use std::time::Duration; use chrono::NaiveTime; use chrono_tz::Tz; use crate::crawler::browser::LaunchOptions; #[derive(Clone, Debug)] pub struct AuthConfig { pub cookie_secure: bool, pub cookie_domain: Option, pub session_ttl_days: i64, } impl Default for AuthConfig { fn default() -> Self { Self { cookie_secure: true, cookie_domain: None, session_ttl_days: 30, } } } #[derive(Clone, Debug)] pub struct UploadConfig { /// Total request size cap, enforced by axum's DefaultBodyLimit on the /// upload routes. Rejected requests get a 413. pub max_request_bytes: usize, /// Per-image-part size cap, enforced after the part is read. Lets us /// reject a single oversized cover/page without failing the whole /// request just because the total happens to fit. pub max_file_bytes: usize, } impl Default for UploadConfig { fn default() -> Self { Self { max_request_bytes: 200 * 1024 * 1024, // 200 MiB max_file_bytes: 20 * 1024 * 1024, // 20 MiB } } } #[derive(Clone, Debug)] pub struct Config { pub database_url: String, pub bind_address: String, pub storage_dir: PathBuf, pub auth: AuthConfig, pub upload: UploadConfig, pub cors_allowed_origins: Vec, pub crawler: CrawlerConfig, } /// All crawler-daemon knobs read from env. Mirrors the env vars the /// `bin/crawler` binary already reads, plus the new daemon-only knobs /// (daily_at, tz, idle_timeout, retention_days, daemon_enabled). /// /// `daemon_enabled = false` skips the daemon spawn entirely — used by /// integration tests and dev runs that don't want background activity. #[derive(Clone, Debug)] pub struct CrawlerConfig { pub daemon_enabled: bool, pub daily_at: NaiveTime, pub tz: Tz, pub idle_timeout: Duration, pub chapter_workers: usize, pub retention_days: u32, pub start_url: Option, pub rate_ms: u64, pub cdn_host: Option, pub cdn_rate_ms: u64, pub phpsessid: Option, pub cookie_domain: Option, pub user_agent: Option, pub proxy: Option, pub browser: LaunchOptions, } impl Default for CrawlerConfig { fn default() -> Self { Self { daemon_enabled: false, daily_at: NaiveTime::from_hms_opt(0, 0, 0).unwrap(), tz: Tz::UTC, idle_timeout: Duration::from_secs(600), chapter_workers: 1, retention_days: 7, start_url: None, rate_ms: 1000, cdn_host: None, cdn_rate_ms: 1000, phpsessid: None, cookie_domain: None, user_agent: None, proxy: None, browser: LaunchOptions::headless(), } } } impl Config { pub fn from_env() -> anyhow::Result { Ok(Self { database_url: std::env::var("DATABASE_URL") .map_err(|_| anyhow::anyhow!("DATABASE_URL must be set"))?, bind_address: std::env::var("BIND_ADDRESS") .unwrap_or_else(|_| "0.0.0.0:8080".to_string()), storage_dir: std::env::var("STORAGE_DIR") .unwrap_or_else(|_| "./data/storage".to_string()) .into(), auth: AuthConfig { cookie_secure: env_bool("COOKIE_SECURE", true), cookie_domain: std::env::var("COOKIE_DOMAIN") .ok() .filter(|s| !s.is_empty()), session_ttl_days: env_i64("SESSION_TTL_DAYS", 30), }, upload: UploadConfig { max_request_bytes: env_usize("MAX_REQUEST_BYTES", 200 * 1024 * 1024), max_file_bytes: env_usize("MAX_FILE_BYTES", 20 * 1024 * 1024), }, cors_allowed_origins: std::env::var("CORS_ALLOWED_ORIGINS") .ok() .map(|s| { s.split(',') .map(|o| o.trim().to_string()) .filter(|o| !o.is_empty()) .collect() }) .unwrap_or_default(), crawler: CrawlerConfig::from_env()?, }) } } impl CrawlerConfig { pub fn from_env() -> anyhow::Result { // Parse CRAWLER_DAILY_AT (HH:MM, 24h). Invalid → fail fast. let daily_at = match std::env::var("CRAWLER_DAILY_AT").ok().as_deref() { None | Some("") => NaiveTime::from_hms_opt(0, 0, 0).unwrap(), Some(raw) => NaiveTime::parse_from_str(raw, "%H:%M").map_err(|e| { anyhow::anyhow!("CRAWLER_DAILY_AT must be HH:MM (got {raw:?}): {e}") })?, }; let tz: Tz = match std::env::var("CRAWLER_TZ").ok().as_deref() { None | Some("") => Tz::UTC, Some(raw) => raw .parse() .map_err(|e| anyhow::anyhow!("CRAWLER_TZ must be a valid IANA TZ (got {raw:?}): {e}"))?, }; Ok(Self { daemon_enabled: env_bool("CRAWLER_DAEMON", true), daily_at, tz, idle_timeout: Duration::from_secs(env_u64("CRAWLER_IDLE_TIMEOUT_S", 600)), chapter_workers: env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize, retention_days: env_u64("CRAWLER_JOB_RETENTION_DAYS", 7) as u32, start_url: std::env::var("CRAWLER_START_URL") .ok() .filter(|s| !s.trim().is_empty()), rate_ms: env_u64("CRAWLER_RATE_MS", 1000), cdn_host: std::env::var("CRAWLER_CDN_HOST") .ok() .filter(|s| !s.trim().is_empty()), cdn_rate_ms: env_u64("CRAWLER_CDN_RATE_MS", env_u64("CRAWLER_RATE_MS", 1000)), phpsessid: std::env::var("CRAWLER_PHPSESSID") .ok() .filter(|s| !s.trim().is_empty()), cookie_domain: std::env::var("CRAWLER_COOKIE_DOMAIN") .ok() .filter(|s| !s.trim().is_empty()), user_agent: std::env::var("CRAWLER_USER_AGENT") .ok() .filter(|s| !s.trim().is_empty()), proxy: std::env::var("CRAWLER_PROXY") .ok() .filter(|s| !s.trim().is_empty()), browser: LaunchOptions::from_env(), }) } } fn env_u64(name: &str, default: u64) -> u64 { std::env::var(name) .ok() .and_then(|s| s.parse().ok()) .unwrap_or(default) } fn env_bool(name: &str, default: bool) -> bool { match std::env::var(name).ok().as_deref() { Some("1") | Some("true") | Some("TRUE") | Some("yes") => true, Some("0") | Some("false") | Some("FALSE") | Some("no") => false, _ => default, } } fn env_i64(name: &str, default: i64) -> i64 { std::env::var(name) .ok() .and_then(|s| s.parse().ok()) .unwrap_or(default) } fn env_usize(name: &str, default: usize) -> usize { std::env::var(name) .ok() .and_then(|s| s.parse().ok()) .unwrap_or(default) }