// Mangalord/backend/src/config.rs

use std::path::PathBuf;
use std::time::Duration;

use chrono::NaiveTime;
use chrono_tz::Tz;

use crate::crawler::browser::LaunchOptions;
use crate::crawler::safety::{DownloadAllowlist, DEFAULT_MAX_IMAGE_BYTES};
use crate::crawler::source::DiscoverMode;

/// What `CRAWLER_MODE` was set to. `Auto` is the daemon's default —
/// pick Backfill until `seed_completed_at` is written, then flip to
/// Incremental. `Explicit` forces a single mode regardless.
///
/// Produced by `parse_mode_str`, which maps an unset/blank value or
/// `auto` to [`CrawlerModePref::Auto`] and `backfill` / `incremental`
/// to [`CrawlerModePref::Explicit`].
#[derive(Clone, Copy, Debug)]
pub enum CrawlerModePref {
    /// No mode was forced; the choice between Backfill and Incremental
    /// is left to the consumer of this preference.
    Auto,
    /// Exactly this [`DiscoverMode`] was requested via `CRAWLER_MODE`.
    Explicit(DiscoverMode),
}

/// Cookie / session settings. `Config::from_env` fills these from
/// `COOKIE_SECURE`, `COOKIE_DOMAIN` and `SESSION_TTL_DAYS`.
#[derive(Clone, Debug)]
pub struct AuthConfig {
    /// Value of `COOKIE_SECURE`; `true` unless overridden.
    /// NOTE(review): presumably drives the cookie's `Secure` attribute —
    /// confirm at the call site.
    pub cookie_secure: bool,
    /// Value of `COOKIE_DOMAIN`, or `None` when unset or empty.
    pub cookie_domain: Option<String>,
    /// Value of `SESSION_TTL_DAYS`; 30 unless overridden.
    pub session_ttl_days: i64,
}

impl Default for AuthConfig {
    /// Secure cookies, no explicit cookie domain, 30-day sessions.
    fn default() -> Self {
        let cookie_secure = true;
        let session_ttl_days = 30;
        AuthConfig {
            cookie_secure,
            cookie_domain: None,
            session_ttl_days,
        }
    }
}

/// Size limits applied to uploads. `Config::from_env` fills these from
/// `MAX_REQUEST_BYTES` and `MAX_FILE_BYTES`.
#[derive(Clone, Debug)]
pub struct UploadConfig {
    /// Cap on the size of the whole request, enforced by axum's
    /// DefaultBodyLimit on the upload routes. Requests over the cap are
    /// rejected with a 413.
    pub max_request_bytes: usize,
    /// Cap on each individual image part, checked once the part has been
    /// read. This rejects one oversized cover/page even when the request
    /// as a whole still fits under `max_request_bytes`.
    pub max_file_bytes: usize,
}

impl Default for UploadConfig {
    /// 200 MiB per request, 20 MiB per image part.
    fn default() -> Self {
        const MIB: usize = 1024 * 1024;
        let per_request = 200 * MIB;
        let per_file = 20 * MIB;
        UploadConfig {
            max_request_bytes: per_request,
            max_file_bytes: per_file,
        }
    }
}

/// Top-level configuration, assembled from environment variables by
/// [`Config::from_env`].
#[derive(Clone, Debug)]
pub struct Config {
    /// Value of `DATABASE_URL`. Required: `from_env` errors when it is
    /// missing.
    pub database_url: String,
    /// Value of `BIND_ADDRESS`; `0.0.0.0:8080` when unset.
    pub bind_address: String,
    /// Value of `STORAGE_DIR`; `./data/storage` when unset.
    pub storage_dir: PathBuf,
    /// Cookie / session settings.
    pub auth: AuthConfig,
    /// Upload size limits.
    pub upload: UploadConfig,
    /// Entries of the comma-separated `CORS_ALLOWED_ORIGINS`, trimmed,
    /// with blank entries dropped. Empty when the variable is unset.
    pub cors_allowed_origins: Vec<String>,
    /// Crawler settings, read by `CrawlerConfig::from_env`.
    pub crawler: CrawlerConfig,
}

/// All crawler-daemon knobs read from env. Mirrors the env vars the
/// `bin/crawler` binary already reads, plus the new daemon-only knobs
/// (daily_at, tz, idle_timeout, retention_days, daemon_enabled).
///
/// `daemon_enabled = false` skips the daemon spawn entirely — used by
/// integration tests and dev runs that don't want background activity.
#[derive(Clone, Debug)]
pub struct CrawlerConfig {
    /// `CRAWLER_DAEMON`. `from_env` defaults this to `true`, whereas
    /// `Default::default()` yields `false`.
    pub daemon_enabled: bool,
    /// `CRAWLER_DAILY_AT`, parsed as 24h `HH:MM`; midnight when unset or
    /// empty.
    pub daily_at: NaiveTime,
    /// `CRAWLER_TZ`; UTC when unset or empty.
    /// NOTE(review): presumably the zone `daily_at` is interpreted in —
    /// confirm in the scheduler.
    pub tz: Tz,
    /// `CRAWLER_IDLE_TIMEOUT_S`, in seconds; 600 by default.
    pub idle_timeout: Duration,
    /// `CRAWLER_CHAPTER_WORKERS`; defaults to 1 and is never below 1.
    pub chapter_workers: usize,
    /// `CRAWLER_JOB_RETENTION_DAYS`; 7 by default.
    pub retention_days: u32,
    /// `CRAWLER_START_URL`, or `None` when unset or blank.
    pub start_url: Option<String>,
    /// `CRAWLER_RATE_MS`; 1000 by default.
    pub rate_ms: u64,
    /// `CRAWLER_CDN_HOST`, or `None` when unset or blank.
    pub cdn_host: Option<String>,
    /// `CRAWLER_CDN_RATE_MS`; falls back to the `CRAWLER_RATE_MS` value
    /// when not set.
    pub cdn_rate_ms: u64,
    /// `CRAWLER_PHPSESSID`, or `None` when unset or blank.
    pub phpsessid: Option<String>,
    /// `CRAWLER_COOKIE_DOMAIN`, or `None` when unset or blank.
    pub cookie_domain: Option<String>,
    /// `CRAWLER_USER_AGENT`, or `None` when unset or blank.
    pub user_agent: Option<String>,
    /// `CRAWLER_PROXY`, or `None` when unset or blank.
    pub proxy: Option<String>,
    /// Browser launch options: `LaunchOptions::from_env()` in
    /// `from_env`, `LaunchOptions::headless()` in `Default`.
    pub browser: LaunchOptions,
    /// Mode preference for the metadata pass. Daemon default is `Auto`
    /// (Backfill until `seed_completed_at` is written, then Incremental).
    pub mode: CrawlerModePref,
    /// `stop_after_unchanged` threshold supplied to Incremental in both
    /// `Auto` (post-seed) and `Explicit(Incremental)` modes.
    /// Read from `CRAWLER_INCREMENTAL_STOP_AFTER`; defaults to 20 and is
    /// never below 1.
    pub incremental_stop_after: usize,
    /// Hosts the crawler is allowed to download images / covers from.
    /// Always seeded with the host of `start_url` and (when set) the
    /// configured `cdn_host`. Additional hosts can be added via
    /// `CRAWLER_DOWNLOAD_ALLOWLIST` (comma-separated).
    pub download_allowlist: DownloadAllowlist,
    /// Hard upper bound on a single image download. Defaults to 32 MiB.
    /// Overridable via `CRAWLER_MAX_IMAGE_BYTES`.
    pub max_image_bytes: usize,
}

impl Default for CrawlerConfig {
    fn default() -> Self {
        Self {
            daemon_enabled: false,
            daily_at: NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
            tz: Tz::UTC,
            idle_timeout: Duration::from_secs(600),
            chapter_workers: 1,
            retention_days: 7,
            start_url: None,
            rate_ms: 1000,
            cdn_host: None,
            cdn_rate_ms: 1000,
            phpsessid: None,
            cookie_domain: None,
            user_agent: None,
            proxy: None,
            browser: LaunchOptions::headless(),
            mode: CrawlerModePref::Auto,
            incremental_stop_after: 20,
            download_allowlist: DownloadAllowlist::new(),
            max_image_bytes: DEFAULT_MAX_IMAGE_BYTES,
        }
    }
}

impl Config {
    pub fn from_env() -> anyhow::Result<Self> {
        Ok(Self {
            database_url: std::env::var("DATABASE_URL")
                .map_err(|_| anyhow::anyhow!("DATABASE_URL must be set"))?,
            bind_address: std::env::var("BIND_ADDRESS")
                .unwrap_or_else(|_| "0.0.0.0:8080".to_string()),
            storage_dir: std::env::var("STORAGE_DIR")
                .unwrap_or_else(|_| "./data/storage".to_string())
                .into(),
            auth: AuthConfig {
                cookie_secure: env_bool("COOKIE_SECURE", true),
                cookie_domain: std::env::var("COOKIE_DOMAIN")
                    .ok()
                    .filter(|s| !s.is_empty()),
                session_ttl_days: env_i64("SESSION_TTL_DAYS", 30),
            },
            upload: UploadConfig {
                max_request_bytes: env_usize("MAX_REQUEST_BYTES", 200 * 1024 * 1024),
                max_file_bytes: env_usize("MAX_FILE_BYTES", 20 * 1024 * 1024),
            },
            cors_allowed_origins: std::env::var("CORS_ALLOWED_ORIGINS")
                .ok()
                .map(|s| {
                    s.split(',')
                        .map(|o| o.trim().to_string())
                        .filter(|o| !o.is_empty())
                        .collect()
                })
                .unwrap_or_default(),
            crawler: CrawlerConfig::from_env()?,
        })
    }
}

impl CrawlerConfig {
    pub fn from_env() -> anyhow::Result<Self> {
        // Parse CRAWLER_DAILY_AT (HH:MM, 24h). Invalid → fail fast.
        let daily_at = match std::env::var("CRAWLER_DAILY_AT").ok().as_deref() {
            None | Some("") => NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
            Some(raw) => NaiveTime::parse_from_str(raw, "%H:%M").map_err(|e| {
                anyhow::anyhow!("CRAWLER_DAILY_AT must be HH:MM (got {raw:?}): {e}")
            })?,
        };
        let tz: Tz = match std::env::var("CRAWLER_TZ").ok().as_deref() {
            None | Some("") => Tz::UTC,
            Some(raw) => raw
                .parse()
                .map_err(|e| anyhow::anyhow!("CRAWLER_TZ must be a valid IANA TZ (got {raw:?}): {e}"))?,
        };
        let incremental_stop_after =
            env_u64("CRAWLER_INCREMENTAL_STOP_AFTER", 20).max(1) as usize;
        let mode = parse_mode_env(incremental_stop_after)?;
        let start_url = std::env::var("CRAWLER_START_URL")
            .ok()
            .filter(|s| !s.trim().is_empty());
        let cdn_host = std::env::var("CRAWLER_CDN_HOST")
            .ok()
            .filter(|s| !s.trim().is_empty());
        let download_allowlist =
            build_download_allowlist(start_url.as_deref(), cdn_host.as_deref());
        Ok(Self {
            daemon_enabled: env_bool("CRAWLER_DAEMON", true),
            daily_at,
            tz,
            idle_timeout: Duration::from_secs(env_u64("CRAWLER_IDLE_TIMEOUT_S", 600)),
            chapter_workers: env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize,
            retention_days: env_u64("CRAWLER_JOB_RETENTION_DAYS", 7) as u32,
            start_url,
            rate_ms: env_u64("CRAWLER_RATE_MS", 1000),
            cdn_host,
            cdn_rate_ms: env_u64("CRAWLER_CDN_RATE_MS", env_u64("CRAWLER_RATE_MS", 1000)),
            phpsessid: std::env::var("CRAWLER_PHPSESSID")
                .ok()
                .filter(|s| !s.trim().is_empty()),
            cookie_domain: std::env::var("CRAWLER_COOKIE_DOMAIN")
                .ok()
                .filter(|s| !s.trim().is_empty()),
            user_agent: std::env::var("CRAWLER_USER_AGENT")
                .ok()
                .filter(|s| !s.trim().is_empty()),
            proxy: std::env::var("CRAWLER_PROXY")
                .ok()
                .filter(|s| !s.trim().is_empty()),
            browser: LaunchOptions::from_env(),
            mode,
            incremental_stop_after,
            download_allowlist,
            max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES),
        })
    }
}

/// Assemble the set of hosts the crawler may download from.
///
/// Hosts are added in this order:
/// 1. the host of `start_url`, when it parses as a URL that has one
///    (so covers served by the catalog itself can be fetched);
/// 2. `cdn_host`, when given;
/// 3. every non-blank, trimmed entry of the comma-separated
///    `CRAWLER_DOWNLOAD_ALLOWLIST` environment variable.
///
/// With nothing configured the list stays empty. That is the intended
/// fail-closed posture: no configured source means nothing may be
/// downloaded.
fn build_download_allowlist(
    start_url: Option<&str>,
    cdn_host: Option<&str>,
) -> DownloadAllowlist {
    let mut allowlist = DownloadAllowlist::new();

    // A start URL that fails to parse, or has no host, contributes nothing.
    let catalog_host = start_url
        .and_then(|raw| reqwest::Url::parse(raw).ok())
        .and_then(|url| url.host_str().map(str::to_owned));
    if let Some(host) = catalog_host.as_deref() {
        allowlist = allowlist.allow(host);
    }

    if let Some(cdn) = cdn_host {
        allowlist = allowlist.allow(cdn);
    }

    // An unset variable behaves like an empty list.
    let extra_hosts = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST").unwrap_or_default();
    for host in extra_hosts
        .split(',')
        .map(str::trim)
        .filter(|entry| !entry.is_empty())
    {
        allowlist = allowlist.allow(host);
    }

    allowlist
}

/// Read `CRAWLER_MODE` from the environment and interpret it.
///
/// Unset or empty means `Auto`. The accepted keywords are `auto`,
/// `backfill` and `incremental`, in any letter case. Every other value
/// is rejected with an error, so a misspelling is reported instead of
/// quietly behaving like the default.
fn parse_mode_env(incremental_stop_after: usize) -> anyhow::Result<CrawlerModePref> {
    let configured = std::env::var("CRAWLER_MODE").ok();
    parse_mode_str(configured.as_deref(), incremental_stop_after)
}

/// Environment-free core of [`parse_mode_env`], so it can be tested
/// without touching process env vars.
///
/// `raw` is the configured value, or `None` when the variable is unset.
/// It is trimmed and lower-cased before matching.
/// `incremental_stop_after` becomes the `stop_after_unchanged` of an
/// explicit Incremental mode.
pub(crate) fn parse_mode_str(
    raw: Option<&str>,
    incremental_stop_after: usize,
) -> anyhow::Result<CrawlerModePref> {
    // Unset collapses to the empty keyword, which is handled together
    // with a blank value below.
    let keyword = raw
        .map(|value| value.trim().to_ascii_lowercase())
        .unwrap_or_default();

    let preference = match keyword.as_str() {
        "" | "auto" => CrawlerModePref::Auto,
        "backfill" => CrawlerModePref::Explicit(DiscoverMode::Backfill),
        "incremental" => CrawlerModePref::Explicit(DiscoverMode::Incremental {
            stop_after_unchanged: incremental_stop_after,
        }),
        other => {
            return Err(anyhow::anyhow!(
                "CRAWLER_MODE must be one of: auto, backfill, incremental (got {other:?})"
            ));
        }
    };
    Ok(preference)
}

/// Read the environment variable `name` as a `u64`.
///
/// Surrounding whitespace is ignored, so a padded value or one with a
/// trailing newline still parses. Returns `default` when the variable is
/// unset, is not valid Unicode, or does not parse as a `u64`.
fn env_u64(name: &str, default: u64) -> u64 {
    std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse().ok())
        .unwrap_or(default)
}

/// Read the environment variable `name` as a boolean flag.
///
/// `1`, `true` and `yes` mean `true`; `0`, `false` and `no` mean `false`.
/// Matching ignores ASCII letter case and surrounding whitespace, so
/// `True`, `YES` or a value with a trailing newline are understood
/// rather than silently treated as unset. Returns `default` when the
/// variable is unset, is not valid Unicode, or holds anything else.
fn env_bool(name: &str, default: bool) -> bool {
    let raw = match std::env::var(name) {
        Ok(value) => value,
        Err(_) => return default,
    };
    match raw.trim().to_ascii_lowercase().as_str() {
        "1" | "true" | "yes" => true,
        "0" | "false" | "no" => false,
        _ => default,
    }
}

/// Read the environment variable `name` as an `i64`.
///
/// Surrounding whitespace is ignored, so a padded value or one with a
/// trailing newline still parses. Returns `default` when the variable is
/// unset, is not valid Unicode, or does not parse as an `i64`.
fn env_i64(name: &str, default: i64) -> i64 {
    std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse().ok())
        .unwrap_or(default)
}

/// Read the environment variable `name` as a `usize`.
///
/// Surrounding whitespace is ignored, so a padded value or one with a
/// trailing newline still parses. Returns `default` when the variable is
/// unset, is not valid Unicode, or does not parse as a `usize`.
fn env_usize(name: &str, default: usize) -> usize {
    std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse().ok())
        .unwrap_or(default)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `raw` and unwrap, so each test reads as input → expectation.
    fn mode_for(raw: Option<&str>, stop_after: usize) -> CrawlerModePref {
        parse_mode_str(raw, stop_after).unwrap()
    }

    // Unset, empty and whitespace-only values all select the default.
    #[test]
    fn parse_mode_str_defaults_to_auto_when_unset_or_blank() {
        for raw in [None, Some(""), Some("   ")] {
            assert!(matches!(mode_for(raw, 20), CrawlerModePref::Auto));
        }
    }

    // Each keyword maps to its own preference; the threshold is carried
    // into the Incremental variant.
    #[test]
    fn parse_mode_str_recognizes_each_keyword() {
        assert!(matches!(mode_for(Some("auto"), 20), CrawlerModePref::Auto));

        assert!(matches!(
            mode_for(Some("backfill"), 20),
            CrawlerModePref::Explicit(DiscoverMode::Backfill)
        ));

        assert!(matches!(
            mode_for(Some("incremental"), 7),
            CrawlerModePref::Explicit(DiscoverMode::Incremental {
                stop_after_unchanged: 7
            })
        ));
    }

    // Letter case and surrounding whitespace are ignored.
    #[test]
    fn parse_mode_str_is_case_insensitive_and_trims_whitespace() {
        assert!(matches!(
            mode_for(Some("  Incremental  "), 5),
            CrawlerModePref::Explicit(DiscoverMode::Incremental {
                stop_after_unchanged: 5
            })
        ));
        assert!(matches!(
            mode_for(Some("BACKFILL"), 5),
            CrawlerModePref::Explicit(DiscoverMode::Backfill)
        ));
    }

    // A misspelled keyword is an error whose message names every valid
    // value.
    #[test]
    fn parse_mode_str_hard_errors_on_unknown_value() {
        let msg = parse_mode_str(Some("backfil"), 20)
            .unwrap_err()
            .to_string();
        assert!(msg.contains("backfill"), "error should list valid values: {msg}");
        for keyword in ["auto", "incremental"] {
            assert!(msg.contains(keyword));
        }
    }
}