Daemon now auto-detects mode per source: Backfill until the first full walk records `seed_completed:<source>` in `crawler_state`, then Incremental (newest-first, stops after N consecutive Unchanged upserts). `CRAWLER_MODE` overrides to a fixed mode; CLI rejects `auto` since it has no pre-run DB state. `Source::discover` returns a lazy `DiscoverWalk` so Incremental can break out mid-walk without prefetching pages. The drop pass and seed marker are now gated on a true full walk — fixes a latent soft-drop of the index tail under partial sweeps. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
323 lines
11 KiB
Rust
323 lines
11 KiB
Rust
use std::path::PathBuf;
|
|
use std::time::Duration;
|
|
|
|
use chrono::NaiveTime;
|
|
use chrono_tz::Tz;
|
|
|
|
use crate::crawler::browser::LaunchOptions;
|
|
use crate::crawler::source::DiscoverMode;
|
|
|
|
/// What `CRAWLER_MODE` was set to. `Auto` is the daemon's default —
|
|
/// pick Backfill until `seed_completed_at` is written, then flip to
|
|
/// Incremental. `Explicit` forces a single mode regardless.
|
|
#[derive(Clone, Copy, Debug)]
|
|
pub enum CrawlerModePref {
|
|
Auto,
|
|
Explicit(DiscoverMode),
|
|
}
|
|
|
|
/// Session-cookie settings, stored in [`Config::auth`].
#[derive(Debug, Clone)]
pub struct AuthConfig {
    /// Populated from `COOKIE_SECURE` by `Config::from_env`; `true` when the
    /// variable is unset or not a recognized boolean.
    pub cookie_secure: bool,
    /// Populated from `COOKIE_DOMAIN`; `None` when the variable is unset or
    /// empty.
    pub cookie_domain: Option<String>,
    /// Populated from `SESSION_TTL_DAYS`; 30 when unset or unparsable.
    pub session_ttl_days: i64,
}

impl Default for AuthConfig {
    /// Secure flag on, no explicit cookie domain, 30-day session TTL.
    fn default() -> Self {
        let cookie_secure = true;
        let cookie_domain = None;
        let session_ttl_days = 30;
        Self {
            cookie_secure,
            cookie_domain,
            session_ttl_days,
        }
    }
}
|
|
|
|
/// Size limits applied to upload requests, stored in [`Config::upload`].
#[derive(Debug, Clone)]
pub struct UploadConfig {
    /// Cap on the size of the whole request. axum's `DefaultBodyLimit`
    /// enforces it on the upload routes; a request over the cap is rejected
    /// with a 413.
    pub max_request_bytes: usize,
    /// Cap on a single image part, checked once the part has been read. This
    /// lets one oversized cover/page be rejected on its own even when the
    /// request total still fits under `max_request_bytes`.
    pub max_file_bytes: usize,
}

impl Default for UploadConfig {
    /// 200 MiB per request, 20 MiB per image part.
    fn default() -> Self {
        const MIB: usize = 1024 * 1024;
        Self {
            max_request_bytes: 200 * MIB,
            max_file_bytes: 20 * MIB,
        }
    }
}
|
|
|
|
#[derive(Clone, Debug)]
|
|
pub struct Config {
|
|
pub database_url: String,
|
|
pub bind_address: String,
|
|
pub storage_dir: PathBuf,
|
|
pub auth: AuthConfig,
|
|
pub upload: UploadConfig,
|
|
pub cors_allowed_origins: Vec<String>,
|
|
pub crawler: CrawlerConfig,
|
|
}
|
|
|
|
/// All crawler-daemon knobs read from env. Mirrors the env vars the
|
|
/// `bin/crawler` binary already reads, plus the new daemon-only knobs
|
|
/// (daily_at, tz, idle_timeout, retention_days, daemon_enabled).
|
|
///
|
|
/// `daemon_enabled = false` skips the daemon spawn entirely — used by
|
|
/// integration tests and dev runs that don't want background activity.
|
|
#[derive(Clone, Debug)]
|
|
pub struct CrawlerConfig {
|
|
pub daemon_enabled: bool,
|
|
pub daily_at: NaiveTime,
|
|
pub tz: Tz,
|
|
pub idle_timeout: Duration,
|
|
pub chapter_workers: usize,
|
|
pub retention_days: u32,
|
|
pub start_url: Option<String>,
|
|
pub rate_ms: u64,
|
|
pub cdn_host: Option<String>,
|
|
pub cdn_rate_ms: u64,
|
|
pub phpsessid: Option<String>,
|
|
pub cookie_domain: Option<String>,
|
|
pub user_agent: Option<String>,
|
|
pub proxy: Option<String>,
|
|
pub browser: LaunchOptions,
|
|
/// Mode preference for the metadata pass. Daemon default is `Auto`
|
|
/// (Backfill until `seed_completed_at` is written, then Incremental).
|
|
pub mode: CrawlerModePref,
|
|
/// `stop_after_unchanged` threshold supplied to Incremental in both
|
|
/// `Auto` (post-seed) and `Explicit(Incremental)` modes.
|
|
pub incremental_stop_after: usize,
|
|
}
|
|
|
|
impl Default for CrawlerConfig {
|
|
fn default() -> Self {
|
|
Self {
|
|
daemon_enabled: false,
|
|
daily_at: NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
|
|
tz: Tz::UTC,
|
|
idle_timeout: Duration::from_secs(600),
|
|
chapter_workers: 1,
|
|
retention_days: 7,
|
|
start_url: None,
|
|
rate_ms: 1000,
|
|
cdn_host: None,
|
|
cdn_rate_ms: 1000,
|
|
phpsessid: None,
|
|
cookie_domain: None,
|
|
user_agent: None,
|
|
proxy: None,
|
|
browser: LaunchOptions::headless(),
|
|
mode: CrawlerModePref::Auto,
|
|
incremental_stop_after: 20,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Config {
|
|
pub fn from_env() -> anyhow::Result<Self> {
|
|
Ok(Self {
|
|
database_url: std::env::var("DATABASE_URL")
|
|
.map_err(|_| anyhow::anyhow!("DATABASE_URL must be set"))?,
|
|
bind_address: std::env::var("BIND_ADDRESS")
|
|
.unwrap_or_else(|_| "0.0.0.0:8080".to_string()),
|
|
storage_dir: std::env::var("STORAGE_DIR")
|
|
.unwrap_or_else(|_| "./data/storage".to_string())
|
|
.into(),
|
|
auth: AuthConfig {
|
|
cookie_secure: env_bool("COOKIE_SECURE", true),
|
|
cookie_domain: std::env::var("COOKIE_DOMAIN")
|
|
.ok()
|
|
.filter(|s| !s.is_empty()),
|
|
session_ttl_days: env_i64("SESSION_TTL_DAYS", 30),
|
|
},
|
|
upload: UploadConfig {
|
|
max_request_bytes: env_usize("MAX_REQUEST_BYTES", 200 * 1024 * 1024),
|
|
max_file_bytes: env_usize("MAX_FILE_BYTES", 20 * 1024 * 1024),
|
|
},
|
|
cors_allowed_origins: std::env::var("CORS_ALLOWED_ORIGINS")
|
|
.ok()
|
|
.map(|s| {
|
|
s.split(',')
|
|
.map(|o| o.trim().to_string())
|
|
.filter(|o| !o.is_empty())
|
|
.collect()
|
|
})
|
|
.unwrap_or_default(),
|
|
crawler: CrawlerConfig::from_env()?,
|
|
})
|
|
}
|
|
}
|
|
|
|
impl CrawlerConfig {
|
|
pub fn from_env() -> anyhow::Result<Self> {
|
|
// Parse CRAWLER_DAILY_AT (HH:MM, 24h). Invalid → fail fast.
|
|
let daily_at = match std::env::var("CRAWLER_DAILY_AT").ok().as_deref() {
|
|
None | Some("") => NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
|
|
Some(raw) => NaiveTime::parse_from_str(raw, "%H:%M").map_err(|e| {
|
|
anyhow::anyhow!("CRAWLER_DAILY_AT must be HH:MM (got {raw:?}): {e}")
|
|
})?,
|
|
};
|
|
let tz: Tz = match std::env::var("CRAWLER_TZ").ok().as_deref() {
|
|
None | Some("") => Tz::UTC,
|
|
Some(raw) => raw
|
|
.parse()
|
|
.map_err(|e| anyhow::anyhow!("CRAWLER_TZ must be a valid IANA TZ (got {raw:?}): {e}"))?,
|
|
};
|
|
let incremental_stop_after =
|
|
env_u64("CRAWLER_INCREMENTAL_STOP_AFTER", 20).max(1) as usize;
|
|
let mode = parse_mode_env(incremental_stop_after)?;
|
|
Ok(Self {
|
|
daemon_enabled: env_bool("CRAWLER_DAEMON", true),
|
|
daily_at,
|
|
tz,
|
|
idle_timeout: Duration::from_secs(env_u64("CRAWLER_IDLE_TIMEOUT_S", 600)),
|
|
chapter_workers: env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize,
|
|
retention_days: env_u64("CRAWLER_JOB_RETENTION_DAYS", 7) as u32,
|
|
start_url: std::env::var("CRAWLER_START_URL")
|
|
.ok()
|
|
.filter(|s| !s.trim().is_empty()),
|
|
rate_ms: env_u64("CRAWLER_RATE_MS", 1000),
|
|
cdn_host: std::env::var("CRAWLER_CDN_HOST")
|
|
.ok()
|
|
.filter(|s| !s.trim().is_empty()),
|
|
cdn_rate_ms: env_u64("CRAWLER_CDN_RATE_MS", env_u64("CRAWLER_RATE_MS", 1000)),
|
|
phpsessid: std::env::var("CRAWLER_PHPSESSID")
|
|
.ok()
|
|
.filter(|s| !s.trim().is_empty()),
|
|
cookie_domain: std::env::var("CRAWLER_COOKIE_DOMAIN")
|
|
.ok()
|
|
.filter(|s| !s.trim().is_empty()),
|
|
user_agent: std::env::var("CRAWLER_USER_AGENT")
|
|
.ok()
|
|
.filter(|s| !s.trim().is_empty()),
|
|
proxy: std::env::var("CRAWLER_PROXY")
|
|
.ok()
|
|
.filter(|s| !s.trim().is_empty()),
|
|
browser: LaunchOptions::from_env(),
|
|
mode,
|
|
incremental_stop_after,
|
|
})
|
|
}
|
|
}
|
|
|
|
/// Parse `CRAWLER_MODE`. Empty/unset → `Auto`. Recognized values are
|
|
/// `auto`, `backfill`, and `incremental` (case-insensitive). Anything
|
|
/// else is a hard error so a typo can't silently fall through to the
|
|
/// default and mask itself.
|
|
fn parse_mode_env(incremental_stop_after: usize) -> anyhow::Result<CrawlerModePref> {
|
|
parse_mode_str(std::env::var("CRAWLER_MODE").ok().as_deref(), incremental_stop_after)
|
|
}
|
|
|
|
/// Pure variant of [`parse_mode_env`] — testable without env-var
|
|
/// mutation. Takes the raw value (or `None` if unset).
|
|
pub(crate) fn parse_mode_str(
|
|
raw: Option<&str>,
|
|
incremental_stop_after: usize,
|
|
) -> anyhow::Result<CrawlerModePref> {
|
|
match raw.map(|s| s.trim().to_ascii_lowercase()).as_deref() {
|
|
None | Some("") | Some("auto") => Ok(CrawlerModePref::Auto),
|
|
Some("backfill") => Ok(CrawlerModePref::Explicit(DiscoverMode::Backfill)),
|
|
Some("incremental") => Ok(CrawlerModePref::Explicit(DiscoverMode::Incremental {
|
|
stop_after_unchanged: incremental_stop_after,
|
|
})),
|
|
Some(other) => Err(anyhow::anyhow!(
|
|
"CRAWLER_MODE must be one of: auto, backfill, incremental (got {other:?})"
|
|
)),
|
|
}
|
|
}
|
|
|
|
/// Reads the environment variable `name` as a `u64`.
///
/// Surrounding whitespace is ignored, so a value carrying padding or a
/// trailing newline still parses. Returns `default` when the variable is
/// unset, is not valid Unicode, or does not parse as a `u64`.
fn env_u64(name: &str, default: u64) -> u64 {
    std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse().ok())
        .unwrap_or(default)
}
|
|
|
|
/// Reads the environment variable `name` as a boolean flag.
///
/// The value is trimmed and compared case-insensitively: `1`, `true`, and
/// `yes` mean `true`; `0`, `false`, and `no` mean `false`. Spellings such as
/// `True`, `FALSE`, or `Yes` are therefore honored rather than silently
/// replaced by the default. Returns `default` when the variable is unset, is
/// not valid Unicode, or holds any other value.
fn env_bool(name: &str, default: bool) -> bool {
    let normalized = match std::env::var(name) {
        Ok(raw) => raw.trim().to_ascii_lowercase(),
        Err(_) => return default,
    };
    match normalized.as_str() {
        "1" | "true" | "yes" => true,
        "0" | "false" | "no" => false,
        _ => default,
    }
}
|
|
|
|
/// Reads the environment variable `name` as an `i64`.
///
/// Surrounding whitespace is ignored, so a value carrying padding or a
/// trailing newline still parses. Returns `default` when the variable is
/// unset, is not valid Unicode, or does not parse as an `i64`.
fn env_i64(name: &str, default: i64) -> i64 {
    std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse().ok())
        .unwrap_or(default)
}
|
|
|
|
/// Reads the environment variable `name` as a `usize`.
///
/// Surrounding whitespace is ignored, so a value carrying padding or a
/// trailing newline still parses. Returns `default` when the variable is
/// unset, is not valid Unicode, or does not parse as a `usize`.
fn env_usize(name: &str, default: usize) -> usize {
    std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse().ok())
        .unwrap_or(default)
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses a raw `CRAWLER_MODE` value that the test expects to be accepted.
    fn parsed(raw: Option<&str>, stop_after: usize) -> CrawlerModePref {
        parse_mode_str(raw, stop_after).unwrap()
    }

    #[test]
    fn parse_mode_str_defaults_to_auto_when_unset_or_blank() {
        // Unset, empty, and whitespace-only all resolve to `Auto`.
        for raw in [None, Some(""), Some(" ")] {
            assert!(matches!(parsed(raw, 20), CrawlerModePref::Auto));
        }
    }

    #[test]
    fn parse_mode_str_recognizes_each_keyword() {
        assert!(matches!(parsed(Some("auto"), 20), CrawlerModePref::Auto));

        assert!(matches!(
            parsed(Some("backfill"), 20),
            CrawlerModePref::Explicit(DiscoverMode::Backfill)
        ));

        // The threshold passed in must surface as `stop_after_unchanged`.
        assert!(matches!(
            parsed(Some("incremental"), 7),
            CrawlerModePref::Explicit(DiscoverMode::Incremental {
                stop_after_unchanged: 7
            })
        ));
    }

    #[test]
    fn parse_mode_str_is_case_insensitive_and_trims_whitespace() {
        assert!(matches!(
            parsed(Some(" Incremental "), 5),
            CrawlerModePref::Explicit(DiscoverMode::Incremental {
                stop_after_unchanged: 5
            })
        ));

        assert!(matches!(
            parsed(Some("BACKFILL"), 5),
            CrawlerModePref::Explicit(DiscoverMode::Backfill)
        ));
    }

    #[test]
    fn parse_mode_str_hard_errors_on_unknown_value() {
        // A near-miss typo must be rejected, and the message must list every
        // accepted keyword.
        let msg = parse_mode_str(Some("backfil"), 20)
            .unwrap_err()
            .to_string();
        assert!(msg.contains("backfill"), "error should list valid values: {msg}");
        for keyword in ["auto", "incremental"] {
            assert!(msg.contains(keyword));
        }
    }
}
|