feat: incremental crawl mode with seed-completion gate (0.33.0)
Daemon now auto-detects mode per source: Backfill until the first full walk records `seed_completed:<source>` in `crawler_state`, then Incremental (newest-first, stops after N consecutive Unchanged upserts). `CRAWLER_MODE` overrides to a fixed mode; CLI rejects `auto` since it has no pre-run DB state. `Source::discover` returns a lazy `DiscoverWalk` so Incremental can break out mid-walk without prefetching pages. The drop pass and seed marker are now gated on a true full walk — fixes a latent soft-drop of the index tail under partial sweeps. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,16 @@ use chrono::NaiveTime;
|
||||
use chrono_tz::Tz;
|
||||
|
||||
use crate::crawler::browser::LaunchOptions;
|
||||
use crate::crawler::source::DiscoverMode;
|
||||
|
||||
/// What `CRAWLER_MODE` was set to. `Auto` is the daemon's default —
|
||||
/// pick Backfill until `seed_completed_at` is written, then flip to
|
||||
/// Incremental. `Explicit` forces a single mode regardless.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum CrawlerModePref {
|
||||
Auto,
|
||||
Explicit(DiscoverMode),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct AuthConfig {
|
||||
@@ -77,6 +87,12 @@ pub struct CrawlerConfig {
|
||||
pub user_agent: Option<String>,
|
||||
pub proxy: Option<String>,
|
||||
pub browser: LaunchOptions,
|
||||
/// Mode preference for the metadata pass. Daemon default is `Auto`
|
||||
/// (Backfill until `seed_completed_at` is written, then Incremental).
|
||||
pub mode: CrawlerModePref,
|
||||
/// `stop_after_unchanged` threshold supplied to Incremental in both
|
||||
/// `Auto` (post-seed) and `Explicit(Incremental)` modes.
|
||||
pub incremental_stop_after: usize,
|
||||
}
|
||||
|
||||
impl Default for CrawlerConfig {
|
||||
@@ -97,6 +113,8 @@ impl Default for CrawlerConfig {
|
||||
user_agent: None,
|
||||
proxy: None,
|
||||
browser: LaunchOptions::headless(),
|
||||
mode: CrawlerModePref::Auto,
|
||||
incremental_stop_after: 20,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -151,6 +169,9 @@ impl CrawlerConfig {
|
||||
.parse()
|
||||
.map_err(|e| anyhow::anyhow!("CRAWLER_TZ must be a valid IANA TZ (got {raw:?}): {e}"))?,
|
||||
};
|
||||
let incremental_stop_after =
|
||||
env_u64("CRAWLER_INCREMENTAL_STOP_AFTER", 20).max(1) as usize;
|
||||
let mode = parse_mode_env(incremental_stop_after)?;
|
||||
Ok(Self {
|
||||
daemon_enabled: env_bool("CRAWLER_DAEMON", true),
|
||||
daily_at,
|
||||
@@ -179,10 +200,38 @@ impl CrawlerConfig {
|
||||
.ok()
|
||||
.filter(|s| !s.trim().is_empty()),
|
||||
browser: LaunchOptions::from_env(),
|
||||
mode,
|
||||
incremental_stop_after,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse `CRAWLER_MODE`. Empty/unset → `Auto`. Recognized values are
|
||||
/// `auto`, `backfill`, and `incremental` (case-insensitive). Anything
|
||||
/// else is a hard error so a typo can't silently fall through to the
|
||||
/// default and mask itself.
|
||||
fn parse_mode_env(incremental_stop_after: usize) -> anyhow::Result<CrawlerModePref> {
|
||||
parse_mode_str(std::env::var("CRAWLER_MODE").ok().as_deref(), incremental_stop_after)
|
||||
}
|
||||
|
||||
/// Pure variant of [`parse_mode_env`] — testable without env-var
|
||||
/// mutation. Takes the raw value (or `None` if unset).
|
||||
pub(crate) fn parse_mode_str(
|
||||
raw: Option<&str>,
|
||||
incremental_stop_after: usize,
|
||||
) -> anyhow::Result<CrawlerModePref> {
|
||||
match raw.map(|s| s.trim().to_ascii_lowercase()).as_deref() {
|
||||
None | Some("") | Some("auto") => Ok(CrawlerModePref::Auto),
|
||||
Some("backfill") => Ok(CrawlerModePref::Explicit(DiscoverMode::Backfill)),
|
||||
Some("incremental") => Ok(CrawlerModePref::Explicit(DiscoverMode::Incremental {
|
||||
stop_after_unchanged: incremental_stop_after,
|
||||
})),
|
||||
Some(other) => Err(anyhow::anyhow!(
|
||||
"CRAWLER_MODE must be one of: auto, backfill, incremental (got {other:?})"
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn env_u64(name: &str, default: u64) -> u64 {
|
||||
std::env::var(name)
|
||||
.ok()
|
||||
@@ -211,3 +260,63 @@ fn env_usize(name: &str, default: usize) -> usize {
|
||||
.and_then(|s| s.parse().ok())
|
||||
.unwrap_or(default)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Unset, empty, and whitespace-only inputs all resolve to `Auto`.
    #[test]
    fn parse_mode_str_defaults_to_auto_when_unset_or_blank() {
        for raw in [None, Some(""), Some(" ")] {
            let pref = parse_mode_str(raw, 20).unwrap();
            assert!(matches!(pref, CrawlerModePref::Auto));
        }
    }

    /// Each documented keyword maps onto its own preference, and the
    /// threshold argument is carried into the Incremental variant.
    #[test]
    fn parse_mode_str_recognizes_each_keyword() {
        assert!(matches!(
            parse_mode_str(Some("auto"), 20).unwrap(),
            CrawlerModePref::Auto
        ));

        assert!(matches!(
            parse_mode_str(Some("backfill"), 20).unwrap(),
            CrawlerModePref::Explicit(DiscoverMode::Backfill)
        ));

        assert!(matches!(
            parse_mode_str(Some("incremental"), 7).unwrap(),
            CrawlerModePref::Explicit(DiscoverMode::Incremental {
                stop_after_unchanged: 7
            })
        ));
    }

    /// Surrounding whitespace and letter case do not affect matching.
    #[test]
    fn parse_mode_str_is_case_insensitive_and_trims_whitespace() {
        assert!(matches!(
            parse_mode_str(Some(" Incremental "), 5).unwrap(),
            CrawlerModePref::Explicit(DiscoverMode::Incremental {
                stop_after_unchanged: 5
            })
        ));

        assert!(matches!(
            parse_mode_str(Some("BACKFILL"), 5).unwrap(),
            CrawlerModePref::Explicit(DiscoverMode::Backfill)
        ));
    }

    /// A misspelled keyword is rejected, and the error names every
    /// accepted value so the operator can correct it.
    #[test]
    fn parse_mode_str_hard_errors_on_unknown_value() {
        let msg = parse_mode_str(Some("backfil"), 20).unwrap_err().to_string();
        assert!(msg.contains("backfill"), "error should list valid values: {msg}");
        assert!(msg.contains("auto"));
        assert!(msg.contains("incremental"));
    }
}
|
||||
|
||||
Reference in New Issue
Block a user