feat: incremental crawl mode with seed-completion gate (0.33.0)
Daemon now auto-detects mode per source: Backfill until the first full walk records `seed_completed:<source>` in `crawler_state`, then Incremental (newest-first, stops after N consecutive Unchanged upserts). `CRAWLER_MODE` overrides to a fixed mode; CLI rejects `auto` since it has no pre-run DB state. `Source::discover` returns a lazy `DiscoverWalk` so Incremental can break out mid-walk without prefetching pages. The drop pass and seed marker are now gated on a true full walk — fixes a latent soft-drop of the index tail under partial sweeps. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use tower_http::cors::{AllowOrigin, CorsLayer};
|
||||
use tower_http::trace::TraceLayer;
|
||||
|
||||
use crate::config::{AuthConfig, Config, CrawlerConfig, UploadConfig};
|
||||
use crate::config::{AuthConfig, Config, CrawlerConfig, CrawlerModePref, UploadConfig};
|
||||
use crate::crawler::browser_manager::{self, BrowserManager};
|
||||
use crate::crawler::content::{self, SyncOutcome};
|
||||
use crate::crawler::daemon::{self, ChapterDispatcher, DaemonConfig, MetadataPass};
|
||||
@@ -20,6 +20,8 @@ use crate::crawler::jobs::JobPayload;
|
||||
use crate::crawler::pipeline::{self, MetadataStats};
|
||||
use crate::crawler::rate_limit::HostRateLimiters;
|
||||
use crate::crawler::session;
|
||||
use crate::crawler::source::{target as target_source, DiscoverMode};
|
||||
use crate::repo;
|
||||
use crate::storage::{LocalStorage, Storage};
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -149,6 +151,8 @@ async fn spawn_crawler_daemon(
|
||||
http: http.clone(),
|
||||
rate: Arc::clone(&rate),
|
||||
start_url: url.clone(),
|
||||
mode_pref: cfg.mode,
|
||||
incremental_stop_after: cfg.incremental_stop_after,
|
||||
});
|
||||
m
|
||||
});
|
||||
@@ -210,11 +214,20 @@ struct RealMetadataPass {
|
||||
http: reqwest::Client,
|
||||
rate: Arc<HostRateLimiters>,
|
||||
start_url: String,
|
||||
mode_pref: CrawlerModePref,
|
||||
incremental_stop_after: usize,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl MetadataPass for RealMetadataPass {
|
||||
async fn run(&self) -> anyhow::Result<MetadataStats> {
|
||||
let mode = resolve_mode(
|
||||
&self.db,
|
||||
target_source::SOURCE_ID,
|
||||
self.mode_pref,
|
||||
self.incremental_stop_after,
|
||||
)
|
||||
.await?;
|
||||
pipeline::run_metadata_pass(
|
||||
&self.browser_manager,
|
||||
&self.db,
|
||||
@@ -224,11 +237,56 @@ impl MetadataPass for RealMetadataPass {
|
||||
&self.start_url,
|
||||
0,
|
||||
false,
|
||||
mode,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Pick the active mode for this tick. `Explicit` short-circuits the
|
||||
/// DB lookup. `Auto` reads `seed_completed_at`: missing → Backfill
|
||||
/// (initial seed for this source), present → Incremental with the
|
||||
/// configured threshold.
|
||||
///
|
||||
/// A DB error during the Auto lookup propagates as `Err` rather than
|
||||
/// silently degrading to Backfill — the daemon's `run_tick` catches
|
||||
/// the error, logs, and skips the tick. That's safer than running a
|
||||
/// full re-backfill (including a drop pass against stale-looking rows)
|
||||
/// when the DB is flaky.
|
||||
async fn resolve_mode(
|
||||
db: &PgPool,
|
||||
source_id: &str,
|
||||
pref: CrawlerModePref,
|
||||
incremental_stop_after: usize,
|
||||
) -> anyhow::Result<DiscoverMode> {
|
||||
match pref {
|
||||
CrawlerModePref::Explicit(m) => {
|
||||
tracing::info!(?m, "crawler mode: explicit (CRAWLER_MODE override)");
|
||||
Ok(m)
|
||||
}
|
||||
CrawlerModePref::Auto => {
|
||||
let seeded = repo::crawler::seed_completed_at(db, source_id)
|
||||
.await
|
||||
.context("seed_completed_at lookup for mode auto-detection")?;
|
||||
match seeded {
|
||||
Some(at) => {
|
||||
tracing::info!(
|
||||
seed_completed_at = %at.to_rfc3339(),
|
||||
"crawler mode: auto → incremental (seed previously completed)"
|
||||
);
|
||||
Ok(DiscoverMode::Incremental {
|
||||
stop_after_unchanged: incremental_stop_after,
|
||||
})
|
||||
}
|
||||
None => {
|
||||
tracing::info!("crawler mode: auto → backfill (no seed marker for source)");
|
||||
Ok(DiscoverMode::Backfill)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct RealChapterDispatcher {
|
||||
browser_manager: Arc<BrowserManager>,
|
||||
db: PgPool,
|
||||
|
||||
Reference in New Issue
Block a user