feat(crawler): single-mode walker gated by recovery flag (0.36.0)

Collapses the crawler to a single newest-first walker and replaces the
N-consecutive-unchanged streak with a per-manga rule: stop on the first
manga where metadata is Unchanged AND chapter sync reports zero new
chapters. The early stop is gated by a per-source recovery flag stored
in `crawler_state` — set to `false` when a run starts, back to `true`
only on a clean exit (end-of-walk or intentional stop). A crashed run
leaves the flag `false` automatically (no shutdown code runs), so the
next tick walks the full catalog instead of bailing at the first
caught-up manga.

This means a crashed mid-walk run self-heals on the next tick: the
flag stays `false`, the next walk visits every page (recovering
anything the crashed run never reached beyond its crash point), and
steady state resumes once the recovery sweep reaches end-of-walk.

Removed:
- DiscoverMode enum, Backfill mode, the boundary re-check +
  displaced-refs machinery in TargetSourceWalker.
- Drop-pass (mark_dropped_mangas) and seed-completion plumbing
  (mark_seed_completed / seed_completed_at). The recovery flag
  subsumes the seed-completion signal; drop detection was explicitly
  opted out of.
- JobPayload::Discover (no production callers).
- CRAWLER_MODE / CRAWLER_INCREMENTAL_STOP_AFTER env vars and the
  CrawlerModePref config type.

`should_mark_clean_exit(walked_to_completion, hit_stop_condition)`
encodes the clean-exit truth table in its signature — `hit_limit` is
deliberately absent so a future edit cannot accidentally count a
caller-imposed cap as a clean exit.

Net -501 lines, 261 backend tests passing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-29 23:49:28 +02:00
parent 33f7e19077
commit 9f56f283d4
15 changed files with 387 additions and 888 deletions

View File

@@ -13,7 +13,7 @@ use tower_http::cors::{AllowOrigin, CorsLayer};
use tower_http::trace::TraceLayer;
use crate::auth::rate_limit::AuthRateLimiter;
use crate::config::{AuthConfig, Config, CrawlerConfig, CrawlerModePref, UploadConfig};
use crate::config::{AuthConfig, Config, CrawlerConfig, UploadConfig};
use crate::crawler::browser_manager::{self, BrowserManager};
use crate::crawler::content::{self, SyncOutcome};
use crate::crawler::daemon::{self, ChapterDispatcher, DaemonConfig, MetadataPass};
@@ -22,7 +22,6 @@ use crate::crawler::pipeline::{self, MetadataStats};
use crate::crawler::rate_limit::HostRateLimiters;
use crate::crawler::safety::DownloadAllowlist;
use crate::crawler::session;
use crate::crawler::source::{target as target_source, DiscoverMode};
use crate::repo;
use crate::storage::{LocalStorage, Storage};
@@ -159,8 +158,6 @@ async fn spawn_crawler_daemon(
http: http.clone(),
rate: Arc::clone(&rate),
start_url: url.clone(),
mode_pref: cfg.mode,
incremental_stop_after: cfg.incremental_stop_after,
download_allowlist: cfg.download_allowlist.clone(),
max_image_bytes: cfg.max_image_bytes,
});
@@ -226,8 +223,6 @@ struct RealMetadataPass {
http: reqwest::Client,
rate: Arc<HostRateLimiters>,
start_url: String,
mode_pref: CrawlerModePref,
incremental_stop_after: usize,
download_allowlist: DownloadAllowlist,
max_image_bytes: usize,
}
@@ -235,13 +230,6 @@ struct RealMetadataPass {
#[async_trait]
impl MetadataPass for RealMetadataPass {
async fn run(&self) -> anyhow::Result<MetadataStats> {
let mode = resolve_mode(
&self.db,
target_source::SOURCE_ID,
self.mode_pref,
self.incremental_stop_after,
)
.await?;
pipeline::run_metadata_pass(
&self.browser_manager,
&self.db,
@@ -251,7 +239,6 @@ impl MetadataPass for RealMetadataPass {
&self.start_url,
0,
false,
mode,
&self.download_allowlist,
self.max_image_bytes,
)
@@ -259,50 +246,6 @@ impl MetadataPass for RealMetadataPass {
}
}
/// Resolve which `DiscoverMode` the current tick should run in.
///
/// * `CrawlerModePref::Explicit(m)` — `m` is returned as-is and the
///   database is never queried.
/// * `CrawlerModePref::Auto` — `repo::crawler::seed_completed_at` is
///   looked up for `source_id`. No marker yields
///   `DiscoverMode::Backfill`; an existing marker yields
///   `DiscoverMode::Incremental` with `stop_after_unchanged` set to
///   `incremental_stop_after`.
///
/// # Errors
///
/// A failed lookup in the `Auto` case is returned to the caller (with
/// added context) instead of being treated as "no marker", so a
/// database error never silently selects Backfill.
async fn resolve_mode(
    db: &PgPool,
    source_id: &str,
    pref: CrawlerModePref,
    incremental_stop_after: usize,
) -> anyhow::Result<DiscoverMode> {
    // An explicit preference wins outright; no lookup is needed.
    if let CrawlerModePref::Explicit(m) = pref {
        tracing::info!(?m, "crawler mode: explicit (CRAWLER_MODE override)");
        return Ok(m);
    }

    // Auto-detection: the presence of a seed marker decides the mode.
    let marker = repo::crawler::seed_completed_at(db, source_id)
        .await
        .context("seed_completed_at lookup for mode auto-detection")?;

    let mode = match marker {
        None => {
            tracing::info!("crawler mode: auto → backfill (no seed marker for source)");
            DiscoverMode::Backfill
        }
        Some(completed) => {
            tracing::info!(
                seed_completed_at = %completed.to_rfc3339(),
                "crawler mode: auto → incremental (seed previously completed)"
            );
            DiscoverMode::Incremental {
                stop_after_unchanged: incremental_stop_after,
            }
        }
    };
    Ok(mode)
}
struct RealChapterDispatcher {
browser_manager: Arc<BrowserManager>,
db: PgPool,
@@ -348,8 +291,8 @@ impl ChapterDispatcher for RealChapterDispatcher {
Ok(outcome)
}
// Other payload kinds aren't dispatched by this daemon yet —
// metadata-driven jobs (Discover/SyncManga/SyncChapterList)
// are handled inline by the cron's metadata pass.
// SyncManga / SyncChapterList are handled inline by the cron's
// metadata pass.
_ => Ok(SyncOutcome::Skipped),
}
}