feat(crawler): single-mode walker gated by recovery flag (0.36.0)
Collapses the crawler to a single newest-first walker and replaces the N-consecutive-unchanged streak with a per-manga rule: stop on the first manga where metadata is Unchanged AND chapter sync reports zero new chapters.

The early stop is gated by a per-source recovery flag stored in `crawler_state` — set to `false` when a run starts, back to `true` only on a clean exit (end-of-walk or intentional stop). A crashed run leaves the flag `false` automatically (no shutdown code runs), so the next tick walks the full catalog instead of bailing at the first caught-up manga. This means a crashed mid-walk run self-heals on the next tick: the flag stays `false`, the next walk visits every page (recovering anything the crash missed past its crash point), and steady state resumes once the recovery sweep reaches end-of-walk.

Removed:
- DiscoverMode enum, Backfill mode, the boundary re-check + displaced-refs machinery in TargetSourceWalker.
- Drop-pass (mark_dropped_mangas) and seed-completion plumbing (mark_seed_completed / seed_completed_at). The recovery flag subsumes the seed-completion signal; drop detection was explicitly opted out.
- JobPayload::Discover (no production callers).
- CRAWLER_MODE / CRAWLER_INCREMENTAL_STOP_AFTER env vars and the CrawlerModePref config type.

`should_mark_clean_exit(walked_to_completion, hit_stop_condition)` encodes the clean-exit truth table in its signature — `hit_limit` is deliberately absent so a future edit cannot accidentally count a caller-imposed cap as a clean exit.

Net -501 lines, 261 backend tests passing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -13,8 +13,9 @@ use crate::crawler::jobs::{self, EnqueueResult, JobPayload};
|
||||
use crate::crawler::rate_limit::HostRateLimiters;
|
||||
use crate::crawler::safety::{fetch_bytes_capped, looks_like_image, DownloadAllowlist};
|
||||
use crate::crawler::source::target::TargetSource;
|
||||
use crate::crawler::source::{DiscoverMode, FetchContext, Source};
|
||||
use crate::crawler::source::{FetchContext, Source};
|
||||
use crate::repo;
|
||||
use crate::repo::crawler::UpsertStatus;
|
||||
use crate::storage::Storage;
|
||||
|
||||
/// Coarse counters surfaced for logging at the end of a metadata pass.
|
||||
@@ -26,16 +27,42 @@ pub struct MetadataStats {
|
||||
pub mangas_failed: usize,
|
||||
}
|
||||
|
||||
/// Decide whether the per-ref loop should stop based on the Incremental
|
||||
/// streak counter. Pulled out as a pure function so the rule is unit-
|
||||
/// testable without standing up the walker or DB.
|
||||
pub(crate) fn should_stop(mode: DiscoverMode, consecutive_unchanged: usize) -> bool {
|
||||
match mode {
|
||||
DiscoverMode::Backfill => false,
|
||||
DiscoverMode::Incremental { stop_after_unchanged } => {
|
||||
consecutive_unchanged >= stop_after_unchanged
|
||||
}
|
||||
}
|
||||
/// Decide whether the per-ref loop should stop on the manga just
|
||||
/// processed. The walk halts only when (a) the previous run exited
|
||||
/// cleanly — so the index tail is known to be caught up and we're not
|
||||
/// in a recovery sweep — AND (b) this manga's metadata hash matched
|
||||
/// storage (`Unchanged`) AND (c) the chapter sync confirmed zero new
|
||||
/// chapters. A `None` chapter count (skip_chapters, or a chapter-sync
|
||||
/// error we logged-and-swallowed) refuses the stop because we can't
|
||||
/// verify the tail is unchanged from a single piece of evidence.
|
||||
///
|
||||
/// Pure function so the rule is unit-testable without the walker, DB,
|
||||
/// or browser.
|
||||
pub(crate) fn should_stop(
|
||||
was_clean: bool,
|
||||
status: UpsertStatus,
|
||||
chapters_new: Option<usize>,
|
||||
) -> bool {
|
||||
was_clean
|
||||
&& matches!(status, UpsertStatus::Unchanged)
|
||||
&& chapters_new == Some(0)
|
||||
}
|
||||
|
||||
/// Decide whether the walk that just ended counts as a clean exit.
///
/// When this returns `true` the caller writes the recovery flag back to
/// `completed: true`. When it returns `false` the flag stays `false`,
/// so the next tick treats this run like a crashed one and performs a
/// recovery sweep.
///
/// A clean exit is either reaching end-of-walk (`walked_to_completion`)
/// or the intentional early stop (`hit_stop_condition`).
///
/// `hit_limit` (the caller-imposed `CRAWLER_LIMIT` cap) is deliberately
/// *not* a parameter: a limit cap does not reach the catalog tail, so it
/// can never count as a clean exit. Leaving it out of the signature
/// (instead of writing an inline `&& !hit_limit`) keeps a later edit
/// from accidentally folding it back into the truth table.
pub(crate) fn should_mark_clean_exit(
    walked_to_completion: bool,
    hit_stop_condition: bool,
) -> bool {
    // Spelled out as a truth table: only "neither signal fired" is dirty.
    match (walked_to_completion, hit_stop_condition) {
        (false, false) => false,
        _ => true,
    }
}
|
||||
|
||||
/// Runs the discover → fetch → upsert → cover → chapter-list-diff pipeline
|
||||
@@ -45,15 +72,25 @@ pub(crate) fn should_stop(mode: DiscoverMode, consecutive_unchanged: usize) -> b
|
||||
/// `limit == 0` means no cap (full sweep up to the source's own bound).
|
||||
/// `skip_chapters == true` is the "metadata-only" mode (parser doesn't
|
||||
/// extract chapters, and `sync_manga_chapters` is skipped — otherwise an
|
||||
/// empty chapter list would soft-drop existing rows).
|
||||
/// empty chapter list would soft-drop existing rows). In this mode the
|
||||
/// stop condition never fires because chapter freshness can't be
|
||||
/// confirmed, so the walk always runs to end-of-source.
|
||||
///
|
||||
/// `mode` controls the walk:
|
||||
/// - `Backfill` — oldest-first, no early exit. The only mode that runs
|
||||
/// the end-of-walk drop pass + writes `seed_completed_at`.
|
||||
/// - `Incremental { stop_after_unchanged }` — newest-first, breaks out
|
||||
/// after N consecutive Unchanged upserts. Drop pass is skipped (the
|
||||
/// tail of the index is never visited, so its `last_seen_at` is
|
||||
/// stale and using it to soft-drop would be unsafe).
|
||||
/// The walk is always newest-first. Steady-state runs stop on the first
|
||||
/// manga where metadata is `Unchanged` AND chapter sync reports zero
|
||||
/// new chapters — the source orders by `update_date DESC`, so anything
|
||||
/// with a fresh chapter or fresh metadata is bumped to the top and will
|
||||
/// be processed before we hit a fully-caught-up manga.
|
||||
///
|
||||
/// A per-source recovery flag stored in `crawler_state`
|
||||
/// (`last_run_completed:<source_id>`) gates the early stop: it's set to
|
||||
/// `false` right after `ensure_source` and back to `true` only when the
|
||||
/// run exits via end-of-walk OR the intentional stop. A crash, panic,
|
||||
/// or SIGKILL leaves the flag at `false`, so the next tick reads it,
|
||||
/// recognizes the previous run did not exit cleanly, and walks the
|
||||
/// full catalog (ignoring the stop condition) to re-cover anything the
|
||||
/// crashed run missed past its crash point. Once that recovery sweep
|
||||
/// reaches end-of-walk, steady-state resumes.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn run_metadata_pass(
|
||||
browser_manager: &BrowserManager,
|
||||
@@ -64,7 +101,6 @@ pub async fn run_metadata_pass(
|
||||
start_url: &str,
|
||||
limit: usize,
|
||||
skip_chapters: bool,
|
||||
mode: DiscoverMode,
|
||||
allowlist: &DownloadAllowlist,
|
||||
max_image_bytes: usize,
|
||||
) -> anyhow::Result<MetadataStats> {
|
||||
@@ -97,28 +133,36 @@ pub async fn run_metadata_pass(
|
||||
.await
|
||||
.context("ensure_source")?;
|
||||
|
||||
let run_started_at = chrono::Utc::now();
|
||||
// Read BEFORE flipping to "in-flight" — a `false` here means the
|
||||
// previous run didn't reach a clean exit, and this run must walk
|
||||
// the full catalog (recovery sweep) instead of bailing on the
|
||||
// first caught-up manga.
|
||||
let was_clean = repo::crawler::last_run_completed_cleanly(db, source_id)
|
||||
.await
|
||||
.context("read last_run_completed_cleanly")?;
|
||||
repo::crawler::mark_run_started(db, source_id)
|
||||
.await
|
||||
.context("mark_run_started")?;
|
||||
|
||||
let max_refs = (limit > 0).then_some(limit);
|
||||
|
||||
tracing::info!(?mode, ?max_refs, "starting metadata pass");
|
||||
tracing::info!(was_clean, ?max_refs, "starting metadata pass");
|
||||
let mut walker = source
|
||||
.discover(&ctx, mode)
|
||||
.discover(&ctx)
|
||||
.await
|
||||
.context("discover failed")?;
|
||||
|
||||
let mut stats = MetadataStats::default();
|
||||
// Run-scoped dedup of `source_manga_key`s already processed this pass.
|
||||
// Backfill: the walker may append displaced refs that also appear on
|
||||
// the page we're about to visit naturally; skipping the dup avoids
|
||||
// redundant fetch_manga + upsert. Incremental: a shift causes the
|
||||
// slot-last item of the page we just read to reappear at slot 0 of
|
||||
// the next page; skipping it preserves the consecutive_unchanged
|
||||
// streak math instead of inflating it with a re-confirm.
|
||||
// A shift in the source index causes the slot-last item of the page
|
||||
// we just read to reappear at slot 0 of the next page; skipping it
|
||||
// here prevents redundant fetch_manga + upsert and avoids spuriously
|
||||
// tripping the stop condition with a re-confirm of an entry we
|
||||
// already counted.
|
||||
let mut seen: HashSet<String> = HashSet::new();
|
||||
let mut consecutive_unchanged: usize = 0;
|
||||
let mut walked_to_completion = false;
|
||||
let mut hit_limit = false;
|
||||
let mut hit_incremental_stop = false;
|
||||
let mut hit_stop_condition = false;
|
||||
|
||||
'outer: loop {
|
||||
let batch = match walker.next_batch(&ctx).await? {
|
||||
@@ -137,13 +181,13 @@ pub async fn run_metadata_pass(
|
||||
// Skip refs we've already *successfully* processed this pass.
|
||||
// Checking `contains` here (rather than `insert`) keeps the key
|
||||
// out of `seen` on failure paths below, so a transient fetch or
|
||||
// upsert error gets a second chance if the ref reappears via the
|
||||
// backfill boundary re-check or another batch. Done *before*
|
||||
// counting toward `stats.discovered` (the skipped ref did no
|
||||
// work) and *before* touching `consecutive_unchanged` (a
|
||||
// `continue` here preserves the streak rather than resetting or
|
||||
// inflating it). The matching `seen.insert(...)` lives just
|
||||
// after the successful upsert below.
|
||||
// upsert error gets a second chance if the ref reappears in
|
||||
// another batch. Done *before* counting toward
|
||||
// `stats.discovered` (the skipped ref did no work) and *before*
|
||||
// touching the stop check (a `continue` here doesn't let a
|
||||
// re-confirm trip the stop condition). The matching
|
||||
// `seen.insert(...)` lives just after the successful upsert
|
||||
// below.
|
||||
if seen.contains(&r.source_manga_key) {
|
||||
tracing::debug!(
|
||||
key = %r.source_manga_key,
|
||||
@@ -230,7 +274,13 @@ pub async fn run_metadata_pass(
|
||||
}
|
||||
}
|
||||
|
||||
if !skip_chapters {
|
||||
// Chapter sync. `chapters_new` feeds the stop check below:
|
||||
// `None` (skip_chapters mode, or a logged-and-swallowed sync
|
||||
// error) refuses to stop on this manga because we can't
|
||||
// confirm "no new chapters."
|
||||
let chapters_new: Option<usize> = if skip_chapters {
|
||||
None
|
||||
} else {
|
||||
match repo::crawler::sync_manga_chapters(
|
||||
db,
|
||||
source_id,
|
||||
@@ -239,79 +289,64 @@ pub async fn run_metadata_pass(
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(diff) => tracing::info!(
|
||||
manga_id = %upsert.manga_id,
|
||||
new = diff.new,
|
||||
refreshed = diff.refreshed,
|
||||
dropped = diff.dropped,
|
||||
"chapters synced"
|
||||
),
|
||||
Err(e) => tracing::warn!(
|
||||
manga_id = %upsert.manga_id,
|
||||
error = ?e,
|
||||
"chapter sync failed"
|
||||
),
|
||||
Ok(diff) => {
|
||||
tracing::info!(
|
||||
manga_id = %upsert.manga_id,
|
||||
new = diff.new,
|
||||
refreshed = diff.refreshed,
|
||||
dropped = diff.dropped,
|
||||
"chapters synced"
|
||||
);
|
||||
Some(diff.new)
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
manga_id = %upsert.manga_id,
|
||||
error = ?e,
|
||||
"chapter sync failed"
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Incremental stop: count consecutive Unchanged upserts and
|
||||
// bail once the threshold is reached. New/Updated resets the
|
||||
// streak so a fresh entry mid-batch doesn't accidentally trip
|
||||
// the cutoff.
|
||||
match upsert.status {
|
||||
repo::crawler::UpsertStatus::Unchanged => {
|
||||
consecutive_unchanged += 1;
|
||||
}
|
||||
repo::crawler::UpsertStatus::New | repo::crawler::UpsertStatus::Updated => {
|
||||
consecutive_unchanged = 0;
|
||||
}
|
||||
}
|
||||
if should_stop(mode, consecutive_unchanged) {
|
||||
hit_incremental_stop = true;
|
||||
if should_stop(was_clean, upsert.status, chapters_new) {
|
||||
hit_stop_condition = true;
|
||||
tracing::info!(
|
||||
consecutive_unchanged,
|
||||
"incremental stop threshold reached; halting walk"
|
||||
key = %manga.source_manga_key,
|
||||
"stop condition met (Unchanged metadata + 0 new chapters); halting walk"
|
||||
);
|
||||
break 'outer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Drop pass: only when the walk truly covered everything the source
|
||||
// surfaces. `last_seen_at` on un-visited rows is stale, so running
|
||||
// the drop on a partial walk would soft-drop the tail of the index.
|
||||
let full_walk = walked_to_completion && !hit_limit && !hit_incremental_stop;
|
||||
let backfill_complete = full_walk && matches!(mode, DiscoverMode::Backfill);
|
||||
if full_walk {
|
||||
match repo::crawler::mark_dropped_mangas(db, source_id, run_started_at).await {
|
||||
Ok(n) => tracing::info!(dropped = n, "marked unseen manga as dropped"),
|
||||
Err(e) => tracing::warn!(error = ?e, "drop-pass failed"),
|
||||
}
|
||||
} else {
|
||||
tracing::info!(
|
||||
?mode,
|
||||
hit_limit,
|
||||
hit_incremental_stop,
|
||||
"partial sync — skipping drop pass"
|
||||
);
|
||||
}
|
||||
if backfill_complete {
|
||||
if let Err(e) = repo::crawler::mark_seed_completed(db, source_id, run_started_at).await {
|
||||
tracing::warn!(error = ?e, "mark_seed_completed failed");
|
||||
} else {
|
||||
tracing::info!(source_id, "seed marked complete");
|
||||
// Recovery-flag write. Only on a clean exit (end-of-walk OR the
|
||||
// intentional stop). `hit_limit` is a caller-imposed early break
|
||||
// and does NOT count — the catalog tail wasn't reached, so a future
|
||||
// tick still needs to walk past where we stopped. The truth table is
|
||||
// pinned by `should_mark_clean_exit` so a future edit that adds
|
||||
// `hit_limit` back into the disjunction trips its unit test. Flag-
|
||||
// write errors are warned and swallowed: the run already did its
|
||||
// work, and a stale `false` flag just buys a recovery sweep on the
|
||||
// next tick.
|
||||
let exited_cleanly = should_mark_clean_exit(walked_to_completion, hit_stop_condition);
|
||||
if exited_cleanly {
|
||||
if let Err(e) = repo::crawler::mark_run_completed(db, source_id).await {
|
||||
tracing::warn!(error = ?e, "mark_run_completed failed");
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
?mode,
|
||||
was_clean,
|
||||
discovered = stats.discovered,
|
||||
upserted = stats.upserted,
|
||||
covers_fetched = stats.covers_fetched,
|
||||
mangas_failed = stats.mangas_failed,
|
||||
walked_to_completion,
|
||||
hit_limit,
|
||||
hit_incremental_stop,
|
||||
hit_stop_condition,
|
||||
exited_cleanly,
|
||||
"metadata pass complete"
|
||||
);
|
||||
|
||||
@@ -508,31 +543,79 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn backfill_never_stops_regardless_of_streak() {
|
||||
assert!(!should_stop(DiscoverMode::Backfill, 0));
|
||||
assert!(!should_stop(DiscoverMode::Backfill, 100));
|
||||
assert!(!should_stop(DiscoverMode::Backfill, usize::MAX));
|
||||
fn stop_condition_fires_on_unchanged_metadata_and_zero_new_chapters() {
|
||||
// The whole point of the rule: in steady state, a manga whose
|
||||
// metadata hash matches AND whose chapter list gained no new
|
||||
// entries proves we've reached the caught-up tail of a
|
||||
// newest-first index.
|
||||
assert!(should_stop(true, UpsertStatus::Unchanged, Some(0)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn incremental_stops_when_streak_meets_threshold() {
|
||||
let mode = DiscoverMode::Incremental {
|
||||
stop_after_unchanged: 3,
|
||||
};
|
||||
assert!(!should_stop(mode, 0));
|
||||
assert!(!should_stop(mode, 2));
|
||||
assert!(should_stop(mode, 3), "stops at exactly the threshold");
|
||||
assert!(should_stop(mode, 100), "stops at anything past threshold");
|
||||
fn stop_condition_refuses_when_chapters_added() {
|
||||
// Unchanged metadata + N new chapters means the source bumped
|
||||
// this manga because of the chapter add; the rest of the index
|
||||
// is still ahead of us. Don't bail.
|
||||
assert!(!should_stop(true, UpsertStatus::Unchanged, Some(1)));
|
||||
assert!(!should_stop(true, UpsertStatus::Unchanged, Some(42)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn incremental_with_zero_threshold_stops_immediately() {
|
||||
// A nonsensical config (no Unchanged needed to stop) shouldn't
|
||||
// panic — it just means the very first ref triggers the bail.
|
||||
let mode = DiscoverMode::Incremental {
|
||||
stop_after_unchanged: 0,
|
||||
};
|
||||
assert!(should_stop(mode, 0));
|
||||
fn stop_condition_refuses_when_metadata_changed() {
|
||||
// Updated or New metadata always continues — even with zero new
|
||||
// chapters — because the change-of-metadata bump itself is what
|
||||
// the walk is following.
|
||||
assert!(!should_stop(true, UpsertStatus::Updated, Some(0)));
|
||||
assert!(!should_stop(true, UpsertStatus::New, Some(0)));
|
||||
}
|
||||
|
||||
#[test]
fn stop_condition_refuses_when_chapter_count_unknown() {
    // No chapter count is available in skip_chapters mode (the CLI
    // metadata-only sweep) or after a chapter-sync error that was
    // logged and swallowed. Absence of evidence is not "zero new
    // chapters", so the rule must not fire; an operator running
    // metadata-only wants the full walk in any case.
    let unknown: Option<usize> = None;
    assert!(!should_stop(true, UpsertStatus::Unchanged, unknown));
}
|
||||
|
||||
#[test]
fn stop_condition_disabled_in_recovery_mode() {
    // `was_clean == false`: the previous run did not exit cleanly, so
    // anything past its crash point may be un-synced. Whatever an
    // individual manga reports, the walk must continue to
    // end-of-source.
    let cases = [
        (UpsertStatus::Unchanged, Some(0)),
        (UpsertStatus::Unchanged, Some(1)),
        (UpsertStatus::Updated, Some(0)),
        (UpsertStatus::New, None),
    ];
    for (status, chapters_new) in cases {
        assert!(!should_stop(false, status, chapters_new));
    }
}
|
||||
|
||||
#[test]
fn clean_exit_when_walked_to_completion() {
    // Reaching end-of-walk means the catalog tail was visited, so the
    // recovery flag can safely go back to `true`.
    let clean = should_mark_clean_exit(true, false);
    assert!(clean);
}
|
||||
|
||||
#[test]
fn clean_exit_when_stop_condition_fired() {
    // Halting on the first Unchanged + 0-new-chapter manga is a
    // complete steady-state exit: everything newer was synced, and the
    // source's `update_date DESC` ordering means everything after this
    // point is at least as caught-up.
    let clean = should_mark_clean_exit(false, true);
    assert!(clean);
}
|
||||
|
||||
#[test]
fn dirty_exit_when_neither_completion_nor_stop_fired() {
    // Regression guard for the caller-imposed `hit_limit` cap (and any
    // other reason the walk ended early). `should_mark_clean_exit` has
    // no `hit_limit` parameter, so treating a limit cap as a clean exit
    // would require editing the helper itself, and an edit that makes
    // the "neither signal fired" case clean fails this assertion.
    let clean = should_mark_clean_exit(false, false);
    assert!(!clean);
}
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user