feat(crawler): single-mode walker gated by recovery flag (0.36.0)
Collapses the crawler to a single newest-first walker and replaces the N-consecutive-unchanged streak with a per-manga rule: stop on the first manga where metadata is Unchanged AND chapter sync reports zero new chapters.

The early stop is gated by a per-source recovery flag stored in `crawler_state` — set to `false` when a run starts, back to `true` only on a clean exit (end-of-walk or intentional stop). A crashed run leaves the flag `false` automatically (no shutdown code runs), so the next tick walks the full catalog instead of bailing at the first caught-up manga.

This means a crashed mid-walk run self-heals on the next tick: the flag stays `false`, the next walk visits every page (recovering anything the crash missed past its crash point), and steady state resumes once the recovery sweep reaches end-of-walk.

Removed:
- DiscoverMode enum, Backfill mode, the boundary re-check + displaced-refs machinery in TargetSourceWalker.
- Drop-pass (mark_dropped_mangas) and seed-completion plumbing (mark_seed_completed / seed_completed_at). The recovery flag subsumes the seed-completion signal; drop detection was explicitly opted out.
- JobPayload::Discover (no production callers).
- CRAWLER_MODE / CRAWLER_INCREMENTAL_STOP_AFTER env vars and the CrawlerModePref config type.

`should_mark_clean_exit(walked_to_completion, hit_stop_condition)` encodes the clean-exit truth table in its signature — `hit_limit` is deliberately absent so a future edit cannot accidentally count a caller-imposed cap as a clean exit.

Net -501 lines, 261 backend tests passing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -528,62 +528,6 @@ async fn sync_chapters_serializes_concurrent_calls_for_same_manga(pool: PgPool)
|
||||
);
|
||||
}
|
||||
|
||||
/// Checks that `crawler::mark_dropped_mangas` flags only the manga that was
/// not re-upserted after the `run_started` timestamp.
///
/// Two mangas (`foo`, `bar`) are upserted for source `"target"`, a run start
/// time is captured, and only `foo` is upserted again. The test then expects
/// the call to report exactly one affected row, `foo` to keep a NULL
/// `dropped_at`, and `bar` to have a non-NULL `dropped_at`.
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn mark_dropped_mangas_only_drops_unseen(pool: PgPool) {
|
||||
// Register the source row that the upserts below are keyed against.
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
.await
|
||||
.unwrap();
|
||||
// Seed two mangas before "now" so a later run_started_at sees them as stale.
|
||||
let _ = crawler::upsert_manga_from_source(
|
||||
&pool,
|
||||
"target",
|
||||
"https://x.example/foo",
|
||||
&sample_manga("foo", "Foo", "hf"),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let _ = crawler::upsert_manga_from_source(
|
||||
&pool,
|
||||
"target",
|
||||
"https://x.example/bar",
|
||||
&sample_manga("bar", "Bar", "hb"),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Now mark a new "run" beginning. Re-upsert only `foo` — `bar`
|
||||
// should be the one flagged dropped.
|
||||
let run_started = chrono::Utc::now();
|
||||
// Sleep briefly so the second upsert's NOW() > run_started_at.
// NOTE(review): `run_started` is read from the application clock, while the
// upsert presumably stamps rows with the database's NOW(); the 20 ms gap only
// guarantees ordering if those two clocks agree — confirm.
|
||||
tokio::time::sleep(std::time::Duration::from_millis(20)).await;
|
||||
let _ = crawler::upsert_manga_from_source(
|
||||
&pool,
|
||||
"target",
|
||||
"https://x.example/foo",
|
||||
&sample_manga("foo", "Foo", "hf"),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// The returned count is asserted to be 1: only `bar` was left untouched
// after `run_started`.
let n = crawler::mark_dropped_mangas(&pool, "target", run_started)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(n, 1, "only bar should have been dropped");
|
||||
|
||||
// Read `dropped_at` back directly from `manga_sources` to verify the
// per-row outcome, not just the affected-row count.
let foo_dropped: (Option<chrono::DateTime<chrono::Utc>>,) =
|
||||
sqlx::query_as("SELECT dropped_at FROM manga_sources WHERE source_manga_key = 'foo'")
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(foo_dropped.0.is_none(), "foo seen this run, must not be dropped");
|
||||
let bar_dropped: (Option<chrono::DateTime<chrono::Utc>>,) =
|
||||
sqlx::query_as("SELECT dropped_at FROM manga_sources WHERE source_manga_key = 'bar'")
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
// `bar` was not re-upserted, so its `dropped_at` must now be set.
assert!(bar_dropped.0.is_some());
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn upsert_surfaces_cover_image_path_for_backfill_decisions(pool: PgPool) {
|
||||
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
|
||||
Reference in New Issue
Block a user