feat(crawler): single-mode walker gated by recovery flag (0.36.0)

Collapses the crawler to a single newest-first walker and replaces the N-consecutive-unchanged streak with a per-manga rule: stop on the first manga where metadata is Unchanged AND chapter sync reports zero new chapters.

The early stop is gated by a per-source recovery flag stored in `crawler_state`: it is set to `false` when a run starts and flipped back to `true` only on a clean exit (end-of-walk or intentional stop). A crashed run leaves the flag `false` automatically (no shutdown code runs), so the next tick walks the full catalog instead of bailing at the first caught-up manga. In other words, a run that crashes mid-walk self-heals on the next tick: the flag stays `false`, the next walk visits every page (recovering anything past the point where the previous run crashed), and steady state resumes once the recovery sweep reaches end-of-walk.

Removed:
- DiscoverMode enum, Backfill mode, and the boundary re-check + displaced-refs machinery in TargetSourceWalker.
- Drop-pass (mark_dropped_mangas) and seed-completion plumbing (mark_seed_completed / seed_completed_at). The recovery flag subsumes the seed-completion signal; drop detection was explicitly opted out.
- JobPayload::Discover (no production callers).
- CRAWLER_MODE / CRAWLER_INCREMENTAL_STOP_AFTER env vars and the CrawlerModePref config type.

`should_mark_clean_exit(walked_to_completion, hit_stop_condition)` encodes the clean-exit truth table in its signature; `hit_limit` is deliberately absent so that a future edit cannot accidentally count a caller-imposed cap as a clean exit.

Net -501 lines, 261 backend tests passing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-29 23:49:28 +02:00
parent 33f7e19077
commit 9f56f283d4
15 changed files with 387 additions and 888 deletions
--- a/backend/tests/crawler_incremental.rs
+++ b/backend/tests/crawler_incremental.rs
@@ -1,85 +0,0 @@
-//! Integration tests for the incremental-mode coordination state:
-//! `mark_seed_completed` / `seed_completed_at` round-trip via the
-//! `crawler_state` table.
-//!
-//! End-to-end pipeline behavior (walker + stop-on-Unchanged) requires
-//! a real `chromiumoxide::Browser` to construct a `FetchContext`, so
-//! the live integration of that path is covered by
-//! `crawler_browser_smoke.rs` instead. The pure stop logic itself is
-//! unit-tested in `crawler::pipeline::tests`.
-
-use chrono::Utc;
-use mangalord::repo::crawler;
-use sqlx::PgPool;
-
-#[sqlx::test(migrations = "./migrations")]
-async fn seed_completed_at_none_before_any_run(pool: PgPool) {
-    crawler::ensure_source(&pool, "target", "T", "https://x.example")
-        .await
-        .unwrap();
-    let res = crawler::seed_completed_at(&pool, "target").await.unwrap();
-    assert!(res.is_none(), "fresh source has no seed marker");
-}
-
-#[sqlx::test(migrations = "./migrations")]
-async fn mark_seed_completed_then_read_round_trips_timestamp(pool: PgPool) {
-    crawler::ensure_source(&pool, "target", "T", "https://x.example")
-        .await
-        .unwrap();
-    let at = Utc::now();
-    crawler::mark_seed_completed(&pool, "target", at)
-        .await
-        .unwrap();
-    let read = crawler::seed_completed_at(&pool, "target")
-        .await
-        .unwrap()
-        .expect("marker present after mark");
-    // RFC3339 round-trip is millisecond-precise on chrono::Utc; allow a
-    // 1ms tolerance to absorb postgres jsonb whitespace canonicalization.
-    let drift = (read - at).num_milliseconds().abs();
-    assert!(drift <= 1, "round-trip drift: {drift}ms (at={at}, read={read})");
-}
-
-#[sqlx::test(migrations = "./migrations")]
-async fn mark_seed_completed_overwrites_previous_value(pool: PgPool) {
-    crawler::ensure_source(&pool, "target", "T", "https://x.example")
-        .await
-        .unwrap();
-    let first = Utc::now() - chrono::Duration::hours(1);
-    let second = Utc::now();
-    crawler::mark_seed_completed(&pool, "target", first)
-        .await
-        .unwrap();
-    crawler::mark_seed_completed(&pool, "target", second)
-        .await
-        .unwrap();
-    let read = crawler::seed_completed_at(&pool, "target")
-        .await
-        .unwrap()
-        .expect("marker present");
-    let drift = (read - second).num_milliseconds().abs();
-    assert!(drift <= 1, "should reflect the latest mark, not the first");
-}
-
-#[sqlx::test(migrations = "./migrations")]
-async fn seed_completed_is_per_source(pool: PgPool) {
-    // Two sources, only one is marked complete. The other must still
-    // report None — the key is namespaced by source_id.
-    crawler::ensure_source(&pool, "target", "T", "https://x.example")
-        .await
-        .unwrap();
-    crawler::ensure_source(&pool, "other", "O", "https://y.example")
-        .await
-        .unwrap();
-    crawler::mark_seed_completed(&pool, "target", Utc::now())
-        .await
-        .unwrap();
-    assert!(crawler::seed_completed_at(&pool, "target")
-        .await
-        .unwrap()
-        .is_some());
-    assert!(crawler::seed_completed_at(&pool, "other")
-        .await
-        .unwrap()
-        .is_none());
-}
--- a/backend/tests/crawler_jobs.rs
+++ b/backend/tests/crawler_jobs.rs
@@ -9,7 +9,6 @@ use std::time::Duration;
 use mangalord::crawler::jobs::{
    self, EnqueueResult, JobPayload, KIND_SYNC_CHAPTER_CONTENT,
 };
-use mangalord::crawler::source::DiscoverMode;
 use sqlx::PgPool;
 use uuid::Uuid;

@@ -21,10 +20,13 @@ fn chapter_content_payload(chapter_id: Uuid) -> JobPayload {
    }
 }

-fn discover_payload() -> JobPayload {
-    JobPayload::Discover {
+/// A non-`SyncChapterContent` payload, used to assert that only the
+/// chapter-content kind is deduplicated by the partial index and that
+/// `lease`'s kind filter correctly excludes other kinds.
+fn sync_manga_payload(key: &str) -> JobPayload {
+    JobPayload::SyncManga {
        source_id: "target".into(),
-        mode: DiscoverMode::Backfill,
+        source_manga_key: key.into(),
    }
 }

@@ -141,7 +143,7 @@ async fn different_chapter_ids_can_coexist(pool: PgPool) {

 #[sqlx::test(migrations = "./migrations")]
 async fn non_chapter_content_payloads_are_never_deduped(pool: PgPool) {
-    let p = discover_payload();
+    let p = sync_manga_payload("foo");
    assert!(matches!(
        jobs::enqueue(&pool, &p).await.unwrap(),
        EnqueueResult::Inserted(_)
@@ -185,7 +187,10 @@ async fn lease_marks_running_and_bumps_attempts_and_sets_leased_until(pool: PgPo

 #[sqlx::test(migrations = "./migrations")]
 async fn lease_with_kind_filter_only_matches_that_kind(pool: PgPool) {
-    let discover_id = match jobs::enqueue(&pool, &discover_payload()).await.unwrap() {
+    let manga_id = match jobs::enqueue(&pool, &sync_manga_payload("foo"))
+        .await
+        .unwrap()
+    {
        EnqueueResult::Inserted(id) => id,
        _ => unreachable!(),
    };
@@ -207,8 +212,8 @@ async fn lease_with_kind_filter_only_matches_that_kind(pool: PgPool) {
    .unwrap();
    assert_eq!(leases.len(), 1, "only chapter content payload leases");
    assert_eq!(leases[0].id, chapter_id);
-    // discover is still pending
-    assert_eq!(job_state(&pool, discover_id).await, "pending");
+    // sync_manga is still pending
+    assert_eq!(job_state(&pool, manga_id).await, "pending");
 }

 #[sqlx::test(migrations = "./migrations")]
--- a/backend/tests/crawler_recovery_flag.rs
+++ b/backend/tests/crawler_recovery_flag.rs
@@ -0,0 +1,82 @@
+//! Integration tests for the per-source recovery flag:
+//! `mark_run_started` / `mark_run_completed` / `last_run_completed_cleanly`
+//! round-trip via the `crawler_state` table.
+//!
+//! End-to-end pipeline behavior (a crashed run forcing a recovery sweep
+//! on the next tick) requires a real `chromiumoxide::Browser` to drive
+//! the walker, so that path is covered by `crawler_browser_smoke.rs`.
+//! The pure stop-condition logic itself is unit-tested in
+//! `crawler::pipeline::tests`.
+
+use mangalord::repo::crawler;
+use sqlx::PgPool;
+
+#[sqlx::test(migrations = "./migrations")]
+async fn defaults_to_clean_when_no_marker(pool: PgPool) {
+    // First-ever run semantics: absence of the key must NOT trigger a
+    // recovery walk on a virgin DB. Treat missing as "previous run
+    // completed cleanly" so the first tick can take the early-stop path.
+    crawler::ensure_source(&pool, "target", "T", "https://x.example")
+        .await
+        .unwrap();
+    let clean = crawler::last_run_completed_cleanly(&pool, "target")
+        .await
+        .unwrap();
+    assert!(clean, "absent marker must read as clean");
+}
+
+#[sqlx::test(migrations = "./migrations")]
+async fn mark_run_started_flips_to_false(pool: PgPool) {
+    crawler::ensure_source(&pool, "target", "T", "https://x.example")
+        .await
+        .unwrap();
+    crawler::mark_run_started(&pool, "target").await.unwrap();
+    let clean = crawler::last_run_completed_cleanly(&pool, "target")
+        .await
+        .unwrap();
+    assert!(!clean, "after mark_run_started, flag must read false");
+}
+
+#[sqlx::test(migrations = "./migrations")]
+async fn started_then_completed_round_trips_to_clean(pool: PgPool) {
+    // Steady-state: a run starts (flag → false) and exits cleanly
+    // (flag → true). The next tick should see "clean" and apply the
+    // normal stop condition.
+    crawler::ensure_source(&pool, "target", "T", "https://x.example")
+        .await
+        .unwrap();
+    crawler::mark_run_started(&pool, "target").await.unwrap();
+    crawler::mark_run_completed(&pool, "target").await.unwrap();
+    let clean = crawler::last_run_completed_cleanly(&pool, "target")
+        .await
+        .unwrap();
+    assert!(
+        clean,
+        "after start → complete the flag must round-trip to clean"
+    );
+}
+
+#[sqlx::test(migrations = "./migrations")]
+async fn flag_is_per_source(pool: PgPool) {
+    // Two sources, only one is mid-run. The other must still report
+    // clean — the crawler_state key is namespaced by source_id.
+    crawler::ensure_source(&pool, "target", "T", "https://x.example")
+        .await
+        .unwrap();
+    crawler::ensure_source(&pool, "other", "O", "https://y.example")
+        .await
+        .unwrap();
+    crawler::mark_run_started(&pool, "target").await.unwrap();
+    assert!(
+        !crawler::last_run_completed_cleanly(&pool, "target")
+            .await
+            .unwrap(),
+        "target is mid-run"
+    );
+    assert!(
+        crawler::last_run_completed_cleanly(&pool, "other")
+            .await
+            .unwrap(),
+        "other source is untouched and reads clean"
+    );
+}
--- a/backend/tests/crawler_sync.rs
+++ b/backend/tests/crawler_sync.rs
@@ -528,62 +528,6 @@ async fn sync_chapters_serializes_concurrent_calls_for_same_manga(pool: PgPool)
    );
 }

-#[sqlx::test(migrations = "./migrations")]
-async fn mark_dropped_mangas_only_drops_unseen(pool: PgPool) {
-    crawler::ensure_source(&pool, "target", "T", "https://x.example")
-        .await
-        .unwrap();
-    // Seed two mangas before "now" so a later run_started_at sees them as stale.
-    let _ = crawler::upsert_manga_from_source(
-        &pool,
-        "target",
-        "https://x.example/foo",
-        &sample_manga("foo", "Foo", "hf"),
-    )
-    .await
-    .unwrap();
-    let _ = crawler::upsert_manga_from_source(
-        &pool,
-        "target",
-        "https://x.example/bar",
-        &sample_manga("bar", "Bar", "hb"),
-    )
-    .await
-    .unwrap();
-
-    // Now mark a new "run" beginning. Re-upsert only `foo` — `bar`
-    // should be the one flagged dropped.
-    let run_started = chrono::Utc::now();
-    // Sleep briefly so the second upsert's NOW() > run_started_at.
-    tokio::time::sleep(std::time::Duration::from_millis(20)).await;
-    let _ = crawler::upsert_manga_from_source(
-        &pool,
-        "target",
-        "https://x.example/foo",
-        &sample_manga("foo", "Foo", "hf"),
-    )
-    .await
-    .unwrap();
-
-    let n = crawler::mark_dropped_mangas(&pool, "target", run_started)
-        .await
-        .unwrap();
-    assert_eq!(n, 1, "only bar should have been dropped");
-
-    let foo_dropped: (Option<chrono::DateTime<chrono::Utc>>,) =
-        sqlx::query_as("SELECT dropped_at FROM manga_sources WHERE source_manga_key = 'foo'")
-            .fetch_one(&pool)
-            .await
-            .unwrap();
-    assert!(foo_dropped.0.is_none(), "foo seen this run, must not be dropped");
-    let bar_dropped: (Option<chrono::DateTime<chrono::Utc>>,) =
-        sqlx::query_as("SELECT dropped_at FROM manga_sources WHERE source_manga_key = 'bar'")
-            .fetch_one(&pool)
-            .await
-            .unwrap();
-    assert!(bar_dropped.0.is_some());
-}
-
 #[sqlx::test(migrations = "./migrations")]
 async fn upsert_surfaces_cover_image_path_for_backfill_decisions(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")