feat(crawler): single-mode walker gated by recovery flag (0.36.0)
Collapses the crawler to a single newest-first walker and replaces the N-consecutive-unchanged streak with a per-manga rule: stop on the first manga where metadata is Unchanged AND chapter sync reports zero new chapters.

The early stop is gated by a per-source recovery flag stored in `crawler_state` — set to `false` when a run starts, back to `true` only on a clean exit (end-of-walk or intentional stop). A crashed run leaves the flag `false` automatically (no shutdown code runs), so the next tick walks the full catalog instead of bailing at the first caught-up manga.

This means a crashed mid-walk run self-heals on the next tick: the flag stays `false`, the next walk visits every page (recovering anything the crash missed past its crash point), and steady state resumes once the recovery sweep reaches end-of-walk.

Removed:
- DiscoverMode enum, Backfill mode, the boundary re-check + displaced-refs machinery in TargetSourceWalker.
- Drop-pass (mark_dropped_mangas) and seed-completion plumbing (mark_seed_completed / seed_completed_at). The recovery flag subsumes the seed-completion signal; drop detection was explicitly opted out.
- JobPayload::Discover (no production callers).
- CRAWLER_MODE / CRAWLER_INCREMENTAL_STOP_AFTER env vars and the CrawlerModePref config type.

`should_mark_clean_exit(walked_to_completion, hit_stop_condition)` encodes the clean-exit truth table in its signature — `hit_limit` is deliberately absent so a future edit cannot accidentally count a caller-imposed cap as a clean exit.

Net -501 lines, 261 backend tests passing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,85 +0,0 @@
|
||||
//! Integration tests for the incremental-mode coordination state:
|
||||
//! `mark_seed_completed` / `seed_completed_at` round-trip via the
|
||||
//! `crawler_state` table.
|
||||
//!
|
||||
//! End-to-end pipeline behavior (walker + stop-on-Unchanged) requires
|
||||
//! a real `chromiumoxide::Browser` to construct a `FetchContext`, so
|
||||
//! the live integration of that path is covered by
|
||||
//! `crawler_browser_smoke.rs` instead. The pure stop logic itself is
|
||||
//! unit-tested in `crawler::pipeline::tests`.
|
||||
|
||||
use chrono::Utc;
|
||||
use mangalord::repo::crawler;
|
||||
use sqlx::PgPool;
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn seed_completed_at_none_before_any_run(pool: PgPool) {
|
||||
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
.await
|
||||
.unwrap();
|
||||
let res = crawler::seed_completed_at(&pool, "target").await.unwrap();
|
||||
assert!(res.is_none(), "fresh source has no seed marker");
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn mark_seed_completed_then_read_round_trips_timestamp(pool: PgPool) {
|
||||
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
.await
|
||||
.unwrap();
|
||||
let at = Utc::now();
|
||||
crawler::mark_seed_completed(&pool, "target", at)
|
||||
.await
|
||||
.unwrap();
|
||||
let read = crawler::seed_completed_at(&pool, "target")
|
||||
.await
|
||||
.unwrap()
|
||||
.expect("marker present after mark");
|
||||
// RFC3339 round-trip is millisecond-precise on chrono::Utc; allow a
|
||||
// 1ms tolerance to absorb postgres jsonb whitespace canonicalization.
|
||||
let drift = (read - at).num_milliseconds().abs();
|
||||
assert!(drift <= 1, "round-trip drift: {drift}ms (at={at}, read={read})");
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn mark_seed_completed_overwrites_previous_value(pool: PgPool) {
|
||||
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
.await
|
||||
.unwrap();
|
||||
let first = Utc::now() - chrono::Duration::hours(1);
|
||||
let second = Utc::now();
|
||||
crawler::mark_seed_completed(&pool, "target", first)
|
||||
.await
|
||||
.unwrap();
|
||||
crawler::mark_seed_completed(&pool, "target", second)
|
||||
.await
|
||||
.unwrap();
|
||||
let read = crawler::seed_completed_at(&pool, "target")
|
||||
.await
|
||||
.unwrap()
|
||||
.expect("marker present");
|
||||
let drift = (read - second).num_milliseconds().abs();
|
||||
assert!(drift <= 1, "should reflect the latest mark, not the first");
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn seed_completed_is_per_source(pool: PgPool) {
|
||||
// Two sources, only one is marked complete. The other must still
|
||||
// report None — the key is namespaced by source_id.
|
||||
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
.await
|
||||
.unwrap();
|
||||
crawler::ensure_source(&pool, "other", "O", "https://y.example")
|
||||
.await
|
||||
.unwrap();
|
||||
crawler::mark_seed_completed(&pool, "target", Utc::now())
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(crawler::seed_completed_at(&pool, "target")
|
||||
.await
|
||||
.unwrap()
|
||||
.is_some());
|
||||
assert!(crawler::seed_completed_at(&pool, "other")
|
||||
.await
|
||||
.unwrap()
|
||||
.is_none());
|
||||
}
|
||||
@@ -9,7 +9,6 @@ use std::time::Duration;
|
||||
use mangalord::crawler::jobs::{
|
||||
self, EnqueueResult, JobPayload, KIND_SYNC_CHAPTER_CONTENT,
|
||||
};
|
||||
use mangalord::crawler::source::DiscoverMode;
|
||||
use sqlx::PgPool;
|
||||
use uuid::Uuid;
|
||||
|
||||
@@ -21,10 +20,13 @@ fn chapter_content_payload(chapter_id: Uuid) -> JobPayload {
|
||||
}
|
||||
}
|
||||
|
||||
fn discover_payload() -> JobPayload {
|
||||
JobPayload::Discover {
|
||||
/// A non-`SyncChapterContent` payload, used to assert that only the
|
||||
/// chapter-content kind is deduplicated by the partial index and that
|
||||
/// `lease`'s kind filter correctly excludes other kinds.
|
||||
fn sync_manga_payload(key: &str) -> JobPayload {
|
||||
JobPayload::SyncManga {
|
||||
source_id: "target".into(),
|
||||
mode: DiscoverMode::Backfill,
|
||||
source_manga_key: key.into(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -141,7 +143,7 @@ async fn different_chapter_ids_can_coexist(pool: PgPool) {
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn non_chapter_content_payloads_are_never_deduped(pool: PgPool) {
|
||||
let p = discover_payload();
|
||||
let p = sync_manga_payload("foo");
|
||||
assert!(matches!(
|
||||
jobs::enqueue(&pool, &p).await.unwrap(),
|
||||
EnqueueResult::Inserted(_)
|
||||
@@ -185,7 +187,10 @@ async fn lease_marks_running_and_bumps_attempts_and_sets_leased_until(pool: PgPo
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn lease_with_kind_filter_only_matches_that_kind(pool: PgPool) {
|
||||
let discover_id = match jobs::enqueue(&pool, &discover_payload()).await.unwrap() {
|
||||
let manga_id = match jobs::enqueue(&pool, &sync_manga_payload("foo"))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
@@ -207,8 +212,8 @@ async fn lease_with_kind_filter_only_matches_that_kind(pool: PgPool) {
|
||||
.unwrap();
|
||||
assert_eq!(leases.len(), 1, "only chapter content payload leases");
|
||||
assert_eq!(leases[0].id, chapter_id);
|
||||
// discover is still pending
|
||||
assert_eq!(job_state(&pool, discover_id).await, "pending");
|
||||
// sync_manga is still pending
|
||||
assert_eq!(job_state(&pool, manga_id).await, "pending");
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
|
||||
82
backend/tests/crawler_recovery_flag.rs
Normal file
82
backend/tests/crawler_recovery_flag.rs
Normal file
@@ -0,0 +1,82 @@
|
||||
//! Integration tests for the per-source recovery flag:
|
||||
//! `mark_run_started` / `mark_run_completed` / `last_run_completed_cleanly`
|
||||
//! round-trip via the `crawler_state` table.
|
||||
//!
|
||||
//! End-to-end pipeline behavior (a crashed run forcing a recovery sweep
|
||||
//! on the next tick) requires a real `chromiumoxide::Browser` to drive
|
||||
//! the walker, so that path is covered by `crawler_browser_smoke.rs`.
|
||||
//! The pure stop-condition logic itself is unit-tested in
|
||||
//! `crawler::pipeline::tests`.
|
||||
|
||||
use mangalord::repo::crawler;
|
||||
use sqlx::PgPool;
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn defaults_to_clean_when_no_marker(pool: PgPool) {
|
||||
// First-ever run semantics: absence of the key must NOT trigger a
|
||||
// recovery walk on a virgin DB. Treat missing as "previous run
|
||||
// completed cleanly" so the first tick can take the early-stop path.
|
||||
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
.await
|
||||
.unwrap();
|
||||
let clean = crawler::last_run_completed_cleanly(&pool, "target")
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(clean, "absent marker must read as clean");
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn mark_run_started_flips_to_false(pool: PgPool) {
|
||||
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
.await
|
||||
.unwrap();
|
||||
crawler::mark_run_started(&pool, "target").await.unwrap();
|
||||
let clean = crawler::last_run_completed_cleanly(&pool, "target")
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(!clean, "after mark_run_started, flag must read false");
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn started_then_completed_round_trips_to_clean(pool: PgPool) {
|
||||
// Steady-state: a run starts (flag → false) and exits cleanly
|
||||
// (flag → true). The next tick should see "clean" and apply the
|
||||
// normal stop condition.
|
||||
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
.await
|
||||
.unwrap();
|
||||
crawler::mark_run_started(&pool, "target").await.unwrap();
|
||||
crawler::mark_run_completed(&pool, "target").await.unwrap();
|
||||
let clean = crawler::last_run_completed_cleanly(&pool, "target")
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(
|
||||
clean,
|
||||
"after start → complete the flag must round-trip to clean"
|
||||
);
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn flag_is_per_source(pool: PgPool) {
|
||||
// Two sources, only one is mid-run. The other must still report
|
||||
// clean — the crawler_state key is namespaced by source_id.
|
||||
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
.await
|
||||
.unwrap();
|
||||
crawler::ensure_source(&pool, "other", "O", "https://y.example")
|
||||
.await
|
||||
.unwrap();
|
||||
crawler::mark_run_started(&pool, "target").await.unwrap();
|
||||
assert!(
|
||||
!crawler::last_run_completed_cleanly(&pool, "target")
|
||||
.await
|
||||
.unwrap(),
|
||||
"target is mid-run"
|
||||
);
|
||||
assert!(
|
||||
crawler::last_run_completed_cleanly(&pool, "other")
|
||||
.await
|
||||
.unwrap(),
|
||||
"other source is untouched and reads clean"
|
||||
);
|
||||
}
|
||||
@@ -528,62 +528,6 @@ async fn sync_chapters_serializes_concurrent_calls_for_same_manga(pool: PgPool)
|
||||
);
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn mark_dropped_mangas_only_drops_unseen(pool: PgPool) {
|
||||
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
.await
|
||||
.unwrap();
|
||||
// Seed two mangas before "now" so a later run_started_at sees them as stale.
|
||||
let _ = crawler::upsert_manga_from_source(
|
||||
&pool,
|
||||
"target",
|
||||
"https://x.example/foo",
|
||||
&sample_manga("foo", "Foo", "hf"),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let _ = crawler::upsert_manga_from_source(
|
||||
&pool,
|
||||
"target",
|
||||
"https://x.example/bar",
|
||||
&sample_manga("bar", "Bar", "hb"),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Now mark a new "run" beginning. Re-upsert only `foo` — `bar`
|
||||
// should be the one flagged dropped.
|
||||
let run_started = chrono::Utc::now();
|
||||
// Sleep briefly so the second upsert's NOW() > run_started_at.
|
||||
tokio::time::sleep(std::time::Duration::from_millis(20)).await;
|
||||
let _ = crawler::upsert_manga_from_source(
|
||||
&pool,
|
||||
"target",
|
||||
"https://x.example/foo",
|
||||
&sample_manga("foo", "Foo", "hf"),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let n = crawler::mark_dropped_mangas(&pool, "target", run_started)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(n, 1, "only bar should have been dropped");
|
||||
|
||||
let foo_dropped: (Option<chrono::DateTime<chrono::Utc>>,) =
|
||||
sqlx::query_as("SELECT dropped_at FROM manga_sources WHERE source_manga_key = 'foo'")
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(foo_dropped.0.is_none(), "foo seen this run, must not be dropped");
|
||||
let bar_dropped: (Option<chrono::DateTime<chrono::Utc>>,) =
|
||||
sqlx::query_as("SELECT dropped_at FROM manga_sources WHERE source_manga_key = 'bar'")
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(bar_dropped.0.is_some());
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn upsert_surfaces_cover_image_path_for_backfill_decisions(pool: PgPool) {
|
||||
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
||||
|
||||
Reference in New Issue
Block a user