Daemon now auto-detects mode per source: Backfill until the first full walk records `seed_completed:<source>` in `crawler_state`, then Incremental (newest-first, stops after N consecutive Unchanged upserts). `CRAWLER_MODE` overrides to a fixed mode; CLI rejects `auto` since it has no pre-run DB state. `Source::discover` returns a lazy `DiscoverWalk` so Incremental can break out mid-walk without prefetching pages. The drop pass and seed marker are now gated on a true full walk — fixes a latent soft-drop of the index tail under partial sweeps. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
86 lines
3.1 KiB
Rust
86 lines
3.1 KiB
Rust
//! Integration tests for the incremental-mode coordination state:
//! `mark_seed_completed` / `seed_completed_at` round-trip via the
//! `crawler_state` table.
//!
//! End-to-end pipeline behavior (walker + stop-on-Unchanged) requires
//! a real `chromiumoxide::Browser` to construct a `FetchContext`, so
//! the live integration of that path is covered by
//! `crawler_browser_smoke.rs` instead. The pure stop logic itself is
//! unit-tested in `crawler::pipeline::tests`.
|
|
|
|
use chrono::Utc;
|
|
use mangalord::repo::crawler;
|
|
use sqlx::PgPool;
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn seed_completed_at_none_before_any_run(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let res = crawler::seed_completed_at(&pool, "target").await.unwrap();
|
|
assert!(res.is_none(), "fresh source has no seed marker");
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn mark_seed_completed_then_read_round_trips_timestamp(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let at = Utc::now();
|
|
crawler::mark_seed_completed(&pool, "target", at)
|
|
.await
|
|
.unwrap();
|
|
let read = crawler::seed_completed_at(&pool, "target")
|
|
.await
|
|
.unwrap()
|
|
.expect("marker present after mark");
|
|
// RFC3339 round-trip is millisecond-precise on chrono::Utc; allow a
|
|
// 1ms tolerance to absorb postgres jsonb whitespace canonicalization.
|
|
let drift = (read - at).num_milliseconds().abs();
|
|
assert!(drift <= 1, "round-trip drift: {drift}ms (at={at}, read={read})");
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn mark_seed_completed_overwrites_previous_value(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let first = Utc::now() - chrono::Duration::hours(1);
|
|
let second = Utc::now();
|
|
crawler::mark_seed_completed(&pool, "target", first)
|
|
.await
|
|
.unwrap();
|
|
crawler::mark_seed_completed(&pool, "target", second)
|
|
.await
|
|
.unwrap();
|
|
let read = crawler::seed_completed_at(&pool, "target")
|
|
.await
|
|
.unwrap()
|
|
.expect("marker present");
|
|
let drift = (read - second).num_milliseconds().abs();
|
|
assert!(drift <= 1, "should reflect the latest mark, not the first");
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn seed_completed_is_per_source(pool: PgPool) {
|
|
// Two sources, only one is marked complete. The other must still
|
|
// report None — the key is namespaced by source_id.
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
crawler::ensure_source(&pool, "other", "O", "https://y.example")
|
|
.await
|
|
.unwrap();
|
|
crawler::mark_seed_completed(&pool, "target", Utc::now())
|
|
.await
|
|
.unwrap();
|
|
assert!(crawler::seed_completed_at(&pool, "target")
|
|
.await
|
|
.unwrap()
|
|
.is_some());
|
|
assert!(crawler::seed_completed_at(&pool, "other")
|
|
.await
|
|
.unwrap()
|
|
.is_none());
|
|
}
|