feat(crawler): single-mode walker gated by recovery flag (0.36.0)

Collapses the crawler to a single newest-first walker and replaces the
N-consecutive-unchanged streak with a per-manga rule: stop on the first
manga where metadata is Unchanged AND chapter sync reports zero new
chapters. The early stop is gated by a per-source recovery flag stored
in `crawler_state` — set to `false` when a run starts, back to `true`
only on a clean exit (end-of-walk or intentional stop). A crashed run
leaves the flag `false` automatically (no shutdown code runs), so the
next tick walks the full catalog instead of bailing at the first
caught-up manga.

This means a run that crashes mid-walk self-heals on the next tick: the
flag stays `false`, the next walk visits every page (recovering
anything beyond the point where the crashed run stopped), and steady
state resumes once the recovery sweep reaches end-of-walk.

Removed:
- DiscoverMode enum, Backfill mode, the boundary re-check +
  displaced-refs machinery in TargetSourceWalker.
- Drop-pass (mark_dropped_mangas) and seed-completion plumbing
  (mark_seed_completed / seed_completed_at). The recovery flag
  subsumes the seed-completion signal; drop detection was explicitly
  opted out of.
- JobPayload::Discover (no production callers).
- CRAWLER_MODE / CRAWLER_INCREMENTAL_STOP_AFTER env vars and the
  CrawlerModePref config type.

`should_mark_clean_exit(walked_to_completion, hit_stop_condition)`
encodes the clean-exit truth table in its signature — `hit_limit` is
deliberately absent so a future edit cannot accidentally count a
caller-imposed cap as a clean exit.

Net -501 lines, 261 backend tests passing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-29 23:49:28 +02:00
parent 33f7e19077
commit 9f56f283d4
15 changed files with 387 additions and 888 deletions

View File

@@ -8,14 +8,16 @@
//! updated (metadata_hash changed), or unchanged.
//! - [`sync_manga_chapters`]: per-manga chapter reconciliation. Adds
//! new ones, refreshes URLs on existing ones, soft-drops vanished.
//! - [`mark_dropped_mangas`]: end-of-run pass. Any manga from this
//! source whose `last_seen_at` is older than the run start is
//! soft-dropped.
//! - [`mark_run_started`] / [`mark_run_completed`] /
//! [`last_run_completed_cleanly`]: per-source recovery flag in
//! `crawler_state`. A `false` flag on tick start means the previous
//! run did not exit cleanly and the next walk should ignore the
//! early-stop condition.
//!
//! Each public function is a transaction boundary so a partial failure
//! mid-call leaves the DB in its pre-call state.
use chrono::{DateTime, Utc};
use chrono::Utc;
use sqlx::{PgPool, Postgres, Transaction};
use uuid::Uuid;
@@ -456,19 +458,14 @@ pub async fn sync_manga_chapters(
Ok(diff)
}
/// Record that a complete Backfill walk has finished for `source_id`.
/// The presence of this row is what the daemon's mode auto-detection
/// uses to flip from Backfill to Incremental on subsequent ticks.
///
/// Keyed `seed_completed:<source_id>` in `crawler_state`. JSON payload
/// stores the timestamp so we can surface "last fully reseeded at" in
/// future ops tooling without another migration.
pub async fn mark_seed_completed(
pool: &PgPool,
source_id: &str,
at: DateTime<Utc>,
) -> sqlx::Result<()> {
let key = format!("seed_completed:{source_id}");
/// Mark a metadata pass as in-flight for `source_id`. Stamps
/// `last_run_completed:<source_id>` in `crawler_state` with
/// `{"completed": false, "at": now}`. A crash, panic, or SIGKILL after
/// this point leaves the flag at `false`, which the next tick reads as
/// "previous run did not exit cleanly — walk the full catalog this
/// time" (recovery sweep).
pub async fn mark_run_started(pool: &PgPool, source_id: &str) -> sqlx::Result<()> {
let key = format!("last_run_completed:{source_id}");
sqlx::query(
"INSERT INTO crawler_state (key, value, updated_at) \
VALUES ($1, $2, now()) \
@@ -476,50 +473,54 @@ pub async fn mark_seed_completed(
SET value = EXCLUDED.value, updated_at = now()",
)
.bind(&key)
.bind(serde_json::json!({ "at": at.to_rfc3339() }))
.bind(serde_json::json!({
"completed": false,
"at": Utc::now().to_rfc3339(),
}))
.execute(pool)
.await?;
Ok(())
}
/// Read the timestamp written by [`mark_seed_completed`], if any.
/// `None` means no complete Backfill has ever finished for this
/// source — the daemon should run Backfill on the next tick.
pub async fn seed_completed_at(
/// Record a clean finish of the metadata pass for `source_id`.
///
/// Upserts the `last_run_completed:<source_id>` row in `crawler_state`
/// with `{"completed": true, "at": <current UTC time, RFC 3339>}`,
/// overwriting any value the key held before and refreshing
/// `updated_at`. Meant to be called from the point where a run decides
/// it reached end-of-walk or hit the intentional stop, so that the next
/// tick reads `true` and applies the normal stop condition.
///
/// # Errors
///
/// Returns the `sqlx` error if the upsert cannot be executed.
pub async fn mark_run_completed(pool: &PgPool, source_id: &str) -> sqlx::Result<()> {
    // Single-statement upsert: insert the flag row, or replace its value
    // when the key already exists.
    const UPSERT_FLAG: &str = "INSERT INTO crawler_state (key, value, updated_at) \
         VALUES ($1, $2, now()) \
         ON CONFLICT (key) DO UPDATE \
         SET value = EXCLUDED.value, updated_at = now()";

    let state_key = format!("last_run_completed:{source_id}");
    let flag = serde_json::json!({
        "completed": true,
        "at": Utc::now().to_rfc3339(),
    });

    sqlx::query(UPSERT_FLAG)
        .bind(&state_key)
        .bind(flag)
        .execute(pool)
        .await
        // The query result (rows affected) carries no information callers need.
        .map(|_| ())
}
/// Read the recovery flag for `source_id`. A missing row OR an
/// unparseable value reads as `true` ("clean") — the former covers the
/// first-ever run on a virgin DB (no recovery needed), the latter
/// covers forward-compat against future schema changes; both cases
/// deliberately err toward sparing the operator an unnecessary full sweep.
pub async fn last_run_completed_cleanly(
pool: &PgPool,
source_id: &str,
) -> sqlx::Result<Option<DateTime<Utc>>> {
let key = format!("seed_completed:{source_id}");
) -> sqlx::Result<bool> {
let key = format!("last_run_completed:{source_id}");
let row: Option<serde_json::Value> =
sqlx::query_scalar("SELECT value FROM crawler_state WHERE key = $1")
.bind(&key)
.fetch_optional(pool)
.await?;
Ok(row.and_then(|v| {
v.get("at")
.and_then(|s| s.as_str())
.and_then(|s| DateTime::parse_from_rfc3339(s).ok())
.map(|dt| dt.with_timezone(&Utc))
}))
Ok(row
.and_then(|v| v.get("completed").and_then(|b| b.as_bool()))
.unwrap_or(true))
}
/// Soft-drop every manga of `source_id` not seen since `run_started_at`.
///
/// Sets `dropped_at = NOW()` on each `manga_sources` row belonging to
/// `source_id` whose `last_seen_at` is earlier than `run_started_at`
/// and whose `dropped_at` is still `NULL`. Rows that are already
/// dropped are left untouched.
///
/// Returns the number of rows the update affected.
///
/// # Errors
///
/// Returns the `sqlx` error if the update cannot be executed.
pub async fn mark_dropped_mangas(
    pool: &PgPool,
    source_id: &str,
    run_started_at: DateTime<Utc>,
) -> sqlx::Result<u64> {
    // $1 = source id, $2 = start of the current run.
    const SOFT_DROP_STALE: &str = r#"
UPDATE manga_sources
SET dropped_at = NOW()
WHERE source_id = $1
AND last_seen_at < $2
AND dropped_at IS NULL
"#;

    sqlx::query(SOFT_DROP_STALE)
        .bind(source_id)
        .bind(run_started_at)
        .execute(pool)
        .await
        .map(|outcome| outcome.rows_affected())
}