feat(crawler): single-mode walker gated by recovery flag (0.36.0)
Collapses the crawler to a single newest-first walker and replaces the N-consecutive-unchanged streak with a per-manga rule: stop on the first manga where metadata is Unchanged AND chapter sync reports zero new chapters.

The early stop is gated by a per-source recovery flag stored in `crawler_state` — set to `false` when a run starts, back to `true` only on a clean exit (end-of-walk or intentional stop). A crashed run leaves the flag `false` automatically (no shutdown code runs), so the next tick walks the full catalog instead of bailing at the first caught-up manga.

This means a crashed mid-walk run self-heals on the next tick: the flag stays `false`, the next walk visits every page (recovering anything the crash missed past its crash point), and steady state resumes once the recovery sweep reaches end-of-walk.

Removed:
- DiscoverMode enum, Backfill mode, the boundary re-check + displaced-refs machinery in TargetSourceWalker.
- Drop-pass (mark_dropped_mangas) and seed-completion plumbing (mark_seed_completed / seed_completed_at). The recovery flag subsumes the seed-completion signal; drop detection was explicitly opted out.
- JobPayload::Discover (no production callers).
- CRAWLER_MODE / CRAWLER_INCREMENTAL_STOP_AFTER env vars and the CrawlerModePref config type.

`should_mark_clean_exit(walked_to_completion, hit_stop_condition)` encodes the clean-exit truth table in its signature — `hit_limit` is deliberately absent so a future edit cannot accidentally count a caller-imposed cap as a clean exit.

Net -501 lines, 261 backend tests passing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -8,14 +8,16 @@
|
||||
//! updated (metadata_hash changed), or unchanged.
|
||||
//! - [`sync_manga_chapters`]: per-manga chapter reconciliation. Adds
|
||||
//! new ones, refreshes URLs on existing ones, soft-drops vanished.
|
||||
//! - [`mark_dropped_mangas`]: end-of-run pass. Any manga from this
|
||||
//! source whose `last_seen_at` is older than the run start is
|
||||
//! soft-dropped.
|
||||
//! - [`mark_run_started`] / [`mark_run_completed`] /
|
||||
//! [`last_run_completed_cleanly`]: per-source recovery flag in
|
||||
//! `crawler_state`. A `false` flag on tick start means the previous
|
||||
//! run did not exit cleanly and the next walk should ignore the
|
||||
//! early-stop condition.
|
||||
//!
|
||||
//! Each public function is a transaction boundary so a partial failure
|
||||
//! mid-call leaves the DB in its pre-call state.
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use chrono::Utc;
|
||||
use sqlx::{PgPool, Postgres, Transaction};
|
||||
use uuid::Uuid;
|
||||
|
||||
@@ -456,19 +458,14 @@ pub async fn sync_manga_chapters(
|
||||
Ok(diff)
|
||||
}
|
||||
|
||||
/// Record that a complete Backfill walk has finished for `source_id`.
|
||||
/// The presence of this row is what the daemon's mode auto-detection
|
||||
/// uses to flip from Backfill to Incremental on subsequent ticks.
|
||||
///
|
||||
/// Keyed `seed_completed:<source_id>` in `crawler_state`. JSON payload
|
||||
/// stores the timestamp so we can surface "last fully reseeded at" in
|
||||
/// future ops tooling without another migration.
|
||||
pub async fn mark_seed_completed(
|
||||
pool: &PgPool,
|
||||
source_id: &str,
|
||||
at: DateTime<Utc>,
|
||||
) -> sqlx::Result<()> {
|
||||
let key = format!("seed_completed:{source_id}");
|
||||
/// Mark a metadata pass as in-flight for `source_id`. Stamps
|
||||
/// `last_run_completed:<source_id>` in `crawler_state` with
|
||||
/// `{"completed": false, "at": now}`. A crash, panic, or SIGKILL after
|
||||
/// this point leaves the flag at `false`, which the next tick reads as
|
||||
/// "previous run did not exit cleanly — walk the full catalog this
|
||||
/// time" (recovery sweep).
|
||||
pub async fn mark_run_started(pool: &PgPool, source_id: &str) -> sqlx::Result<()> {
|
||||
let key = format!("last_run_completed:{source_id}");
|
||||
sqlx::query(
|
||||
"INSERT INTO crawler_state (key, value, updated_at) \
|
||||
VALUES ($1, $2, now()) \
|
||||
@@ -476,50 +473,54 @@ pub async fn mark_seed_completed(
|
||||
SET value = EXCLUDED.value, updated_at = now()",
|
||||
)
|
||||
.bind(&key)
|
||||
.bind(serde_json::json!({ "at": at.to_rfc3339() }))
|
||||
.bind(serde_json::json!({
|
||||
"completed": false,
|
||||
"at": Utc::now().to_rfc3339(),
|
||||
}))
|
||||
.execute(pool)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Read the timestamp written by [`mark_seed_completed`], if any.
|
||||
/// `None` means no complete Backfill has ever finished for this
|
||||
/// source — the daemon should run Backfill on the next tick.
|
||||
pub async fn seed_completed_at(
|
||||
/// Mark a metadata pass as completed cleanly for `source_id`. Called
|
||||
/// from the same place a run decides it reached end-of-walk or hit the
|
||||
/// intentional stop. The next tick reads `true` and applies the normal
|
||||
/// stop condition.
|
||||
pub async fn mark_run_completed(pool: &PgPool, source_id: &str) -> sqlx::Result<()> {
|
||||
let key = format!("last_run_completed:{source_id}");
|
||||
sqlx::query(
|
||||
"INSERT INTO crawler_state (key, value, updated_at) \
|
||||
VALUES ($1, $2, now()) \
|
||||
ON CONFLICT (key) DO UPDATE \
|
||||
SET value = EXCLUDED.value, updated_at = now()",
|
||||
)
|
||||
.bind(&key)
|
||||
.bind(serde_json::json!({
|
||||
"completed": true,
|
||||
"at": Utc::now().to_rfc3339(),
|
||||
}))
|
||||
.execute(pool)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Read the recovery flag for `source_id`. A missing row OR an
|
||||
/// unparseable value reads as `true` ("clean") — the former covers the
|
||||
/// first-ever run on a virgin DB (no recovery needed), the latter
|
||||
/// covers forward-compat against future schema changes; both fail-safe
|
||||
/// toward not making an operator pay for an unnecessary full sweep.
|
||||
pub async fn last_run_completed_cleanly(
|
||||
pool: &PgPool,
|
||||
source_id: &str,
|
||||
) -> sqlx::Result<Option<DateTime<Utc>>> {
|
||||
let key = format!("seed_completed:{source_id}");
|
||||
) -> sqlx::Result<bool> {
|
||||
let key = format!("last_run_completed:{source_id}");
|
||||
let row: Option<serde_json::Value> =
|
||||
sqlx::query_scalar("SELECT value FROM crawler_state WHERE key = $1")
|
||||
.bind(&key)
|
||||
.fetch_optional(pool)
|
||||
.await?;
|
||||
Ok(row.and_then(|v| {
|
||||
v.get("at")
|
||||
.and_then(|s| s.as_str())
|
||||
.and_then(|s| DateTime::parse_from_rfc3339(s).ok())
|
||||
.map(|dt| dt.with_timezone(&Utc))
|
||||
}))
|
||||
Ok(row
|
||||
.and_then(|v| v.get("completed").and_then(|b| b.as_bool()))
|
||||
.unwrap_or(true))
|
||||
}
|
||||
|
||||
pub async fn mark_dropped_mangas(
|
||||
pool: &PgPool,
|
||||
source_id: &str,
|
||||
run_started_at: DateTime<Utc>,
|
||||
) -> sqlx::Result<u64> {
|
||||
let res = sqlx::query(
|
||||
r#"
|
||||
UPDATE manga_sources
|
||||
SET dropped_at = NOW()
|
||||
WHERE source_id = $1
|
||||
AND last_seen_at < $2
|
||||
AND dropped_at IS NULL
|
||||
"#,
|
||||
)
|
||||
.bind(source_id)
|
||||
.bind(run_started_at)
|
||||
.execute(pool)
|
||||
.await?;
|
||||
Ok(res.rows_affected())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user