feat: incremental crawl mode with seed-completion gate (0.33.0)

Daemon now auto-detects mode per source: Backfill until the first
full walk records `seed_completed:<source>` in `crawler_state`, then
Incremental (newest-first, stops after N consecutive Unchanged
upserts). `CRAWLER_MODE` overrides to a fixed mode; CLI rejects
`auto` since it has no pre-run DB state.

`Source::discover` returns a lazy `DiscoverWalk` so Incremental can
break out mid-walk without prefetching pages. The drop pass and seed
marker are now gated on a true full walk — fixes a latent soft-drop
of the index tail under partial sweeps.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-28 06:41:26 +02:00
parent 51f42b03e9
commit 45ce0d8f12
11 changed files with 761 additions and 162 deletions

View File

@@ -412,6 +412,53 @@ pub async fn sync_manga_chapters(
Ok(diff)
}
/// Persist the "seed completed" marker for `source_id`.
///
/// The marker is a `crawler_state` row keyed `seed_completed:<source_id>`.
/// Its existence is the signal the daemon's mode auto-detection relies on
/// to switch a source from Backfill to Incremental on later ticks.
///
/// The row's JSON value is `{"at": <RFC 3339 timestamp>}`, built from `at`,
/// so the time of the last full reseed can be surfaced later without a
/// schema change. Calling this again for the same source overwrites the
/// stored value and refreshes `updated_at`.
///
/// # Errors
///
/// Returns the underlying `sqlx` error if the upsert fails.
pub async fn mark_seed_completed(
    pool: &PgPool,
    source_id: &str,
    at: DateTime<Utc>,
) -> sqlx::Result<()> {
    // Upsert: the first completed walk inserts the row, later ones replace it.
    const UPSERT_SQL: &str = "INSERT INTO crawler_state (key, value, updated_at) \
                              VALUES ($1, $2, now()) \
                              ON CONFLICT (key) DO UPDATE \
                              SET value = EXCLUDED.value, updated_at = now()";

    let state_key = format!("seed_completed:{source_id}");
    let payload = serde_json::json!({ "at": at.to_rfc3339() });

    sqlx::query(UPSERT_SQL)
        .bind(&state_key)
        .bind(payload)
        .execute(pool)
        .await
        .map(|_| ())
}
/// Look up when the last complete Backfill finished for `source_id`.
///
/// Reads the `crawler_state` row keyed `seed_completed:<source_id>` (the
/// one written by [`mark_seed_completed`]) and parses its `"at"` field as
/// an RFC 3339 timestamp, normalised to UTC.
///
/// Returns `Ok(None)` when no such row exists — meaning no complete
/// Backfill has ever finished for this source, so the daemon should run
/// Backfill on the next tick. A row whose payload has no string `"at"`
/// field, or whose `"at"` is not valid RFC 3339, also yields `Ok(None)`.
///
/// # Errors
///
/// Returns the underlying `sqlx` error if the query fails.
pub async fn seed_completed_at(
    pool: &PgPool,
    source_id: &str,
) -> sqlx::Result<Option<DateTime<Utc>>> {
    let state_key = format!("seed_completed:{source_id}");
    let payload: Option<serde_json::Value> =
        sqlx::query_scalar("SELECT value FROM crawler_state WHERE key = $1")
            .bind(&state_key)
            .fetch_optional(pool)
            .await?;

    // Pull the raw timestamp string out of the JSON payload, if present.
    let raw_stamp = payload
        .as_ref()
        .and_then(|json| json.get("at"))
        .and_then(|field| field.as_str());

    // An unparseable timestamp is treated the same as a missing marker.
    let completed_at = raw_stamp
        .and_then(|text| DateTime::parse_from_rfc3339(text).ok())
        .map(|stamp| stamp.with_timezone(&Utc));

    Ok(completed_at)
}
pub async fn mark_dropped_mangas(
pool: &PgPool,
source_id: &str,