feat: incremental crawl mode with seed-completion gate (0.33.0)
Daemon now auto-detects mode per source: Backfill until the first full walk records `seed_completed:<source>` in `crawler_state`, then Incremental (newest-first, stops after N consecutive Unchanged upserts). `CRAWLER_MODE` overrides to a fixed mode; CLI rejects `auto` since it has no pre-run DB state. `Source::discover` returns a lazy `DiscoverWalk` so Incremental can break out mid-walk without prefetching pages. The drop pass and seed marker are now gated on a true full walk — fixes a latent soft-drop of the index tail under partial sweeps. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -412,6 +412,53 @@ pub async fn sync_manga_chapters(
|
||||
Ok(diff)
|
||||
}
|
||||
|
||||
/// Record that a complete Backfill walk has finished for `source_id`.
|
||||
/// The presence of this row is what the daemon's mode auto-detection
|
||||
/// uses to flip from Backfill to Incremental on subsequent ticks.
|
||||
///
|
||||
/// Keyed `seed_completed:<source_id>` in `crawler_state`. JSON payload
|
||||
/// stores the timestamp so we can surface "last fully reseeded at" in
|
||||
/// future ops tooling without another migration.
|
||||
pub async fn mark_seed_completed(
|
||||
pool: &PgPool,
|
||||
source_id: &str,
|
||||
at: DateTime<Utc>,
|
||||
) -> sqlx::Result<()> {
|
||||
let key = format!("seed_completed:{source_id}");
|
||||
sqlx::query(
|
||||
"INSERT INTO crawler_state (key, value, updated_at) \
|
||||
VALUES ($1, $2, now()) \
|
||||
ON CONFLICT (key) DO UPDATE \
|
||||
SET value = EXCLUDED.value, updated_at = now()",
|
||||
)
|
||||
.bind(&key)
|
||||
.bind(serde_json::json!({ "at": at.to_rfc3339() }))
|
||||
.execute(pool)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Read the timestamp written by [`mark_seed_completed`], if any.
|
||||
/// `None` means no complete Backfill has ever finished for this
|
||||
/// source — the daemon should run Backfill on the next tick.
|
||||
pub async fn seed_completed_at(
|
||||
pool: &PgPool,
|
||||
source_id: &str,
|
||||
) -> sqlx::Result<Option<DateTime<Utc>>> {
|
||||
let key = format!("seed_completed:{source_id}");
|
||||
let row: Option<serde_json::Value> =
|
||||
sqlx::query_scalar("SELECT value FROM crawler_state WHERE key = $1")
|
||||
.bind(&key)
|
||||
.fetch_optional(pool)
|
||||
.await?;
|
||||
Ok(row.and_then(|v| {
|
||||
v.get("at")
|
||||
.and_then(|s| s.as_str())
|
||||
.and_then(|s| DateTime::parse_from_rfc3339(s).ok())
|
||||
.map(|dt| dt.with_timezone(&Utc))
|
||||
}))
|
||||
}
|
||||
|
||||
pub async fn mark_dropped_mangas(
|
||||
pool: &PgPool,
|
||||
source_id: &str,
|
||||
|
||||
Reference in New Issue
Block a user