feat: crawler job queue ops and dedup index (0.27.0)

Adds enqueue / lease / ack_done / ack_failed / release / reap_done on
crawler::jobs, backed by the existing crawler_jobs table. lease() uses
a single FOR UPDATE SKIP LOCKED CTE that also re-claims stale running
rows (crashed-worker recovery), and ack_failed applies an exponential
backoff capped at 1h before retrying.

Migration 0014 adds a partial unique index on
(payload->>'chapter_id') restricted to (pending|running)
sync_chapter_content jobs, so producers can just
INSERT ... ON CONFLICT DO NOTHING without racing each other. The slot
frees again the moment the job leaves the in-flight states, so a
future force-refetch can re-enqueue.

Library-only — no daemon, no API hook. Those land in the next two
phases.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-25 19:59:09 +02:00
parent 89b84252a5
commit 93c7fd63fc
6 changed files with 676 additions and 6 deletions

View File

@@ -5,11 +5,11 @@
//! `leased_until`, and ack by transitioning to `done` (or backoff /
//! `dead`). Handlers are idempotent so a crash mid-run is recoverable
//! by replay.
//!
//! Scaffold only — the actual queue wrapper and handler dispatch land
//! once we have the first `Source` impl exercising the pipeline.
use std::time::Duration;
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use uuid::Uuid;
use super::source::DiscoverMode;
@@ -53,3 +53,217 @@ pub enum JobState {
Failed,
Dead,
}
/// Kind discriminator stored in `payload->>'kind'`. Public so callers
/// (daemon worker, bookmark hook) can filter `lease()` to a single kind
/// without re-spelling the literal.
pub const KIND_SYNC_CHAPTER_CONTENT: &str = "sync_chapter_content";
#[derive(Debug)]
pub enum EnqueueResult {
Inserted(Uuid),
Skipped,
}
#[derive(Debug, Clone)]
pub struct Lease {
pub id: Uuid,
pub payload: JobPayload,
pub attempts: i32,
pub max_attempts: i32,
}
/// Exponential backoff for `ack_failed` retries. `attempts` is the
/// post-increment value reported by `lease()` (so the first failure has
/// `attempts == 1` and waits 60s, the second 120s, etc.). Capped at 1h to
/// avoid runaway long sleeps that would outlive the daemon process.
fn backoff_for(attempts: i32) -> Duration {
let shift = attempts.saturating_sub(1).clamp(0, 20) as u32;
let secs = 60u64.saturating_mul(1u64 << shift);
Duration::from_secs(secs.min(3600))
}
/// Insert a new pending job. For `SyncChapterContent` payloads the
/// partial unique index `crawler_jobs_chapter_content_dedup_idx` blocks
/// a second `(pending|running)` insert per chapter_id, returning
/// `Skipped`. The slot frees again once the previous job leaves the
/// in-flight states (done/failed/dead), so a re-enqueue after a force
/// refetch succeeds.
pub async fn enqueue(pool: &PgPool, payload: &JobPayload) -> sqlx::Result<EnqueueResult> {
let json = serde_json::to_value(payload).expect("JobPayload is always serializable");
let id: Option<Uuid> = sqlx::query_scalar(
"INSERT INTO crawler_jobs (payload) VALUES ($1) \
ON CONFLICT DO NOTHING RETURNING id",
)
.bind(json)
.fetch_optional(pool)
.await?;
Ok(match id {
Some(id) => EnqueueResult::Inserted(id),
None => EnqueueResult::Skipped,
})
}
/// Lease up to `max` rows whose `state` is `pending`, or `running` with
/// an expired `leased_until` (the crashed-worker recovery path). The
/// inner CTE uses `FOR UPDATE SKIP LOCKED` so concurrent leasers don't
/// block each other and each row is handed to exactly one worker.
///
/// `kind_filter` matches against `payload->>'kind'`; `None` means
/// any kind.
pub async fn lease(
pool: &PgPool,
kind_filter: Option<&str>,
max: i64,
lease_duration: Duration,
) -> sqlx::Result<Vec<Lease>> {
let lease_ms: i64 = lease_duration.as_millis().min(i64::MAX as u128) as i64;
let rows: Vec<(Uuid, serde_json::Value, i32, i32)> = sqlx::query_as(
r#"
WITH leased AS (
SELECT id FROM crawler_jobs
WHERE (state = 'pending' OR (state = 'running' AND leased_until < now()))
AND scheduled_at <= now()
AND ($1::text IS NULL OR payload->>'kind' = $1)
ORDER BY scheduled_at
LIMIT $2
FOR UPDATE SKIP LOCKED
)
UPDATE crawler_jobs j
SET state = 'running',
attempts = j.attempts + 1,
leased_until = now() + ($3::bigint || ' milliseconds')::interval,
updated_at = now()
FROM leased l
WHERE j.id = l.id
RETURNING j.id, j.payload, j.attempts, j.max_attempts
"#,
)
.bind(kind_filter)
.bind(max)
.bind(lease_ms)
.fetch_all(pool)
.await?;
let mut leases = Vec::with_capacity(rows.len());
for (id, payload_json, attempts, max_attempts) in rows {
let payload: JobPayload = serde_json::from_value(payload_json).map_err(|e| {
sqlx::Error::Decode(format!("invalid JobPayload JSON for job {id}: {e}").into())
})?;
leases.push(Lease {
id,
payload,
attempts,
max_attempts,
});
}
Ok(leases)
}
/// Mark a leased job as successfully completed.
pub async fn ack_done(pool: &PgPool, lease_id: Uuid) -> sqlx::Result<()> {
sqlx::query(
"UPDATE crawler_jobs \
SET state = 'done', leased_until = NULL, updated_at = now() \
WHERE id = $1",
)
.bind(lease_id)
.execute(pool)
.await?;
Ok(())
}
/// Mark a leased job as failed. If the current attempt count has reached
/// `max_attempts` the job is terminally dead and stops retrying;
/// otherwise it goes back to `pending` with `scheduled_at` pushed into
/// the future by the exponential backoff.
pub async fn ack_failed(
pool: &PgPool,
lease_id: Uuid,
error: &str,
attempts: i32,
max_attempts: i32,
) -> sqlx::Result<()> {
if attempts >= max_attempts {
sqlx::query(
"UPDATE crawler_jobs \
SET state = 'dead', last_error = $2, leased_until = NULL, updated_at = now() \
WHERE id = $1",
)
.bind(lease_id)
.bind(error)
.execute(pool)
.await?;
} else {
let backoff_ms: i64 = backoff_for(attempts).as_millis().min(i64::MAX as u128) as i64;
sqlx::query(
"UPDATE crawler_jobs \
SET state = 'pending', last_error = $2, leased_until = NULL, \
scheduled_at = now() + ($3::bigint || ' milliseconds')::interval, \
updated_at = now() \
WHERE id = $1",
)
.bind(lease_id)
.bind(error)
.bind(backoff_ms)
.execute(pool)
.await?;
}
Ok(())
}
/// Return a leased job to `pending` without burning a retry attempt.
/// Used on graceful shutdown and on session-expired aborts where the
/// failure isn't the job's fault.
pub async fn release(pool: &PgPool, lease_id: Uuid) -> sqlx::Result<()> {
sqlx::query(
"UPDATE crawler_jobs \
SET state = 'pending', leased_until = NULL, \
attempts = GREATEST(0, attempts - 1), updated_at = now() \
WHERE id = $1",
)
.bind(lease_id)
.execute(pool)
.await?;
Ok(())
}
/// Delete `done` jobs whose `updated_at` is older than `retention_days`
/// days. `0` disables the reaper without touching the table. Returns the
/// number of rows removed.
pub async fn reap_done(pool: &PgPool, retention_days: u32) -> sqlx::Result<u64> {
if retention_days == 0 {
return Ok(0);
}
let result = sqlx::query(
"DELETE FROM crawler_jobs \
WHERE state = 'done' \
AND updated_at < now() - ($1::bigint || ' days')::interval",
)
.bind(retention_days as i64)
.execute(pool)
.await?;
Ok(result.rows_affected())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn backoff_grows_exponentially_and_caps_at_one_hour() {
// attempts == 1 → 60s, doubling each step.
assert_eq!(backoff_for(1), Duration::from_secs(60));
assert_eq!(backoff_for(2), Duration::from_secs(120));
assert_eq!(backoff_for(3), Duration::from_secs(240));
assert_eq!(backoff_for(4), Duration::from_secs(480));
assert_eq!(backoff_for(5), Duration::from_secs(960));
assert_eq!(backoff_for(6), Duration::from_secs(1920));
// 7th: 60 * 64 = 3840 → capped to 3600.
assert_eq!(backoff_for(7), Duration::from_secs(3600));
assert_eq!(backoff_for(20), Duration::from_secs(3600));
// Garbage / zero / negatives stay sane.
assert_eq!(backoff_for(0), Duration::from_secs(60));
assert_eq!(backoff_for(-5), Duration::from_secs(60));
}
}