feat(crawler): reliability fixes — heartbeat, streaming, jitter, timeout, breaker

A1 Lease heartbeat: jobs::renew keeps a long-but-healthy job's lease fresh
so the job is never stolen mid-flight and its attempt count is not inflated
toward max_attempts.
A2 Stream chapter pages straight to storage (peak memory = one image) and
persist rows + page_count in one short transaction off the network path
(S3-ready); roll back stored blobs on failure via Storage::delete.
A3 ±20% jitter on exponential backoff to avoid a retry thundering herd.
A4 Outer per-dispatch timeout (CRAWLER_JOB_TIMEOUT_SECS, default 600) so a
hung job is acked-failed instead of wedging a worker.
A5 Metadata circuit-breaker (CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES,
default 10): abort a pass on a source outage without marking a clean exit,
so the next tick recovery-sweeps.

Adds CRAWLER_BROWSER_RESTART_THRESHOLD config (used by the upcoming
coordinated browser restart). Bumps version 0.52.0 -> 0.53.0.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-06-03 20:13:17 +02:00
parent 679abae736
commit 7a6815661f
12 changed files with 578 additions and 71 deletions

View File

@@ -40,6 +40,7 @@ fn make_cfg(
tz: Tz::UTC,
retention_days: 7,
session_expired,
job_timeout: Duration::from_secs(60),
extra_tasks: Vec::new(),
}
}
@@ -88,6 +89,52 @@ impl ChapterDispatcher for PanickingDispatcher {
}
}
/// Dispatcher whose `dispatch` future never resolves.
///
/// Used to verify the worker's outer dispatch timeout: every call is counted
/// in `seen` before the future parks forever.
struct HangingDispatcher {
    /// Number of times `dispatch` has been entered.
    seen: AtomicUsize,
}

#[async_trait::async_trait]
impl ChapterDispatcher for HangingDispatcher {
    /// Records the invocation, then awaits a future that is never ready.
    async fn dispatch(&self, _payload: JobPayload) -> anyhow::Result<SyncOutcome> {
        let _calls_before = self.seen.fetch_add(1, Ordering::AcqRel);
        let () = std::future::pending::<()>().await;
        unreachable!("hanging dispatcher never resolves");
    }
}
/// A dispatch that never returns must be cut off by the worker's
/// `job_timeout` and recorded on the job row as a `dispatch timed out` error.
#[sqlx::test(migrations = "./migrations")]
async fn worker_times_out_a_hung_dispatch_and_acks_failed(pool: PgPool) {
    enqueue_chapter_job(&pool).await;

    let hanging = Arc::new(HangingDispatcher {
        seen: AtomicUsize::new(0),
    });
    let expired_flag = Arc::new(std::sync::atomic::AtomicBool::new(false));
    let token = CancellationToken::new();

    // Shrink the timeout so the hung dispatch is abandoned quickly.
    let mut worker_cfg = make_cfg(None, hanging.clone(), expired_flag, 1);
    worker_cfg.job_timeout = Duration::from_millis(300);
    let daemon_handle = daemon::spawn(pool.clone(), token.clone(), worker_cfg);

    // Poll (up to 40 rounds, 50 ms apart) until exactly one job row carries
    // the timeout error written by the worker.
    let mut saw_timeout = false;
    for _round in 0..40 {
        let matching_rows: i64 = sqlx::query_scalar(
            "SELECT COUNT(*) FROM crawler_jobs WHERE last_error = 'dispatch timed out'",
        )
        .fetch_one(&pool)
        .await
        .unwrap();

        saw_timeout = matching_rows == 1;
        if saw_timeout {
            break;
        }
        tokio::time::sleep(Duration::from_millis(50)).await;
    }

    daemon_handle.shutdown().await;

    assert!(saw_timeout, "hung dispatch must be acked failed with a timeout error");
    // The dispatcher must actually have been invoked at least once.
    assert!(hanging.seen.load(Ordering::Acquire) >= 1);
}
#[sqlx::test(migrations = "./migrations")]
async fn workers_drain_jobs_through_dispatcher(pool: PgPool) {
enqueue_chapter_job(&pool).await;

View File

@@ -185,6 +185,68 @@ async fn lease_marks_running_and_bumps_attempts_and_sets_leased_until(pool: PgPo
assert!(leased_until > chrono::Utc::now());
}
/// `jobs::renew` on a job that is still `running` reports ownership and pushes
/// `leased_until` forward by the requested window.
#[sqlx::test(migrations = "./migrations")]
async fn renew_extends_leased_until_while_running(pool: PgPool) {
    let payload = chapter_content_payload(Uuid::new_v4());
    let EnqueueResult::Inserted(id) = jobs::enqueue(&pool, &payload).await.unwrap() else {
        unreachable!()
    };

    // Take the lease with a short window first.
    let leased = jobs::lease(&pool, None, 1, Duration::from_secs(5))
        .await
        .unwrap();
    assert_eq!(leased.len(), 1);

    // Force leased_until into the recent past so that any future value seen
    // afterwards can only have come from the renew call.
    sqlx::query("UPDATE crawler_jobs SET leased_until = now() - interval '1 second' WHERE id = $1")
        .bind(id)
        .execute(&pool)
        .await
        .unwrap();

    let renewed = jobs::renew(&pool, id, Duration::from_secs(120))
        .await
        .unwrap();
    assert!(renewed, "renew on a running job returns true");

    let new_deadline: chrono::DateTime<chrono::Utc> =
        sqlx::query_scalar("SELECT leased_until FROM crawler_jobs WHERE id = $1")
            .bind(id)
            .fetch_one(&pool)
            .await
            .unwrap();

    // A 120s renewal must land comfortably more than 60s ahead of "now".
    let minimum_deadline = chrono::Utc::now() + chrono::Duration::seconds(60);
    assert!(
        new_deadline > minimum_deadline,
        "leased_until pushed ~120s into the future"
    );
    assert_eq!(job_state(&pool, id).await, "running");
}
/// `jobs::renew` on a job that has already been acked done must report that
/// the lease is no longer owned and must leave the job in the `done` state.
#[sqlx::test(migrations = "./migrations")]
async fn renew_is_noop_once_job_no_longer_running(pool: PgPool) {
    let id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
        .await
        .unwrap()
    {
        EnqueueResult::Inserted(id) => id,
        EnqueueResult::Skipped => unreachable!(),
    };
    let leases = jobs::lease(&pool, None, 1, Duration::from_secs(60))
        .await
        .unwrap();
    // Guard the index below: without this, a failed lease would surface as an
    // opaque index-out-of-bounds panic instead of a clear assertion failure.
    assert_eq!(leases.len(), 1);
    // Job completes — heartbeat should now see it's no longer ours.
    jobs::ack_done(&pool, leases[0].id).await.unwrap();
    let still_owned = jobs::renew(&pool, id, Duration::from_secs(120))
        .await
        .unwrap();
    assert!(!still_owned, "renew on a non-running job returns false");
    assert_eq!(job_state(&pool, id).await, "done");
}
#[sqlx::test(migrations = "./migrations")]
async fn lease_with_kind_filter_only_matches_that_kind(pool: PgPool) {
let manga_id = match jobs::enqueue(&pool, &sync_manga_payload("foo"))