feat(crawler): reliability fixes — heartbeat, streaming, jitter, timeout, breaker
A1 Lease heartbeat: jobs::renew keeps a long-but-healthy job's lease fresh so it is never stolen mid-flight nor inflated toward max_attempts. A2 Stream chapter pages straight to storage (peak memory = one image) and persist rows + page_count in one short transaction off the network path (S3-ready); roll back stored blobs on failure via Storage::delete. A3 ±20% jitter on exponential backoff to avoid a retry thundering herd. A4 Outer per-dispatch timeout (CRAWLER_JOB_TIMEOUT_SECS, default 600) so a hung job is acked-failed instead of wedging a worker. A5 Metadata circuit-breaker (CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES, default 10): abort a pass on a source outage without marking a clean exit, so the next tick recovery-sweeps. Adds CRAWLER_BROWSER_RESTART_THRESHOLD config (used by the upcoming coordinated browser restart). Bumps version 0.52.0 -> 0.53.0. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -40,6 +40,7 @@ fn make_cfg(
|
||||
tz: Tz::UTC,
|
||||
retention_days: 7,
|
||||
session_expired,
|
||||
job_timeout: Duration::from_secs(60),
|
||||
extra_tasks: Vec::new(),
|
||||
}
|
||||
}
|
||||
@@ -88,6 +89,52 @@ impl ChapterDispatcher for PanickingDispatcher {
|
||||
}
|
||||
}
|
||||
|
||||
/// Never completes — used to verify the worker's outer dispatch timeout.
|
||||
struct HangingDispatcher {
|
||||
seen: AtomicUsize,
|
||||
}
|
||||
#[async_trait::async_trait]
|
||||
impl ChapterDispatcher for HangingDispatcher {
|
||||
async fn dispatch(&self, _payload: JobPayload) -> anyhow::Result<SyncOutcome> {
|
||||
self.seen.fetch_add(1, Ordering::AcqRel);
|
||||
std::future::pending::<()>().await;
|
||||
unreachable!("hanging dispatcher never resolves");
|
||||
}
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn worker_times_out_a_hung_dispatch_and_acks_failed(pool: PgPool) {
|
||||
enqueue_chapter_job(&pool).await;
|
||||
let dispatcher = Arc::new(HangingDispatcher {
|
||||
seen: AtomicUsize::new(0),
|
||||
});
|
||||
let session_expired = Arc::new(std::sync::atomic::AtomicBool::new(false));
|
||||
let cancel = CancellationToken::new();
|
||||
let mut cfg = make_cfg(None, dispatcher.clone(), session_expired, 1);
|
||||
cfg.job_timeout = Duration::from_millis(300);
|
||||
let handle = daemon::spawn(pool.clone(), cancel.clone(), cfg);
|
||||
|
||||
// The hung job should time out and return to pending with backoff
|
||||
// (attempts=1 < max=5). Poll for the recorded error.
|
||||
let mut timed_out = false;
|
||||
for _ in 0..40 {
|
||||
let n: i64 = sqlx::query_scalar(
|
||||
"SELECT COUNT(*) FROM crawler_jobs WHERE last_error = 'dispatch timed out'",
|
||||
)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
if n == 1 {
|
||||
timed_out = true;
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(50)).await;
|
||||
}
|
||||
handle.shutdown().await;
|
||||
assert!(timed_out, "hung dispatch must be acked failed with a timeout error");
|
||||
assert!(dispatcher.seen.load(Ordering::Acquire) >= 1);
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn workers_drain_jobs_through_dispatcher(pool: PgPool) {
|
||||
enqueue_chapter_job(&pool).await;
|
||||
|
||||
Reference in New Issue
Block a user