fix(crawler): panic-isolate the cron tick body (0.36.5)

Worker dispatch was already wrapped in AssertUnwindSafe(...)
.catch_unwind() — a panicking handler ack's the job failed and the
worker keeps going. The cron tick had no such guard: a panic in
metadata.run, enqueue_bookmarked_pending, reap_done, or
write_last_tick would kill the cron task. The JoinSet would drop it,
workers would keep running, and no future metadata pass would ever
fire until daemon restart.

Wrap the tick body (between advisory-lock acquire and unlock) in the
same AssertUnwindSafe(...).catch_unwind() pattern. The unlock and
connection drop run unconditionally so a panicked tick doesn't leave
the lock held for another replica.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-30 20:08:11 +02:00
parent 655ea42731
commit 5c04b0532b
4 changed files with 49 additions and 20 deletions

View File

@@ -233,23 +233,37 @@ impl CronContext {
return;
}
match self.metadata.run().await {
Ok(stats) => tracing::info!(?stats, "cron: metadata pass done"),
Err(e) => tracing::error!(?e, "cron: metadata pass failed"),
}
match pipeline::enqueue_bookmarked_pending(&self.pool).await {
Ok(summary) => tracing::info!(?summary, "cron: enqueued bookmarked-pending"),
Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"),
}
match jobs::reap_done(&self.pool, self.retention_days).await {
Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"),
Err(e) => tracing::error!(?e, "cron: done-job reaper failed"),
}
if let Err(e) = write_last_tick(&self.pool, Utc::now()).await {
tracing::warn!(?e, "cron: persist last_metadata_tick_at failed");
// Panic-isolate the tick body the same way `process_lease` does
// for worker dispatch. Without this, a panic in metadata.run
// (or any of the follow-on steps) would kill the cron task and
// no future tick would ever run — workers would keep going but
// no new metadata work would be scheduled until daemon restart.
// The advisory unlock below runs unconditionally so a panicked
// tick doesn't leave the lock held for another replica.
let metadata = &self.metadata;
let pool = &self.pool;
let retention_days = self.retention_days;
let body = async move {
match metadata.run().await {
Ok(stats) => tracing::info!(?stats, "cron: metadata pass done"),
Err(e) => tracing::error!(?e, "cron: metadata pass failed"),
}
match pipeline::enqueue_bookmarked_pending(pool).await {
Ok(summary) => {
tracing::info!(?summary, "cron: enqueued bookmarked-pending");
}
Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"),
}
match jobs::reap_done(pool, retention_days).await {
Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"),
Err(e) => tracing::error!(?e, "cron: done-job reaper failed"),
}
if let Err(e) = write_last_tick(pool, Utc::now()).await {
tracing::warn!(?e, "cron: persist last_metadata_tick_at failed");
}
};
if let Err(_panic) = AssertUnwindSafe(body).catch_unwind().await {
tracing::error!("cron: tick body panicked — continuing");
}
let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
@@ -626,4 +640,19 @@ mod tests {
let prev = previous_fire(now, at, Tz::UTC);
assert_eq!(prev, dt_utc(2026, 5, 24, 23, 30));
}
/// Documents the panic-isolation pattern `run_tick` now relies on:
/// `AssertUnwindSafe(...).catch_unwind().await` must yield `Err(_)`
/// when the wrapped future panics, so the surrounding loop (or in
/// our case, the unconditional advisory-unlock that follows) keeps
/// running. The shape of this test mirrors the production callsite.
#[tokio::test]
async fn assert_unwind_safe_catches_a_panicking_future() {
let result = AssertUnwindSafe(async {
panic!("boom");
})
.catch_unwind()
.await;
assert!(result.is_err(), "panicking future must yield Err");
}
}