fix(crawler): panic-isolate the cron tick body (0.36.5)
Worker dispatch was already wrapped in AssertUnwindSafe(...).catch_unwind() — a panicking handler acks the job as failed and the worker keeps going. The cron tick had no such guard: a panic in metadata.run, enqueue_bookmarked_pending, reap_done, or write_last_tick would kill the cron task. The JoinSet would drop it, workers would keep running, and no future metadata pass would ever fire until the daemon was restarted. Wrap the tick body (between advisory-lock acquire and unlock) in the same AssertUnwindSafe(...).catch_unwind() pattern. The unlock and connection drop run unconditionally so a panicked tick doesn't leave the lock held for another replica. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.36.4"
|
version = "0.36.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"argon2",
|
"argon2",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.36.4"
|
version = "0.36.5"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
default-run = "mangalord"
|
default-run = "mangalord"
|
||||||
|
|
||||||
|
|||||||
@@ -233,24 +233,38 @@ impl CronContext {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
match self.metadata.run().await {
|
// Panic-isolate the tick body the same way `process_lease` does
|
||||||
|
// for worker dispatch. Without this, a panic in metadata.run
|
||||||
|
// (or any of the follow-on steps) would kill the cron task and
|
||||||
|
// no future tick would ever run — workers would keep going but
|
||||||
|
// no new metadata work would be scheduled until daemon restart.
|
||||||
|
// The advisory unlock below runs unconditionally so a panicked
|
||||||
|
// tick doesn't leave the lock held for another replica.
|
||||||
|
let metadata = &self.metadata;
|
||||||
|
let pool = &self.pool;
|
||||||
|
let retention_days = self.retention_days;
|
||||||
|
let body = async move {
|
||||||
|
match metadata.run().await {
|
||||||
Ok(stats) => tracing::info!(?stats, "cron: metadata pass done"),
|
Ok(stats) => tracing::info!(?stats, "cron: metadata pass done"),
|
||||||
Err(e) => tracing::error!(?e, "cron: metadata pass failed"),
|
Err(e) => tracing::error!(?e, "cron: metadata pass failed"),
|
||||||
}
|
}
|
||||||
|
match pipeline::enqueue_bookmarked_pending(pool).await {
|
||||||
match pipeline::enqueue_bookmarked_pending(&self.pool).await {
|
Ok(summary) => {
|
||||||
Ok(summary) => tracing::info!(?summary, "cron: enqueued bookmarked-pending"),
|
tracing::info!(?summary, "cron: enqueued bookmarked-pending");
|
||||||
|
}
|
||||||
Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"),
|
Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"),
|
||||||
}
|
}
|
||||||
|
match jobs::reap_done(pool, retention_days).await {
|
||||||
match jobs::reap_done(&self.pool, self.retention_days).await {
|
|
||||||
Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"),
|
Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"),
|
||||||
Err(e) => tracing::error!(?e, "cron: done-job reaper failed"),
|
Err(e) => tracing::error!(?e, "cron: done-job reaper failed"),
|
||||||
}
|
}
|
||||||
|
if let Err(e) = write_last_tick(pool, Utc::now()).await {
|
||||||
if let Err(e) = write_last_tick(&self.pool, Utc::now()).await {
|
|
||||||
tracing::warn!(?e, "cron: persist last_metadata_tick_at failed");
|
tracing::warn!(?e, "cron: persist last_metadata_tick_at failed");
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
if let Err(_panic) = AssertUnwindSafe(body).catch_unwind().await {
|
||||||
|
tracing::error!("cron: tick body panicked — continuing");
|
||||||
|
}
|
||||||
|
|
||||||
let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
|
let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
|
||||||
.bind(CRON_LOCK_KEY)
|
.bind(CRON_LOCK_KEY)
|
||||||
@@ -626,4 +640,19 @@ mod tests {
|
|||||||
let prev = previous_fire(now, at, Tz::UTC);
|
let prev = previous_fire(now, at, Tz::UTC);
|
||||||
assert_eq!(prev, dt_utc(2026, 5, 24, 23, 30));
|
assert_eq!(prev, dt_utc(2026, 5, 24, 23, 30));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Documents the panic-isolation pattern `run_tick` now relies on:
|
||||||
|
/// `AssertUnwindSafe(...).catch_unwind().await` must yield `Err(_)`
|
||||||
|
/// when the wrapped future panics, so the surrounding loop (or in
|
||||||
|
/// our case, the unconditional advisory-unlock that follows) keeps
|
||||||
|
/// running. The shape of this test mirrors the production callsite.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn assert_unwind_safe_catches_a_panicking_future() {
|
||||||
|
let result = AssertUnwindSafe(async {
|
||||||
|
panic!("boom");
|
||||||
|
})
|
||||||
|
.catch_unwind()
|
||||||
|
.await;
|
||||||
|
assert!(result.is_err(), "panicking future must yield Err");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "mangalord-frontend",
|
"name": "mangalord-frontend",
|
||||||
"version": "0.36.4",
|
"version": "0.36.5",
|
||||||
"private": true,
|
"private": true,
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
|||||||
Reference in New Issue
Block a user