fix(crawler): panic-isolate the cron tick body (0.36.5)
Worker dispatch was already wrapped in AssertUnwindSafe(...).catch_unwind() — a panicking handler acks the job as failed and the worker keeps going. The cron tick had no such guard: a panic in metadata.run, enqueue_bookmarked_pending, reap_done, or write_last_tick would kill the cron task. The JoinSet would drop it, workers would keep running, and no future metadata pass would ever fire until the daemon was restarted.

Wrap the tick body (between advisory-lock acquire and unlock) in the same AssertUnwindSafe(...).catch_unwind() pattern. The unlock and connection drop run unconditionally, so a panicked tick doesn't leave the lock held and block another replica.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.36.4"
|
version = "0.36.5"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"argon2",
|
"argon2",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.36.4"
|
version = "0.36.5"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
default-run = "mangalord"
|
default-run = "mangalord"
|
||||||
|
|
||||||
|
|||||||
@@ -233,23 +233,37 @@ impl CronContext {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
match self.metadata.run().await {
|
// Panic-isolate the tick body the same way `process_lease` does
|
||||||
Ok(stats) => tracing::info!(?stats, "cron: metadata pass done"),
|
// for worker dispatch. Without this, a panic in metadata.run
|
||||||
Err(e) => tracing::error!(?e, "cron: metadata pass failed"),
|
// (or any of the follow-on steps) would kill the cron task and
|
||||||
}
|
// no future tick would ever run — workers would keep going but
|
||||||
|
// no new metadata work would be scheduled until daemon restart.
|
||||||
match pipeline::enqueue_bookmarked_pending(&self.pool).await {
|
// The advisory unlock below runs unconditionally so a panicked
|
||||||
Ok(summary) => tracing::info!(?summary, "cron: enqueued bookmarked-pending"),
|
// tick doesn't leave the lock held for another replica.
|
||||||
Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"),
|
let metadata = &self.metadata;
|
||||||
}
|
let pool = &self.pool;
|
||||||
|
let retention_days = self.retention_days;
|
||||||
match jobs::reap_done(&self.pool, self.retention_days).await {
|
let body = async move {
|
||||||
Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"),
|
match metadata.run().await {
|
||||||
Err(e) => tracing::error!(?e, "cron: done-job reaper failed"),
|
Ok(stats) => tracing::info!(?stats, "cron: metadata pass done"),
|
||||||
}
|
Err(e) => tracing::error!(?e, "cron: metadata pass failed"),
|
||||||
|
}
|
||||||
if let Err(e) = write_last_tick(&self.pool, Utc::now()).await {
|
match pipeline::enqueue_bookmarked_pending(pool).await {
|
||||||
tracing::warn!(?e, "cron: persist last_metadata_tick_at failed");
|
Ok(summary) => {
|
||||||
|
tracing::info!(?summary, "cron: enqueued bookmarked-pending");
|
||||||
|
}
|
||||||
|
Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"),
|
||||||
|
}
|
||||||
|
match jobs::reap_done(pool, retention_days).await {
|
||||||
|
Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"),
|
||||||
|
Err(e) => tracing::error!(?e, "cron: done-job reaper failed"),
|
||||||
|
}
|
||||||
|
if let Err(e) = write_last_tick(pool, Utc::now()).await {
|
||||||
|
tracing::warn!(?e, "cron: persist last_metadata_tick_at failed");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if let Err(_panic) = AssertUnwindSafe(body).catch_unwind().await {
|
||||||
|
tracing::error!("cron: tick body panicked — continuing");
|
||||||
}
|
}
|
||||||
|
|
||||||
let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
|
let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
|
||||||
@@ -626,4 +640,19 @@ mod tests {
|
|||||||
let prev = previous_fire(now, at, Tz::UTC);
|
let prev = previous_fire(now, at, Tz::UTC);
|
||||||
assert_eq!(prev, dt_utc(2026, 5, 24, 23, 30));
|
assert_eq!(prev, dt_utc(2026, 5, 24, 23, 30));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Documents the panic-isolation pattern `run_tick` now relies on:
|
||||||
|
/// `AssertUnwindSafe(...).catch_unwind().await` must yield `Err(_)`
|
||||||
|
/// when the wrapped future panics, so the surrounding loop (or in
|
||||||
|
/// our case, the unconditional advisory-unlock that follows) keeps
|
||||||
|
/// running. The shape of this test mirrors the production callsite.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn assert_unwind_safe_catches_a_panicking_future() {
|
||||||
|
let result = AssertUnwindSafe(async {
|
||||||
|
panic!("boom");
|
||||||
|
})
|
||||||
|
.catch_unwind()
|
||||||
|
.await;
|
||||||
|
assert!(result.is_err(), "panicking future must yield Err");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "mangalord-frontend",
|
"name": "mangalord-frontend",
|
||||||
"version": "0.36.4",
|
"version": "0.36.5",
|
||||||
"private": true,
|
"private": true,
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
|||||||
Reference in New Issue
Block a user