From 5c04b0532b7f9057a8b3aa85c6aed5308927c6ca Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sat, 30 May 2026 20:08:11 +0200 Subject: [PATCH] fix(crawler): panic-isolate the cron tick body (0.36.5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Worker dispatch was already wrapped in AssertUnwindSafe(...) .catch_unwind() — a panicking handler ack's the job failed and the worker keeps going. The cron tick had no such guard: a panic in metadata.run, enqueue_bookmarked_pending, reap_done, or write_last_tick would kill the cron task. The JoinSet would drop it, workers would keep running, and no future metadata pass would ever fire until daemon restart. Wrap the tick body (between advisory-lock acquire and unlock) in the same AssertUnwindSafe(...).catch_unwind() pattern. The unlock and connection drop run unconditionally so a panicked tick doesn't leave the lock held for another replica. Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/Cargo.lock | 2 +- backend/Cargo.toml | 2 +- backend/src/crawler/daemon.rs | 63 +++++++++++++++++++++++++---------- frontend/package.json | 2 +- 4 files changed, 49 insertions(+), 20 deletions(-) diff --git a/backend/Cargo.lock b/backend/Cargo.lock index acf4894..6250d29 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.36.4" +version = "0.36.5" dependencies = [ "anyhow", "argon2", diff --git a/backend/Cargo.toml b/backend/Cargo.toml index 6b62031..9bb5859 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.36.4" +version = "0.36.5" edition = "2021" default-run = "mangalord" diff --git a/backend/src/crawler/daemon.rs b/backend/src/crawler/daemon.rs index d976745..eb3e512 100644 --- a/backend/src/crawler/daemon.rs +++ b/backend/src/crawler/daemon.rs @@ -233,23 +233,37 @@ impl CronContext { return; } - match self.metadata.run().await { - Ok(stats) => tracing::info!(?stats, "cron: metadata pass done"), - Err(e) => tracing::error!(?e, "cron: metadata pass failed"), - } - - match pipeline::enqueue_bookmarked_pending(&self.pool).await { - Ok(summary) => tracing::info!(?summary, "cron: enqueued bookmarked-pending"), - Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"), - } - - match jobs::reap_done(&self.pool, self.retention_days).await { - Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"), - Err(e) => tracing::error!(?e, "cron: done-job reaper failed"), - } - - if let Err(e) = write_last_tick(&self.pool, Utc::now()).await { - tracing::warn!(?e, "cron: persist last_metadata_tick_at failed"); + // Panic-isolate the tick body the same way `process_lease` does + // for worker dispatch. Without this, a panic in metadata.run + // (or any of the follow-on steps) would kill the cron task and + // no future tick would ever run — workers would keep going but + // no new metadata work would be scheduled until daemon restart. + // The advisory unlock below runs unconditionally so a panicked + // tick doesn't leave the lock held for another replica. + let metadata = &self.metadata; + let pool = &self.pool; + let retention_days = self.retention_days; + let body = async move { + match metadata.run().await { + Ok(stats) => tracing::info!(?stats, "cron: metadata pass done"), + Err(e) => tracing::error!(?e, "cron: metadata pass failed"), + } + match pipeline::enqueue_bookmarked_pending(pool).await { + Ok(summary) => { + tracing::info!(?summary, "cron: enqueued bookmarked-pending"); + } + Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"), + } + match jobs::reap_done(pool, retention_days).await { + Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"), + Err(e) => tracing::error!(?e, "cron: done-job reaper failed"), + } + if let Err(e) = write_last_tick(pool, Utc::now()).await { + tracing::warn!(?e, "cron: persist last_metadata_tick_at failed"); + } + }; + if let Err(_panic) = AssertUnwindSafe(body).catch_unwind().await { + tracing::error!("cron: tick body panicked — continuing"); } let _ = sqlx::query("SELECT pg_advisory_unlock($1)") @@ -626,4 +640,19 @@ mod tests { let prev = previous_fire(now, at, Tz::UTC); assert_eq!(prev, dt_utc(2026, 5, 24, 23, 30)); } + + /// Documents the panic-isolation pattern `run_tick` now relies on: + /// `AssertUnwindSafe(...).catch_unwind().await` must yield `Err(_)` + /// when the wrapped future panics, so the surrounding loop (or in + /// our case, the unconditional advisory-unlock that follows) keeps + /// running. The shape of this test mirrors the production callsite. + #[tokio::test] + async fn assert_unwind_safe_catches_a_panicking_future() { + let result = AssertUnwindSafe(async { + panic!("boom"); + }) + .catch_unwind() + .await; + assert!(result.is_err(), "panicking future must yield Err"); + } } diff --git a/frontend/package.json b/frontend/package.json index 7430943..9898b5b 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "mangalord-frontend", - "version": "0.36.4", + "version": "0.36.5", "private": true, "type": "module", "scripts": {