fix(crawler): panic-isolate the cron tick body (0.36.5)

Worker dispatch was already wrapped in AssertUnwindSafe(...) .catch_unwind() — a panicking handler ack's the job failed and the worker keeps going. The cron tick had no such guard: a panic in metadata.run, enqueue_bookmarked_pending, reap_done, or write_last_tick would kill the cron task. The JoinSet would drop it, workers would keep running, and no future metadata pass would ever fire until daemon restart. Wrap the tick body (between advisory-lock acquire and unlock) in the same AssertUnwindSafe(...).catch_unwind() pattern. The unlock and connection drop run unconditionally so a panicked tick doesn't leave the lock held for another replica. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-30 20:08:11 +02:00
parent 655ea42731
commit 5c04b0532b
4 changed files with 49 additions and 20 deletions
--- a/backend/src/crawler/daemon.rs
+++ b/backend/src/crawler/daemon.rs
@@ -233,23 +233,37 @@ impl CronContext {
            return;
        }

-        match self.metadata.run().await {
-            Ok(stats) => tracing::info!(?stats, "cron: metadata pass done"),
-            Err(e) => tracing::error!(?e, "cron: metadata pass failed"),
-        }
-
-        match pipeline::enqueue_bookmarked_pending(&self.pool).await {
-            Ok(summary) => tracing::info!(?summary, "cron: enqueued bookmarked-pending"),
-            Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"),
-        }
-
-        match jobs::reap_done(&self.pool, self.retention_days).await {
-            Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"),
-            Err(e) => tracing::error!(?e, "cron: done-job reaper failed"),
-        }
-
-        if let Err(e) = write_last_tick(&self.pool, Utc::now()).await {
-            tracing::warn!(?e, "cron: persist last_metadata_tick_at failed");
+        // Panic-isolate the tick body the same way `process_lease` does
+        // for worker dispatch. Without this, a panic in metadata.run
+        // (or any of the follow-on steps) would kill the cron task and
+        // no future tick would ever run — workers would keep going but
+        // no new metadata work would be scheduled until daemon restart.
+        // The advisory unlock below runs unconditionally so a panicked
+        // tick doesn't leave the lock held for another replica.
+        let metadata = &self.metadata;
+        let pool = &self.pool;
+        let retention_days = self.retention_days;
+        let body = async move {
+            match metadata.run().await {
+                Ok(stats) => tracing::info!(?stats, "cron: metadata pass done"),
+                Err(e) => tracing::error!(?e, "cron: metadata pass failed"),
+            }
+            match pipeline::enqueue_bookmarked_pending(pool).await {
+                Ok(summary) => {
+                    tracing::info!(?summary, "cron: enqueued bookmarked-pending");
+                }
+                Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"),
+            }
+            match jobs::reap_done(pool, retention_days).await {
+                Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"),
+                Err(e) => tracing::error!(?e, "cron: done-job reaper failed"),
+            }
+            if let Err(e) = write_last_tick(pool, Utc::now()).await {
+                tracing::warn!(?e, "cron: persist last_metadata_tick_at failed");
+            }
+        };
+        if let Err(_panic) = AssertUnwindSafe(body).catch_unwind().await {
+            tracing::error!("cron: tick body panicked — continuing");
        }

        let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
@@ -626,4 +640,19 @@ mod tests {
        let prev = previous_fire(now, at, Tz::UTC);
        assert_eq!(prev, dt_utc(2026, 5, 24, 23, 30));
    }
+
+    /// Documents the panic-isolation pattern `run_tick` now relies on:
+    /// `AssertUnwindSafe(...).catch_unwind().await` must yield `Err(_)`
+    /// when the wrapped future panics, so the surrounding loop (or in
+    /// our case, the unconditional advisory-unlock that follows) keeps
+    /// running. The shape of this test mirrors the production callsite.
+    #[tokio::test]
+    async fn assert_unwind_safe_catches_a_panicking_future() {
+        let result = AssertUnwindSafe(async {
+            panic!("boom");
+        })
+        .catch_unwind()
+        .await;
+        assert!(result.is_err(), "panicking future must yield Err");
+    }
 }