diff --git a/backend/Cargo.lock b/backend/Cargo.lock index a7ca47d..c14f3de 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.36.2" +version = "0.36.3" dependencies = [ "anyhow", "argon2", diff --git a/backend/Cargo.toml b/backend/Cargo.toml index a5dac77..189c027 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.36.2" +version = "0.36.3" edition = "2021" default-run = "mangalord" diff --git a/backend/src/app.rs b/backend/src/app.rs index b3c5ef2..4eeb2aa 100644 --- a/backend/src/app.rs +++ b/backend/src/app.rs @@ -230,7 +230,7 @@ struct RealMetadataPass { #[async_trait] impl MetadataPass for RealMetadataPass { async fn run(&self) -> anyhow::Result { - pipeline::run_metadata_pass( + let result = pipeline::run_metadata_pass( &self.browser_manager, &self.db, self.storage.as_ref(), @@ -242,7 +242,13 @@ impl MetadataPass for RealMetadataPass { &self.download_allowlist, self.max_image_bytes, ) - .await + .await; + if let Err(e) = &result { + if crate::crawler::nav::anyhow_looks_browser_dead(e) { + self.browser_manager.invalidate().await; + } + } + result } } @@ -273,7 +279,7 @@ impl ChapterDispatcher for RealChapterDispatcher { return Ok(SyncOutcome::Skipped); }; let lease = self.browser_manager.acquire().await?; - let outcome = content::sync_chapter_content( + let result = content::sync_chapter_content( &lease, &self.db, self.storage.as_ref(), @@ -286,9 +292,17 @@ impl ChapterDispatcher for RealChapterDispatcher { &self.download_allowlist, self.max_image_bytes, ) - .await?; + .await; drop(lease); - Ok(outcome) + match result { + Ok(outcome) => Ok(outcome), + Err(e) => { + if crate::crawler::nav::anyhow_looks_browser_dead(&e) { + self.browser_manager.invalidate().await; + } + Err(e) + } + } } // Other payload kinds aren't dispatched by this daemon yet — // SyncManga / SyncChapterList are handled inline by the cron's diff --git a/backend/src/crawler/browser_manager.rs b/backend/src/crawler/browser_manager.rs index 54a411c..c1725e4 100644 --- a/backend/src/crawler/browser_manager.rs +++ b/backend/src/crawler/browser_manager.rs @@ -145,6 +145,28 @@ impl BrowserManager { } } + /// Mark the cached browser handle as unhealthy. The next `acquire` + /// will re-launch Chromium from scratch. + /// + /// Same semantics as `shutdown` — the difference is intent: + /// `shutdown` runs once at daemon teardown, while `invalidate` is a + /// recovery hook callers fire after a CDP / connection / navigation + /// failure that suggests the underlying process has died. Calling + /// this while other workers still hold leases is safe — their + /// outstanding CDP operations will return channel-closed errors + /// and those workers will then re-acquire (re-launching Chromium). + /// + /// Idempotent: calling on an already-invalidated manager is a + /// no-op. + pub async fn invalidate(&self) { + let mut guard = self.inner.lock().await; + guard.shared = None; + if let Some(handle) = guard.handle.take() { + let _ = handle.close().await; + tracing::warn!("BrowserManager: handle invalidated — next acquire will relaunch"); + } + } + fn idle_timeout(&self) -> Duration { self.idle_timeout } @@ -231,6 +253,23 @@ mod tests { assert_send_sync(&h); } + /// Invalidate is the only `BrowserManager` method that's safe to + /// exercise in a unit test without launching Chromium — it's a + /// no-op when no handle has been cached, and that path is exactly + /// the one we want to verify is idempotent. + #[tokio::test] + async fn invalidate_is_a_noop_when_no_handle_cached() { + let mgr = BrowserManager::new( + crate::crawler::browser::LaunchOptions::default(), + Duration::ZERO, + noop_on_launch(), + ); + // Two back-to-back invalidates must both complete; the second + // would hang or panic if the first had left torn state. + mgr.invalidate().await; + mgr.invalidate().await; + } + #[tokio::test] async fn active_tracker_signals_idle_only_on_zero_transition() { let tracker = ActiveTracker::new(); diff --git a/backend/src/crawler/nav.rs b/backend/src/crawler/nav.rs index e5fcd07..287d22a 100644 --- a/backend/src/crawler/nav.rs +++ b/backend/src/crawler/nav.rs @@ -85,6 +85,54 @@ pub async fn wait_for_selector( /// under a second. pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10); +impl NavError { + /// Heuristic: does this error look like the underlying Chromium + /// process has died or its CDP connection has dropped? Used by the + /// dispatcher to decide whether to invalidate the + /// [`crate::crawler::browser_manager::BrowserManager`] handle so + /// the next acquire re-launches. + /// + /// Errs on the side of "yes" — false positives cost one extra + /// Chromium launch, false negatives leave every subsequent worker + /// hitting a zombie process until the idle reaper fires. + pub fn is_likely_browser_dead(&self) -> bool { + match self { + Self::Timeout(_) => true, + Self::Cdp(e) => cdp_error_looks_dead(&e.to_string()), + } + } +} + +/// Walk an `anyhow::Error` chain looking for a [`NavError`] flagged as +/// likely-dead, or a fallback substring match against well-known +/// channel/connection messages in any wrapped error. The substring +/// pass catches `chromiumoxide` errors that surface as the +/// `anyhow::Error::msg` path rather than as a downcastable type +/// (notably the `ChannelSendError` family). +pub fn anyhow_looks_browser_dead(err: &anyhow::Error) -> bool { + for cause in err.chain() { + if let Some(nav) = cause.downcast_ref::() { + if nav.is_likely_browser_dead() { + return true; + } + } + if cdp_error_looks_dead(&cause.to_string()) { + return true; + } + } + false +} + +fn cdp_error_looks_dead(msg: &str) -> bool { + let m = msg.to_ascii_lowercase(); + m.contains("channel") + || m.contains("connection") + || m.contains("websocket") + || m.contains("transport") + || m.contains("closed") + || m.contains("nav timeout") +} + #[cfg(test)] mod tests { use super::*; @@ -108,6 +156,34 @@ mod tests { assert_eq!(e.to_string(), "navigation timed out after 30s"); } + #[test] + fn timeout_is_treated_as_likely_browser_dead() { + let e = NavError::Timeout(NAV_TIMEOUT); + assert!(e.is_likely_browser_dead()); + } + + #[test] + fn anyhow_with_nav_timeout_in_chain_is_flagged() { + let inner: Result<(), NavError> = Err(NavError::Timeout(NAV_TIMEOUT)); + let outer = inner.unwrap_err(); + let wrapped: anyhow::Error = + anyhow::Error::new(outer).context("wait for chapter nav"); + assert!(anyhow_looks_browser_dead(&wrapped)); + } + + #[test] + fn anyhow_with_channel_message_is_flagged() { + let e: anyhow::Error = anyhow::anyhow!("CDP channel send error: closed"); + assert!(anyhow_looks_browser_dead(&e)); + } + + #[test] + fn anyhow_with_innocuous_parse_error_is_not_flagged() { + let e: anyhow::Error = + anyhow::anyhow!("parse manga detail: chapter row regex did not match"); + assert!(!anyhow_looks_browser_dead(&e)); + } + /// Same sanity check as [`timeout_elapses_on_a_future_that_never_resolves`], /// but for the [`wait_for_selector`] polling pattern: the loop must /// surrender on `Elapsed` rather than spinning past the deadline. diff --git a/frontend/package.json b/frontend/package.json index 9950e1a..b210564 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "mangalord-frontend", - "version": "0.36.2", + "version": "0.36.3", "private": true, "type": "module", "scripts": {