From 9925f54695eab009ed48599272b349bb29f03f63 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sat, 30 May 2026 20:41:59 +0200 Subject: [PATCH] fix(crawler): narrow browser-dead heuristic to typed downcasts (0.36.7) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit anyhow_looks_browser_dead substring-matched any chain message containing channel / connection / websocket / transport / closed / nav timeout. Real chromium failures hit those words, but so do reqwest TCP-reset errors during CDN image downloads, sqlx pool- timeout errors, and any number of non-browser failures — each of which triggered a wasted chromium relaunch + session-probe re-run against the catalog's rate-limit budget. Drop the substring pass. Walk the chain looking only for typed NavError (flagged via is_likely_browser_dead) or CdpError. Every place we feed a chromium error into anyhow goes through one of those types, so the typed downcasts cover the real cases without the false-positive surface. NavError::is_likely_browser_dead also drops its own substring check on Cdp(e); any CdpError surfacing at the navigation layer means the chromium-facing channel is the failing layer. Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/Cargo.lock | 2 +- backend/Cargo.toml | 2 +- backend/src/crawler/nav.rs | 81 ++++++++++++++++++++++++++------------ frontend/package.json | 2 +- 4 files changed, 59 insertions(+), 28 deletions(-) diff --git a/backend/Cargo.lock b/backend/Cargo.lock index c6ffb28..e3975cc 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.36.6" +version = "0.36.7" dependencies = [ "anyhow", "argon2", diff --git a/backend/Cargo.toml b/backend/Cargo.toml index 6ddd1e5..20e3af9 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.36.6" +version = "0.36.7" edition = "2021" default-run = "mangalord" diff --git a/backend/src/crawler/nav.rs b/backend/src/crawler/nav.rs index 287d22a..516cda4 100644 --- a/backend/src/crawler/nav.rs +++ b/backend/src/crawler/nav.rs @@ -86,29 +86,39 @@ pub async fn wait_for_selector( pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10); impl NavError { - /// Heuristic: does this error look like the underlying Chromium + /// Does this navigation error indicate the underlying Chromium /// process has died or its CDP connection has dropped? Used by the /// dispatcher to decide whether to invalidate the /// [`crate::crawler::browser_manager::BrowserManager`] handle so /// the next acquire re-launches. /// - /// Errs on the side of "yes" — false positives cost one extra - /// Chromium launch, false negatives leave every subsequent worker - /// hitting a zombie process until the idle reaper fires. + /// Both variants count: a `Timeout` past [`NAV_TIMEOUT`] is in + /// practice always either a hung CDP transport or a wedged page + /// the browser can't recover from on its own, and a `Cdp` error + /// surfacing at the navigation layer means the chromium-facing + /// channel is the failing layer. pub fn is_likely_browser_dead(&self) -> bool { match self { Self::Timeout(_) => true, - Self::Cdp(e) => cdp_error_looks_dead(&e.to_string()), + Self::Cdp(_) => true, } } } -/// Walk an `anyhow::Error` chain looking for a [`NavError`] flagged as -/// likely-dead, or a fallback substring match against well-known -/// channel/connection messages in any wrapped error. The substring -/// pass catches `chromiumoxide` errors that surface as the -/// `anyhow::Error::msg` path rather than as a downcastable type -/// (notably the `ChannelSendError` family). +/// Walk an `anyhow::Error` chain looking for typed evidence that the +/// chromium-facing layer is the failing one. Two markers count: +/// +/// 1. A wrapped [`NavError`] flagged by [`NavError::is_likely_browser_dead`]. +/// 2. A wrapped [`CdpError`] (via `anyhow::Error::from(CdpError)` at a +/// `Browser::new_page` call site, or any other direct CDP boundary). +/// +/// Earlier versions also substring-matched the chain for "connection", +/// "closed", "channel", etc. as a fallback. That was too broad — +/// reqwest TCP-reset errors during CDN image downloads, sqlx +/// connection-pool errors, and similar non-browser failures contain +/// those words and triggered spurious chromium relaunches. The typed +/// downcasts cover every place we hand a chromium error to anyhow, +/// so the fallback is unnecessary. pub fn anyhow_looks_browser_dead(err: &anyhow::Error) -> bool { for cause in err.chain() { if let Some(nav) = cause.downcast_ref::() { @@ -116,23 +126,13 @@ pub fn anyhow_looks_browser_dead(err: &anyhow::Error) -> bool { return true; } } - if cdp_error_looks_dead(&cause.to_string()) { + if cause.downcast_ref::().is_some() { return true; } } false } -fn cdp_error_looks_dead(msg: &str) -> bool { - let m = msg.to_ascii_lowercase(); - m.contains("channel") - || m.contains("connection") - || m.contains("websocket") - || m.contains("transport") - || m.contains("closed") - || m.contains("nav timeout") -} - #[cfg(test)] mod tests { use super::*; @@ -172,9 +172,18 @@ mod tests { } #[test] - fn anyhow_with_channel_message_is_flagged() { - let e: anyhow::Error = anyhow::anyhow!("CDP channel send error: closed"); - assert!(anyhow_looks_browser_dead(&e)); + fn anyhow_with_cdp_error_in_chain_is_flagged() { + // `Browser::new_page` errors get wrapped via + // `anyhow::Error::from(CdpError)` at the navigate / dispatch + // call sites. Walking the chain and downcasting to CdpError is + // what catches that path. Any CdpError variant counts; the + // Serde variant is the easiest to construct in a unit test. + let serde_err: serde_json::Error = + serde_json::from_str::("not a number").unwrap_err(); + let cdp = CdpError::Serde(serde_err); + let wrapped: anyhow::Error = + anyhow::Error::from(cdp).context("open chapter page"); + assert!(anyhow_looks_browser_dead(&wrapped)); } #[test] @@ -184,6 +193,28 @@ mod tests { assert!(!anyhow_looks_browser_dead(&e)); } + #[test] + fn anyhow_with_reqwest_style_connection_message_is_not_flagged() { + // Regression: the earlier substring fallback flagged any error + // whose message contained "connection" or "closed" as browser- + // dead. A TCP reset from a CDN during image download, or a + // sqlx pool-connection error, would burn a chromium relaunch + // even though the browser is fine. Typed downcasts only — + // these untyped strings must pass through. + for msg in [ + "error sending request: connection reset by peer", + "PoolTimedOut: timed out waiting for a connection", + "request to https://cdn/x.jpg: connection closed before message completed", + "transport error during image fetch", + ] { + let e: anyhow::Error = anyhow::anyhow!("{msg}"); + assert!( + !anyhow_looks_browser_dead(&e), + "must not flag non-browser error: {msg}" + ); + } + } + /// Same sanity check as [`timeout_elapses_on_a_future_that_never_resolves`], /// but for the [`wait_for_selector`] polling pattern: the loop must /// surrender on `Elapsed` rather than spinning past the deadline. diff --git a/frontend/package.json b/frontend/package.json index 644840e..dbdd054 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "mangalord-frontend", - "version": "0.36.6", + "version": "0.36.7", "private": true, "type": "module", "scripts": {