fix(crawler): narrow browser-dead heuristic to typed downcasts (0.36.7)
anyhow_looks_browser_dead substring-matched any chain message containing channel / connection / websocket / transport / closed / nav timeout. Real chromium failures hit those words, but so do reqwest TCP-reset errors during CDN image downloads, sqlx pool- timeout errors, and any number of non-browser failures — each of which triggered a wasted chromium relaunch + session-probe re-run against the catalog's rate-limit budget. Drop the substring pass. Walk the chain looking only for typed NavError (flagged via is_likely_browser_dead) or CdpError. Every place we feed a chromium error into anyhow goes through one of those types, so the typed downcasts cover the real cases without the false-positive surface. NavError::is_likely_browser_dead also drops its own substring check on Cdp(e); any CdpError surfacing at the navigation layer means the chromium-facing channel is the failing layer. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||
|
||||
[[package]]
|
||||
name = "mangalord"
|
||||
version = "0.36.6"
|
||||
version = "0.36.7"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"argon2",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "mangalord"
|
||||
version = "0.36.6"
|
||||
version = "0.36.7"
|
||||
edition = "2021"
|
||||
default-run = "mangalord"
|
||||
|
||||
|
||||
@@ -86,29 +86,39 @@ pub async fn wait_for_selector(
|
||||
pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
impl NavError {
|
||||
/// Heuristic: does this error look like the underlying Chromium
|
||||
/// Does this navigation error indicate the underlying Chromium
|
||||
/// process has died or its CDP connection has dropped? Used by the
|
||||
/// dispatcher to decide whether to invalidate the
|
||||
/// [`crate::crawler::browser_manager::BrowserManager`] handle so
|
||||
/// the next acquire re-launches.
|
||||
///
|
||||
/// Errs on the side of "yes" — false positives cost one extra
|
||||
/// Chromium launch, false negatives leave every subsequent worker
|
||||
/// hitting a zombie process until the idle reaper fires.
|
||||
/// Both variants count: a `Timeout` past [`NAV_TIMEOUT`] is in
|
||||
/// practice always either a hung CDP transport or a wedged page
|
||||
/// the browser can't recover from on its own, and a `Cdp` error
|
||||
/// surfacing at the navigation layer means the chromium-facing
|
||||
/// channel is the failing layer.
|
||||
pub fn is_likely_browser_dead(&self) -> bool {
|
||||
match self {
|
||||
Self::Timeout(_) => true,
|
||||
Self::Cdp(e) => cdp_error_looks_dead(&e.to_string()),
|
||||
Self::Cdp(_) => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Walk an `anyhow::Error` chain looking for a [`NavError`] flagged as
|
||||
/// likely-dead, or a fallback substring match against well-known
|
||||
/// channel/connection messages in any wrapped error. The substring
|
||||
/// pass catches `chromiumoxide` errors that surface as the
|
||||
/// `anyhow::Error::msg` path rather than as a downcastable type
|
||||
/// (notably the `ChannelSendError` family).
|
||||
/// Walk an `anyhow::Error` chain looking for typed evidence that the
|
||||
/// chromium-facing layer is the failing one. Two markers count:
|
||||
///
|
||||
/// 1. A wrapped [`NavError`] flagged by [`NavError::is_likely_browser_dead`].
|
||||
/// 2. A wrapped [`CdpError`] (via `anyhow::Error::from(CdpError)` at a
|
||||
/// `Browser::new_page` call site, or any other direct CDP boundary).
|
||||
///
|
||||
/// Earlier versions also substring-matched the chain for "connection",
|
||||
/// "closed", "channel", etc. as a fallback. That was too broad —
|
||||
/// reqwest TCP-reset errors during CDN image downloads, sqlx
|
||||
/// connection-pool errors, and similar non-browser failures contain
|
||||
/// those words and triggered spurious chromium relaunches. The typed
|
||||
/// downcasts cover every place we hand a chromium error to anyhow,
|
||||
/// so the fallback is unnecessary.
|
||||
pub fn anyhow_looks_browser_dead(err: &anyhow::Error) -> bool {
|
||||
for cause in err.chain() {
|
||||
if let Some(nav) = cause.downcast_ref::<NavError>() {
|
||||
@@ -116,23 +126,13 @@ pub fn anyhow_looks_browser_dead(err: &anyhow::Error) -> bool {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if cdp_error_looks_dead(&cause.to_string()) {
|
||||
if cause.downcast_ref::<CdpError>().is_some() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn cdp_error_looks_dead(msg: &str) -> bool {
|
||||
let m = msg.to_ascii_lowercase();
|
||||
m.contains("channel")
|
||||
|| m.contains("connection")
|
||||
|| m.contains("websocket")
|
||||
|| m.contains("transport")
|
||||
|| m.contains("closed")
|
||||
|| m.contains("nav timeout")
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -172,9 +172,18 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn anyhow_with_channel_message_is_flagged() {
|
||||
let e: anyhow::Error = anyhow::anyhow!("CDP channel send error: closed");
|
||||
assert!(anyhow_looks_browser_dead(&e));
|
||||
fn anyhow_with_cdp_error_in_chain_is_flagged() {
|
||||
// `Browser::new_page` errors get wrapped via
|
||||
// `anyhow::Error::from(CdpError)` at the navigate / dispatch
|
||||
// call sites. Walking the chain and downcasting to CdpError is
|
||||
// what catches that path. Any CdpError variant counts; the
|
||||
// Serde variant is the easiest to construct in a unit test.
|
||||
let serde_err: serde_json::Error =
|
||||
serde_json::from_str::<i32>("not a number").unwrap_err();
|
||||
let cdp = CdpError::Serde(serde_err);
|
||||
let wrapped: anyhow::Error =
|
||||
anyhow::Error::from(cdp).context("open chapter page");
|
||||
assert!(anyhow_looks_browser_dead(&wrapped));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -184,6 +193,28 @@ mod tests {
|
||||
assert!(!anyhow_looks_browser_dead(&e));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn anyhow_with_reqwest_style_connection_message_is_not_flagged() {
|
||||
// Regression: the earlier substring fallback flagged any error
|
||||
// whose message contained "connection" or "closed" as browser-
|
||||
// dead. A TCP reset from a CDN during image download, or a
|
||||
// sqlx pool-connection error, would burn a chromium relaunch
|
||||
// even though the browser is fine. Typed downcasts only —
|
||||
// these untyped strings must pass through.
|
||||
for msg in [
|
||||
"error sending request: connection reset by peer",
|
||||
"PoolTimedOut: timed out waiting for a connection",
|
||||
"request to https://cdn/x.jpg: connection closed before message completed",
|
||||
"transport error during image fetch",
|
||||
] {
|
||||
let e: anyhow::Error = anyhow::anyhow!("{msg}");
|
||||
assert!(
|
||||
!anyhow_looks_browser_dead(&e),
|
||||
"must not flag non-browser error: {msg}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Same sanity check as [`timeout_elapses_on_a_future_that_never_resolves`],
|
||||
/// but for the [`wait_for_selector`] polling pattern: the loop must
|
||||
/// surrender on `Elapsed` rather than spinning past the deadline.
|
||||
|
||||
Reference in New Issue
Block a user