fix(crawler): relaunch chromium on CDP / nav-timeout errors (0.36.3)

BrowserManager only re-launched chromium when the cached handle was
None. A crash mid-pass left the handle Some pointing at a dead
process — every subsequent acquire returned the zombie Browser, and
every nav cascaded CDP errors until the idle reaper fired.

Add BrowserManager::invalidate(): take the inner mutex, drop the
handle (closing it if present), and signal the next acquire to
relaunch. Idempotent — invalidating an empty handle is a no-op.

Wire detection via NavError::is_likely_browser_dead and a
chain-walking anyhow_looks_browser_dead helper: substring-match
common channel/connection/transport/WebSocket markers and surface
NavError::Timeout as "presumed dead." Apply at both error
boundaries — RealChapterDispatcher::dispatch and
RealMetadataPass::run.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-30 18:39:19 +02:00
parent 8e0b638e3f
commit 70e8a7895c
6 changed files with 137 additions and 8 deletions

View File

@@ -230,7 +230,7 @@ struct RealMetadataPass {
#[async_trait]
impl MetadataPass for RealMetadataPass {
async fn run(&self) -> anyhow::Result<MetadataStats> {
pipeline::run_metadata_pass(
let result = pipeline::run_metadata_pass(
&self.browser_manager,
&self.db,
self.storage.as_ref(),
@@ -242,7 +242,13 @@ impl MetadataPass for RealMetadataPass {
&self.download_allowlist,
self.max_image_bytes,
)
.await
.await;
if let Err(e) = &result {
if crate::crawler::nav::anyhow_looks_browser_dead(e) {
self.browser_manager.invalidate().await;
}
}
result
}
}
@@ -273,7 +279,7 @@ impl ChapterDispatcher for RealChapterDispatcher {
return Ok(SyncOutcome::Skipped);
};
let lease = self.browser_manager.acquire().await?;
let outcome = content::sync_chapter_content(
let result = content::sync_chapter_content(
&lease,
&self.db,
self.storage.as_ref(),
@@ -286,9 +292,17 @@ impl ChapterDispatcher for RealChapterDispatcher {
&self.download_allowlist,
self.max_image_bytes,
)
.await?;
.await;
drop(lease);
Ok(outcome)
match result {
Ok(outcome) => Ok(outcome),
Err(e) => {
if crate::crawler::nav::anyhow_looks_browser_dead(&e) {
self.browser_manager.invalidate().await;
}
Err(e)
}
}
}
// Other payload kinds aren't dispatched by this daemon yet —
// SyncManga / SyncChapterList are handled inline by the cron's

View File

@@ -145,6 +145,28 @@ impl BrowserManager {
}
}
/// Mark the cached browser handle as unhealthy. The next `acquire`
/// will re-launch Chromium from scratch.
///
/// Same semantics as `shutdown` — the difference is intent:
/// `shutdown` runs once at daemon teardown, while `invalidate` is a
/// recovery hook callers fire after a CDP / connection / navigation
/// failure that suggests the underlying process has died. Calling
/// this while other workers still hold leases is safe — their
/// outstanding CDP operations will return channel-closed errors
/// and those workers will then re-acquire (re-launching Chromium).
///
/// Idempotent: calling on an already-invalidated manager is a
/// no-op.
pub async fn invalidate(&self) {
let mut guard = self.inner.lock().await;
guard.shared = None;
if let Some(handle) = guard.handle.take() {
let _ = handle.close().await;
tracing::warn!("BrowserManager: handle invalidated — next acquire will relaunch");
}
}
fn idle_timeout(&self) -> Duration {
self.idle_timeout
}
@@ -231,6 +253,23 @@ mod tests {
assert_send_sync(&h);
}
/// Invalidate is the only `BrowserManager` method that's safe to
/// exercise in a unit test without launching Chromium — it's a
/// no-op when no handle has been cached, and that path is exactly
/// the one we want to verify is idempotent.
#[tokio::test]
async fn invalidate_is_a_noop_when_no_handle_cached() {
let mgr = BrowserManager::new(
crate::crawler::browser::LaunchOptions::default(),
Duration::ZERO,
noop_on_launch(),
);
// Two back-to-back invalidates must both complete; the second
// would hang or panic if the first had left torn state.
mgr.invalidate().await;
mgr.invalidate().await;
}
#[tokio::test]
async fn active_tracker_signals_idle_only_on_zero_transition() {
let tracker = ActiveTracker::new();

View File

@@ -85,6 +85,54 @@ pub async fn wait_for_selector(
/// under a second.
pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10);
impl NavError {
/// Heuristic: does this error look like the underlying Chromium
/// process has died or its CDP connection has dropped? Used by the
/// dispatcher to decide whether to invalidate the
/// [`crate::crawler::browser_manager::BrowserManager`] handle so
/// the next acquire re-launches.
///
/// Errs on the side of "yes" — false positives cost one extra
/// Chromium launch, false negatives leave every subsequent worker
/// hitting a zombie process until the idle reaper fires.
pub fn is_likely_browser_dead(&self) -> bool {
match self {
Self::Timeout(_) => true,
Self::Cdp(e) => cdp_error_looks_dead(&e.to_string()),
}
}
}
/// Walk an `anyhow::Error` chain looking for a [`NavError`] flagged as
/// likely-dead, or a fallback substring match against well-known
/// channel/connection messages in any wrapped error. The substring
/// pass catches `chromiumoxide` errors that surface as the
/// `anyhow::Error::msg` path rather than as a downcastable type
/// (notably the `ChannelSendError` family).
pub fn anyhow_looks_browser_dead(err: &anyhow::Error) -> bool {
for cause in err.chain() {
if let Some(nav) = cause.downcast_ref::<NavError>() {
if nav.is_likely_browser_dead() {
return true;
}
}
if cdp_error_looks_dead(&cause.to_string()) {
return true;
}
}
false
}
fn cdp_error_looks_dead(msg: &str) -> bool {
let m = msg.to_ascii_lowercase();
m.contains("channel")
|| m.contains("connection")
|| m.contains("websocket")
|| m.contains("transport")
|| m.contains("closed")
|| m.contains("nav timeout")
}
#[cfg(test)]
mod tests {
use super::*;
@@ -108,6 +156,34 @@ mod tests {
assert_eq!(e.to_string(), "navigation timed out after 30s");
}
#[test]
fn timeout_is_treated_as_likely_browser_dead() {
let e = NavError::Timeout(NAV_TIMEOUT);
assert!(e.is_likely_browser_dead());
}
#[test]
fn anyhow_with_nav_timeout_in_chain_is_flagged() {
let inner: Result<(), NavError> = Err(NavError::Timeout(NAV_TIMEOUT));
let outer = inner.unwrap_err();
let wrapped: anyhow::Error =
anyhow::Error::new(outer).context("wait for chapter nav");
assert!(anyhow_looks_browser_dead(&wrapped));
}
#[test]
fn anyhow_with_channel_message_is_flagged() {
let e: anyhow::Error = anyhow::anyhow!("CDP channel send error: closed");
assert!(anyhow_looks_browser_dead(&e));
}
#[test]
fn anyhow_with_innocuous_parse_error_is_not_flagged() {
let e: anyhow::Error =
anyhow::anyhow!("parse manga detail: chapter row regex did not match");
assert!(!anyhow_looks_browser_dead(&e));
}
/// Same sanity check as [`timeout_elapses_on_a_future_that_never_resolves`],
/// but for the [`wait_for_selector`] polling pattern: the loop must
/// surrender on `Elapsed` rather than spinning past the deadline.