fix(crawler): relaunch chromium on CDP / nav-timeout errors (0.36.3)
BrowserManager only re-launched chromium when the cached handle was None. A crash mid-pass left the handle Some pointing at a dead process — every subsequent acquire returned the zombie Browser, and every nav cascaded CDP errors until the idle reaper fired. Add BrowserManager::invalidate(): take the inner mutex, drop the handle (closing it if present), and signal the next acquire to relaunch. Idempotent — invalidating an empty handle is a no-op. Wire detection via NavError::is_likely_browser_dead and a chain-walking anyhow_looks_browser_dead helper: substring-match common channel/connection/transport/WebSocket markers and surface NavError::Timeout as "presumed dead." Apply at both error boundaries — RealChapterDispatcher::dispatch and RealMetadataPass::run. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||
|
||||
[[package]]
|
||||
name = "mangalord"
|
||||
version = "0.36.2"
|
||||
version = "0.36.3"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"argon2",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "mangalord"
|
||||
version = "0.36.2"
|
||||
version = "0.36.3"
|
||||
edition = "2021"
|
||||
default-run = "mangalord"
|
||||
|
||||
|
||||
@@ -230,7 +230,7 @@ struct RealMetadataPass {
|
||||
#[async_trait]
|
||||
impl MetadataPass for RealMetadataPass {
|
||||
async fn run(&self) -> anyhow::Result<MetadataStats> {
|
||||
pipeline::run_metadata_pass(
|
||||
let result = pipeline::run_metadata_pass(
|
||||
&self.browser_manager,
|
||||
&self.db,
|
||||
self.storage.as_ref(),
|
||||
@@ -242,7 +242,13 @@ impl MetadataPass for RealMetadataPass {
|
||||
&self.download_allowlist,
|
||||
self.max_image_bytes,
|
||||
)
|
||||
.await
|
||||
.await;
|
||||
if let Err(e) = &result {
|
||||
if crate::crawler::nav::anyhow_looks_browser_dead(e) {
|
||||
self.browser_manager.invalidate().await;
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
@@ -273,7 +279,7 @@ impl ChapterDispatcher for RealChapterDispatcher {
|
||||
return Ok(SyncOutcome::Skipped);
|
||||
};
|
||||
let lease = self.browser_manager.acquire().await?;
|
||||
let outcome = content::sync_chapter_content(
|
||||
let result = content::sync_chapter_content(
|
||||
&lease,
|
||||
&self.db,
|
||||
self.storage.as_ref(),
|
||||
@@ -286,9 +292,17 @@ impl ChapterDispatcher for RealChapterDispatcher {
|
||||
&self.download_allowlist,
|
||||
self.max_image_bytes,
|
||||
)
|
||||
.await?;
|
||||
.await;
|
||||
drop(lease);
|
||||
Ok(outcome)
|
||||
match result {
|
||||
Ok(outcome) => Ok(outcome),
|
||||
Err(e) => {
|
||||
if crate::crawler::nav::anyhow_looks_browser_dead(&e) {
|
||||
self.browser_manager.invalidate().await;
|
||||
}
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
// Other payload kinds aren't dispatched by this daemon yet —
|
||||
// SyncManga / SyncChapterList are handled inline by the cron's
|
||||
|
||||
@@ -145,6 +145,28 @@ impl BrowserManager {
|
||||
}
|
||||
}
|
||||
|
||||
/// Mark the cached browser handle as unhealthy. The next `acquire`
|
||||
/// will re-launch Chromium from scratch.
|
||||
///
|
||||
/// Same semantics as `shutdown` — the difference is intent:
|
||||
/// `shutdown` runs once at daemon teardown, while `invalidate` is a
|
||||
/// recovery hook callers fire after a CDP / connection / navigation
|
||||
/// failure that suggests the underlying process has died. Calling
|
||||
/// this while other workers still hold leases is safe — their
|
||||
/// outstanding CDP operations will return channel-closed errors
|
||||
/// and those workers will then re-acquire (re-launching Chromium).
|
||||
///
|
||||
/// Idempotent: calling on an already-invalidated manager is a
|
||||
/// no-op.
|
||||
pub async fn invalidate(&self) {
|
||||
let mut guard = self.inner.lock().await;
|
||||
guard.shared = None;
|
||||
if let Some(handle) = guard.handle.take() {
|
||||
let _ = handle.close().await;
|
||||
tracing::warn!("BrowserManager: handle invalidated — next acquire will relaunch");
|
||||
}
|
||||
}
|
||||
|
||||
fn idle_timeout(&self) -> Duration {
|
||||
self.idle_timeout
|
||||
}
|
||||
@@ -231,6 +253,23 @@ mod tests {
|
||||
assert_send_sync(&h);
|
||||
}
|
||||
|
||||
/// Invalidate is the only `BrowserManager` method that's safe to
|
||||
/// exercise in a unit test without launching Chromium — it's a
|
||||
/// no-op when no handle has been cached, and that path is exactly
|
||||
/// the one we want to verify is idempotent.
|
||||
#[tokio::test]
|
||||
async fn invalidate_is_a_noop_when_no_handle_cached() {
|
||||
let mgr = BrowserManager::new(
|
||||
crate::crawler::browser::LaunchOptions::default(),
|
||||
Duration::ZERO,
|
||||
noop_on_launch(),
|
||||
);
|
||||
// Two back-to-back invalidates must both complete; the second
|
||||
// would hang or panic if the first had left torn state.
|
||||
mgr.invalidate().await;
|
||||
mgr.invalidate().await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn active_tracker_signals_idle_only_on_zero_transition() {
|
||||
let tracker = ActiveTracker::new();
|
||||
|
||||
@@ -85,6 +85,54 @@ pub async fn wait_for_selector(
|
||||
/// under a second.
|
||||
pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
impl NavError {
|
||||
/// Heuristic: does this error look like the underlying Chromium
|
||||
/// process has died or its CDP connection has dropped? Used by the
|
||||
/// dispatcher to decide whether to invalidate the
|
||||
/// [`crate::crawler::browser_manager::BrowserManager`] handle so
|
||||
/// the next acquire re-launches.
|
||||
///
|
||||
/// Errs on the side of "yes" — false positives cost one extra
|
||||
/// Chromium launch, false negatives leave every subsequent worker
|
||||
/// hitting a zombie process until the idle reaper fires.
|
||||
pub fn is_likely_browser_dead(&self) -> bool {
|
||||
match self {
|
||||
Self::Timeout(_) => true,
|
||||
Self::Cdp(e) => cdp_error_looks_dead(&e.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Walk an `anyhow::Error` chain looking for a [`NavError`] flagged as
|
||||
/// likely-dead, or a fallback substring match against well-known
|
||||
/// channel/connection messages in any wrapped error. The substring
|
||||
/// pass catches `chromiumoxide` errors that surface as the
|
||||
/// `anyhow::Error::msg` path rather than as a downcastable type
|
||||
/// (notably the `ChannelSendError` family).
|
||||
pub fn anyhow_looks_browser_dead(err: &anyhow::Error) -> bool {
|
||||
for cause in err.chain() {
|
||||
if let Some(nav) = cause.downcast_ref::<NavError>() {
|
||||
if nav.is_likely_browser_dead() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if cdp_error_looks_dead(&cause.to_string()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn cdp_error_looks_dead(msg: &str) -> bool {
|
||||
let m = msg.to_ascii_lowercase();
|
||||
m.contains("channel")
|
||||
|| m.contains("connection")
|
||||
|| m.contains("websocket")
|
||||
|| m.contains("transport")
|
||||
|| m.contains("closed")
|
||||
|| m.contains("nav timeout")
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -108,6 +156,34 @@ mod tests {
|
||||
assert_eq!(e.to_string(), "navigation timed out after 30s");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn timeout_is_treated_as_likely_browser_dead() {
|
||||
let e = NavError::Timeout(NAV_TIMEOUT);
|
||||
assert!(e.is_likely_browser_dead());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn anyhow_with_nav_timeout_in_chain_is_flagged() {
|
||||
let inner: Result<(), NavError> = Err(NavError::Timeout(NAV_TIMEOUT));
|
||||
let outer = inner.unwrap_err();
|
||||
let wrapped: anyhow::Error =
|
||||
anyhow::Error::new(outer).context("wait for chapter nav");
|
||||
assert!(anyhow_looks_browser_dead(&wrapped));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn anyhow_with_channel_message_is_flagged() {
|
||||
let e: anyhow::Error = anyhow::anyhow!("CDP channel send error: closed");
|
||||
assert!(anyhow_looks_browser_dead(&e));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn anyhow_with_innocuous_parse_error_is_not_flagged() {
|
||||
let e: anyhow::Error =
|
||||
anyhow::anyhow!("parse manga detail: chapter row regex did not match");
|
||||
assert!(!anyhow_looks_browser_dead(&e));
|
||||
}
|
||||
|
||||
/// Same sanity check as [`timeout_elapses_on_a_future_that_never_resolves`],
|
||||
/// but for the [`wait_for_selector`] polling pattern: the loop must
|
||||
/// surrender on `Elapsed` rather than spinning past the deadline.
|
||||
|
||||
Reference in New Issue
Block a user