fix(crawler): relaunch chromium on CDP / nav-timeout errors (0.36.3)
BrowserManager only re-launched chromium when the cached handle was None. A crash mid-pass left the handle Some pointing at a dead process — every subsequent acquire returned the zombie Browser, and every nav cascaded CDP errors until the idle reaper fired. Add BrowserManager::invalidate(): take the inner mutex, drop the handle (closing it if present), and signal the next acquire to relaunch. Idempotent — invalidating an empty handle is a no-op. Wire detection via NavError::is_likely_browser_dead and a chain-walking anyhow_looks_browser_dead helper: substring-match common channel/connection/transport/WebSocket markers and surface NavError::Timeout as "presumed dead." Apply at both error boundaries — RealChapterDispatcher::dispatch and RealMetadataPass::run. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.36.2"
|
version = "0.36.3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"argon2",
|
"argon2",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.36.2"
|
version = "0.36.3"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
default-run = "mangalord"
|
default-run = "mangalord"
|
||||||
|
|
||||||
|
|||||||
@@ -230,7 +230,7 @@ struct RealMetadataPass {
|
|||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl MetadataPass for RealMetadataPass {
|
impl MetadataPass for RealMetadataPass {
|
||||||
async fn run(&self) -> anyhow::Result<MetadataStats> {
|
async fn run(&self) -> anyhow::Result<MetadataStats> {
|
||||||
pipeline::run_metadata_pass(
|
let result = pipeline::run_metadata_pass(
|
||||||
&self.browser_manager,
|
&self.browser_manager,
|
||||||
&self.db,
|
&self.db,
|
||||||
self.storage.as_ref(),
|
self.storage.as_ref(),
|
||||||
@@ -242,7 +242,13 @@ impl MetadataPass for RealMetadataPass {
|
|||||||
&self.download_allowlist,
|
&self.download_allowlist,
|
||||||
self.max_image_bytes,
|
self.max_image_bytes,
|
||||||
)
|
)
|
||||||
.await
|
.await;
|
||||||
|
if let Err(e) = &result {
|
||||||
|
if crate::crawler::nav::anyhow_looks_browser_dead(e) {
|
||||||
|
self.browser_manager.invalidate().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -273,7 +279,7 @@ impl ChapterDispatcher for RealChapterDispatcher {
|
|||||||
return Ok(SyncOutcome::Skipped);
|
return Ok(SyncOutcome::Skipped);
|
||||||
};
|
};
|
||||||
let lease = self.browser_manager.acquire().await?;
|
let lease = self.browser_manager.acquire().await?;
|
||||||
let outcome = content::sync_chapter_content(
|
let result = content::sync_chapter_content(
|
||||||
&lease,
|
&lease,
|
||||||
&self.db,
|
&self.db,
|
||||||
self.storage.as_ref(),
|
self.storage.as_ref(),
|
||||||
@@ -286,9 +292,17 @@ impl ChapterDispatcher for RealChapterDispatcher {
|
|||||||
&self.download_allowlist,
|
&self.download_allowlist,
|
||||||
self.max_image_bytes,
|
self.max_image_bytes,
|
||||||
)
|
)
|
||||||
.await?;
|
.await;
|
||||||
drop(lease);
|
drop(lease);
|
||||||
Ok(outcome)
|
match result {
|
||||||
|
Ok(outcome) => Ok(outcome),
|
||||||
|
Err(e) => {
|
||||||
|
if crate::crawler::nav::anyhow_looks_browser_dead(&e) {
|
||||||
|
self.browser_manager.invalidate().await;
|
||||||
|
}
|
||||||
|
Err(e)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Other payload kinds aren't dispatched by this daemon yet —
|
// Other payload kinds aren't dispatched by this daemon yet —
|
||||||
// SyncManga / SyncChapterList are handled inline by the cron's
|
// SyncManga / SyncChapterList are handled inline by the cron's
|
||||||
|
|||||||
@@ -145,6 +145,28 @@ impl BrowserManager {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Mark the cached browser handle as unhealthy. The next `acquire`
|
||||||
|
/// will re-launch Chromium from scratch.
|
||||||
|
///
|
||||||
|
/// Same semantics as `shutdown` — the difference is intent:
|
||||||
|
/// `shutdown` runs once at daemon teardown, while `invalidate` is a
|
||||||
|
/// recovery hook callers fire after a CDP / connection / navigation
|
||||||
|
/// failure that suggests the underlying process has died. Calling
|
||||||
|
/// this while other workers still hold leases is safe — their
|
||||||
|
/// outstanding CDP operations will return channel-closed errors
|
||||||
|
/// and those workers will then re-acquire (re-launching Chromium).
|
||||||
|
///
|
||||||
|
/// Idempotent: calling on an already-invalidated manager is a
|
||||||
|
/// no-op.
|
||||||
|
pub async fn invalidate(&self) {
|
||||||
|
let mut guard = self.inner.lock().await;
|
||||||
|
guard.shared = None;
|
||||||
|
if let Some(handle) = guard.handle.take() {
|
||||||
|
let _ = handle.close().await;
|
||||||
|
tracing::warn!("BrowserManager: handle invalidated — next acquire will relaunch");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn idle_timeout(&self) -> Duration {
|
fn idle_timeout(&self) -> Duration {
|
||||||
self.idle_timeout
|
self.idle_timeout
|
||||||
}
|
}
|
||||||
@@ -231,6 +253,23 @@ mod tests {
|
|||||||
assert_send_sync(&h);
|
assert_send_sync(&h);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Invalidate is the only `BrowserManager` method that's safe to
|
||||||
|
/// exercise in a unit test without launching Chromium — it's a
|
||||||
|
/// no-op when no handle has been cached, and that path is exactly
|
||||||
|
/// the one we want to verify is idempotent.
|
||||||
|
#[tokio::test]
|
||||||
|
async fn invalidate_is_a_noop_when_no_handle_cached() {
|
||||||
|
let mgr = BrowserManager::new(
|
||||||
|
crate::crawler::browser::LaunchOptions::default(),
|
||||||
|
Duration::ZERO,
|
||||||
|
noop_on_launch(),
|
||||||
|
);
|
||||||
|
// Two back-to-back invalidates must both complete; the second
|
||||||
|
// would hang or panic if the first had left torn state.
|
||||||
|
mgr.invalidate().await;
|
||||||
|
mgr.invalidate().await;
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn active_tracker_signals_idle_only_on_zero_transition() {
|
async fn active_tracker_signals_idle_only_on_zero_transition() {
|
||||||
let tracker = ActiveTracker::new();
|
let tracker = ActiveTracker::new();
|
||||||
|
|||||||
@@ -85,6 +85,54 @@ pub async fn wait_for_selector(
|
|||||||
/// under a second.
|
/// under a second.
|
||||||
pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10);
|
pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10);
|
||||||
|
|
||||||
|
impl NavError {
|
||||||
|
/// Heuristic: does this error look like the underlying Chromium
|
||||||
|
/// process has died or its CDP connection has dropped? Used by the
|
||||||
|
/// dispatcher to decide whether to invalidate the
|
||||||
|
/// [`crate::crawler::browser_manager::BrowserManager`] handle so
|
||||||
|
/// the next acquire re-launches.
|
||||||
|
///
|
||||||
|
/// Errs on the side of "yes" — false positives cost one extra
|
||||||
|
/// Chromium launch, false negatives leave every subsequent worker
|
||||||
|
/// hitting a zombie process until the idle reaper fires.
|
||||||
|
pub fn is_likely_browser_dead(&self) -> bool {
|
||||||
|
match self {
|
||||||
|
Self::Timeout(_) => true,
|
||||||
|
Self::Cdp(e) => cdp_error_looks_dead(&e.to_string()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Walk an `anyhow::Error` chain looking for a [`NavError`] flagged as
|
||||||
|
/// likely-dead, or a fallback substring match against well-known
|
||||||
|
/// channel/connection messages in any wrapped error. The substring
|
||||||
|
/// pass catches `chromiumoxide` errors that surface as the
|
||||||
|
/// `anyhow::Error::msg` path rather than as a downcastable type
|
||||||
|
/// (notably the `ChannelSendError` family).
|
||||||
|
pub fn anyhow_looks_browser_dead(err: &anyhow::Error) -> bool {
|
||||||
|
for cause in err.chain() {
|
||||||
|
if let Some(nav) = cause.downcast_ref::<NavError>() {
|
||||||
|
if nav.is_likely_browser_dead() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if cdp_error_looks_dead(&cause.to_string()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
fn cdp_error_looks_dead(msg: &str) -> bool {
|
||||||
|
let m = msg.to_ascii_lowercase();
|
||||||
|
m.contains("channel")
|
||||||
|
|| m.contains("connection")
|
||||||
|
|| m.contains("websocket")
|
||||||
|
|| m.contains("transport")
|
||||||
|
|| m.contains("closed")
|
||||||
|
|| m.contains("nav timeout")
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
@@ -108,6 +156,34 @@ mod tests {
|
|||||||
assert_eq!(e.to_string(), "navigation timed out after 30s");
|
assert_eq!(e.to_string(), "navigation timed out after 30s");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn timeout_is_treated_as_likely_browser_dead() {
|
||||||
|
let e = NavError::Timeout(NAV_TIMEOUT);
|
||||||
|
assert!(e.is_likely_browser_dead());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn anyhow_with_nav_timeout_in_chain_is_flagged() {
|
||||||
|
let inner: Result<(), NavError> = Err(NavError::Timeout(NAV_TIMEOUT));
|
||||||
|
let outer = inner.unwrap_err();
|
||||||
|
let wrapped: anyhow::Error =
|
||||||
|
anyhow::Error::new(outer).context("wait for chapter nav");
|
||||||
|
assert!(anyhow_looks_browser_dead(&wrapped));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn anyhow_with_channel_message_is_flagged() {
|
||||||
|
let e: anyhow::Error = anyhow::anyhow!("CDP channel send error: closed");
|
||||||
|
assert!(anyhow_looks_browser_dead(&e));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn anyhow_with_innocuous_parse_error_is_not_flagged() {
|
||||||
|
let e: anyhow::Error =
|
||||||
|
anyhow::anyhow!("parse manga detail: chapter row regex did not match");
|
||||||
|
assert!(!anyhow_looks_browser_dead(&e));
|
||||||
|
}
|
||||||
|
|
||||||
/// Same sanity check as [`timeout_elapses_on_a_future_that_never_resolves`],
|
/// Same sanity check as [`timeout_elapses_on_a_future_that_never_resolves`],
|
||||||
/// but for the [`wait_for_selector`] polling pattern: the loop must
|
/// but for the [`wait_for_selector`] polling pattern: the loop must
|
||||||
/// surrender on `Elapsed` rather than spinning past the deadline.
|
/// surrender on `Elapsed` rather than spinning past the deadline.
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "mangalord-frontend",
|
"name": "mangalord-frontend",
|
||||||
"version": "0.36.2",
|
"version": "0.36.3",
|
||||||
"private": true,
|
"private": true,
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
|||||||
Reference in New Issue
Block a user