From e2bd1462ba6f03b6b785986deb3baccb570be50b Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sat, 30 May 2026 18:10:51 +0200 Subject: [PATCH] fix(crawler): wrap wait_for_navigation in 30s timeout (0.36.1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A hung TLS handshake or a page that never fires load could wedge a worker (or the cron metadata pass) indefinitely — chromiumoxide imposes no navigation timeout of its own. New crawler::nav::wait_for_nav caps each navigation at NAV_TIMEOUT (30s) and returns a typed NavError so timeouts surface as transient (retryable) errors. Wired at the three navigation sites: - source::target::navigate (catalog/detail/pagination) - content::sync_chapter_content (chapter reader) - session::fetch_probe_html (session probe) Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/Cargo.lock | 2 +- backend/Cargo.toml | 2 +- backend/src/crawler/content.rs | 4 +- backend/src/crawler/mod.rs | 1 + backend/src/crawler/nav.rs | 68 ++++++++++++++++++++++++++++ backend/src/crawler/session.rs | 4 +- backend/src/crawler/source/target.rs | 15 ++++-- frontend/package.json | 2 +- 8 files changed, 90 insertions(+), 8 deletions(-) create mode 100644 backend/src/crawler/nav.rs diff --git a/backend/Cargo.lock b/backend/Cargo.lock index ba42dde..1507bfd 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.36.0" +version = "0.36.1" dependencies = [ "anyhow", "argon2", diff --git a/backend/Cargo.toml b/backend/Cargo.toml index be56adf..69d73ee 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.36.0" +version = "0.36.1" edition = "2021" default-run = "mangalord" diff --git a/backend/src/crawler/content.rs b/backend/src/crawler/content.rs index 1d5706a..c08e727 100644 --- a/backend/src/crawler/content.rs +++ b/backend/src/crawler/content.rs @@ -111,7 +111,9 @@ pub async fn sync_chapter_content( .new_page(source_url) .await .with_context(|| format!("open chapter page {source_url}"))?; - page.wait_for_navigation().await.context("wait for chapter nav")?; + crate::crawler::nav::wait_for_nav(&page) + .await + .context("wait for chapter nav")?; let html = page.content().await.context("read chapter html")?; page.close().await.ok(); diff --git a/backend/src/crawler/mod.rs b/backend/src/crawler/mod.rs index 3e265df..e82e737 100644 --- a/backend/src/crawler/mod.rs +++ b/backend/src/crawler/mod.rs @@ -20,6 +20,7 @@ pub mod daemon; pub mod detect; pub mod diff; pub mod jobs; +pub mod nav; pub mod pipeline; pub mod rate_limit; pub mod safety; diff --git a/backend/src/crawler/nav.rs b/backend/src/crawler/nav.rs new file mode 100644 index 0000000..24ff0e0 --- /dev/null +++ b/backend/src/crawler/nav.rs @@ -0,0 +1,68 @@ +//! Page navigation helpers — wrap `chromiumoxide` `wait_for_navigation` +//! with a timeout so a hung TLS handshake or a page that never fires +//! `load` cannot wedge a worker (or the cron metadata pass) forever. +//! +//! [`NAV_TIMEOUT`] is the global budget. Callers in the crawler use +//! [`wait_for_nav`] to get back a typed error so transient timeouts can +//! be reported separately from underlying CDP errors. + +use std::time::Duration; + +use chromiumoxide::error::CdpError; +use chromiumoxide::Page; +use thiserror::Error; + +/// Maximum wall-clock time we'll wait for a single page navigation. A +/// healthy Chromium reaches `load` in well under a second on the target +/// site; a 30-second cap is generous enough for slow TLS handshakes on +/// the first request after a fresh process while still catching real +/// hangs before they wedge the daemon. +pub const NAV_TIMEOUT: Duration = Duration::from_secs(30); + +/// Outcome of a timed-out navigation. `Timeout` is the transient signal +/// callers translate into a retry-friendly error +/// ([`crate::crawler::detect::PageError::Transient`] in the source path, +/// a context'd anyhow elsewhere). `Cdp` carries the underlying +/// chromiumoxide error unchanged. +#[derive(Debug, Error)] +pub enum NavError { + #[error("navigation timed out after {0:?}")] + Timeout(Duration), + #[error(transparent)] + Cdp(#[from] CdpError), +} + +/// Wait for the page's next navigation to complete, capped at +/// [`NAV_TIMEOUT`]. Replaces bare `page.wait_for_navigation().await` +/// throughout the crawler. +pub async fn wait_for_nav(page: &Page) -> Result<(), NavError> { + match tokio::time::timeout(NAV_TIMEOUT, page.wait_for_navigation()).await { + Err(_elapsed) => Err(NavError::Timeout(NAV_TIMEOUT)), + Ok(Err(e)) => Err(NavError::Cdp(e)), + Ok(Ok(_)) => Ok(()), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::future::pending; + + /// Sanity-check the timeout pattern used by [`wait_for_nav`]: a + /// future that never resolves must yield `Elapsed` within the + /// configured budget. We can't easily stand up a real `Page` in a + /// unit test, so we assert the underlying primitive behaves the way + /// the helper depends on. + #[tokio::test(flavor = "current_thread", start_paused = true)] + async fn timeout_elapses_on_a_future_that_never_resolves() { + let result = + tokio::time::timeout(Duration::from_millis(50), pending::<()>()).await; + assert!(result.is_err(), "expected Elapsed on a hung future"); + } + + #[test] + fn nav_error_timeout_message_includes_duration() { + let e = NavError::Timeout(Duration::from_secs(30)); + assert_eq!(e.to_string(), "navigation timed out after 30s"); + } +} diff --git a/backend/src/crawler/session.rs b/backend/src/crawler/session.rs index 1782c83..d222d43 100644 --- a/backend/src/crawler/session.rs +++ b/backend/src/crawler/session.rs @@ -203,7 +203,9 @@ async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result< .new_page(probe_url) .await .with_context(|| format!("open probe page {probe_url}"))?; - page.wait_for_navigation().await.context("wait for nav on probe")?; + crate::crawler::nav::wait_for_nav(&page) + .await + .context("wait for nav on probe")?; let html = page.content().await.context("read probe html")?; page.close().await.ok(); Ok(html) diff --git a/backend/src/crawler/source/target.rs b/backend/src/crawler/source/target.rs index a608530..426ed3c 100644 --- a/backend/src/crawler/source/target.rs +++ b/backend/src/crawler/source/target.rs @@ -21,6 +21,7 @@ use super::{ use crate::crawler::detect::{ has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError, }; +use crate::crawler::nav::{wait_for_nav, NavError}; /// `sources.id` value for this Source impl. Exposed as a const so the /// daemon can look up per-source state (e.g. the recovery flag) before @@ -216,9 +217,17 @@ async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result {} + Err(NavError::Timeout(_)) => { + page.close().await.ok(); + return Err(PageError::transient("nav timeout")); + } + Err(NavError::Cdp(e)) => { + page.close().await.ok(); + return Err(PageError::Other(anyhow::Error::from(e))); + } + } // Stopgap until we wait on a specific selector per page type — // gives any post-load JS a beat to finish injecting content. tokio::time::sleep(Duration::from_secs(1)).await; diff --git a/frontend/package.json b/frontend/package.json index 6db3d00..436bfac 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "mangalord-frontend", - "version": "0.36.0", + "version": "0.36.1", "private": true, "type": "module", "scripts": {