fix(crawler): wrap wait_for_navigation in 30s timeout (0.36.1)
A hung TLS handshake or a page that never fires load could wedge a worker (or the cron metadata pass) indefinitely — chromiumoxide imposes no navigation timeout of its own. New crawler::nav::wait_for_nav caps each navigation at NAV_TIMEOUT (30s) and returns a typed NavError so timeouts surface as transient (retryable) errors. Wired at the three navigation sites: - source::target::navigate (catalog/detail/pagination) - content::sync_chapter_content (chapter reader) - session::fetch_probe_html (session probe) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -111,7 +111,9 @@ pub async fn sync_chapter_content(
|
||||
.new_page(source_url)
|
||||
.await
|
||||
.with_context(|| format!("open chapter page {source_url}"))?;
|
||||
page.wait_for_navigation().await.context("wait for chapter nav")?;
|
||||
crate::crawler::nav::wait_for_nav(&page)
|
||||
.await
|
||||
.context("wait for chapter nav")?;
|
||||
|
||||
let html = page.content().await.context("read chapter html")?;
|
||||
page.close().await.ok();
|
||||
|
||||
@@ -20,6 +20,7 @@ pub mod daemon;
|
||||
pub mod detect;
|
||||
pub mod diff;
|
||||
pub mod jobs;
|
||||
pub mod nav;
|
||||
pub mod pipeline;
|
||||
pub mod rate_limit;
|
||||
pub mod safety;
|
||||
|
||||
68
backend/src/crawler/nav.rs
Normal file
68
backend/src/crawler/nav.rs
Normal file
@@ -0,0 +1,68 @@
|
||||
//! Page navigation helpers — wrap `chromiumoxide` `wait_for_navigation`
|
||||
//! with a timeout so a hung TLS handshake or a page that never fires
|
||||
//! `load` cannot wedge a worker (or the cron metadata pass) forever.
|
||||
//!
|
||||
//! [`NAV_TIMEOUT`] is the global budget. Callers in the crawler use
|
||||
//! [`wait_for_nav`] to get back a typed error so transient timeouts can
|
||||
//! be reported separately from underlying CDP errors.
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use chromiumoxide::error::CdpError;
|
||||
use chromiumoxide::Page;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Maximum wall-clock time we'll wait for a single page navigation. A
|
||||
/// healthy Chromium reaches `load` in well under a second on the target
|
||||
/// site; a 30-second cap is generous enough for slow TLS handshakes on
|
||||
/// the first request after a fresh process while still catching real
|
||||
/// hangs before they wedge the daemon.
|
||||
pub const NAV_TIMEOUT: Duration = Duration::from_secs(30);
|
||||
|
||||
/// Outcome of a timed-out navigation. `Timeout` is the transient signal
|
||||
/// callers translate into a retry-friendly error
|
||||
/// ([`crate::crawler::detect::PageError::Transient`] in the source path,
|
||||
/// a context'd anyhow elsewhere). `Cdp` carries the underlying
|
||||
/// chromiumoxide error unchanged.
|
||||
#[derive(Debug, Error)]
|
||||
pub enum NavError {
|
||||
#[error("navigation timed out after {0:?}")]
|
||||
Timeout(Duration),
|
||||
#[error(transparent)]
|
||||
Cdp(#[from] CdpError),
|
||||
}
|
||||
|
||||
/// Wait for the page's next navigation to complete, capped at
|
||||
/// [`NAV_TIMEOUT`]. Replaces bare `page.wait_for_navigation().await`
|
||||
/// throughout the crawler.
|
||||
pub async fn wait_for_nav(page: &Page) -> Result<(), NavError> {
|
||||
match tokio::time::timeout(NAV_TIMEOUT, page.wait_for_navigation()).await {
|
||||
Err(_elapsed) => Err(NavError::Timeout(NAV_TIMEOUT)),
|
||||
Ok(Err(e)) => Err(NavError::Cdp(e)),
|
||||
Ok(Ok(_)) => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::future::pending;
|
||||
|
||||
/// Sanity-check the timeout pattern used by [`wait_for_nav`]: a
|
||||
/// future that never resolves must yield `Elapsed` within the
|
||||
/// configured budget. We can't easily stand up a real `Page` in a
|
||||
/// unit test, so we assert the underlying primitive behaves the way
|
||||
/// the helper depends on.
|
||||
#[tokio::test(flavor = "current_thread", start_paused = true)]
|
||||
async fn timeout_elapses_on_a_future_that_never_resolves() {
|
||||
let result =
|
||||
tokio::time::timeout(Duration::from_millis(50), pending::<()>()).await;
|
||||
assert!(result.is_err(), "expected Elapsed on a hung future");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn nav_error_timeout_message_includes_duration() {
|
||||
let e = NavError::Timeout(Duration::from_secs(30));
|
||||
assert_eq!(e.to_string(), "navigation timed out after 30s");
|
||||
}
|
||||
}
|
||||
@@ -203,7 +203,9 @@ async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<
|
||||
.new_page(probe_url)
|
||||
.await
|
||||
.with_context(|| format!("open probe page {probe_url}"))?;
|
||||
page.wait_for_navigation().await.context("wait for nav on probe")?;
|
||||
crate::crawler::nav::wait_for_nav(&page)
|
||||
.await
|
||||
.context("wait for nav on probe")?;
|
||||
let html = page.content().await.context("read probe html")?;
|
||||
page.close().await.ok();
|
||||
Ok(html)
|
||||
|
||||
@@ -21,6 +21,7 @@ use super::{
|
||||
use crate::crawler::detect::{
|
||||
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
|
||||
};
|
||||
use crate::crawler::nav::{wait_for_nav, NavError};
|
||||
|
||||
/// `sources.id` value for this Source impl. Exposed as a const so the
|
||||
/// daemon can look up per-source state (e.g. the recovery flag) before
|
||||
@@ -216,9 +217,17 @@ async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError
|
||||
.new_page(url)
|
||||
.await
|
||||
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
|
||||
page.wait_for_navigation()
|
||||
.await
|
||||
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
|
||||
match wait_for_nav(&page).await {
|
||||
Ok(()) => {}
|
||||
Err(NavError::Timeout(_)) => {
|
||||
page.close().await.ok();
|
||||
return Err(PageError::transient("nav timeout"));
|
||||
}
|
||||
Err(NavError::Cdp(e)) => {
|
||||
page.close().await.ok();
|
||||
return Err(PageError::Other(anyhow::Error::from(e)));
|
||||
}
|
||||
}
|
||||
// Stopgap until we wait on a specific selector per page type —
|
||||
// gives any post-load JS a beat to finish injecting content.
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
|
||||
Reference in New Issue
Block a user