fix(crawler): wrap wait_for_navigation in 30s timeout (0.36.1)
A hung TLS handshake or a page that never fires load could wedge a worker (or the cron metadata pass) indefinitely — chromiumoxide imposes no navigation timeout of its own. New crawler::nav::wait_for_nav caps each navigation at NAV_TIMEOUT (30s) and returns a typed NavError so timeouts surface as transient (retryable) errors. Wired at the three navigation sites: - source::target::navigate (catalog/detail/pagination) - content::sync_chapter_content (chapter reader) - session::fetch_probe_html (session probe) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.36.0"
|
version = "0.36.1"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"argon2",
|
"argon2",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.36.0"
|
version = "0.36.1"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
default-run = "mangalord"
|
default-run = "mangalord"
|
||||||
|
|
||||||
|
|||||||
@@ -111,7 +111,9 @@ pub async fn sync_chapter_content(
|
|||||||
.new_page(source_url)
|
.new_page(source_url)
|
||||||
.await
|
.await
|
||||||
.with_context(|| format!("open chapter page {source_url}"))?;
|
.with_context(|| format!("open chapter page {source_url}"))?;
|
||||||
page.wait_for_navigation().await.context("wait for chapter nav")?;
|
crate::crawler::nav::wait_for_nav(&page)
|
||||||
|
.await
|
||||||
|
.context("wait for chapter nav")?;
|
||||||
|
|
||||||
let html = page.content().await.context("read chapter html")?;
|
let html = page.content().await.context("read chapter html")?;
|
||||||
page.close().await.ok();
|
page.close().await.ok();
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ pub mod daemon;
|
|||||||
pub mod detect;
|
pub mod detect;
|
||||||
pub mod diff;
|
pub mod diff;
|
||||||
pub mod jobs;
|
pub mod jobs;
|
||||||
|
pub mod nav;
|
||||||
pub mod pipeline;
|
pub mod pipeline;
|
||||||
pub mod rate_limit;
|
pub mod rate_limit;
|
||||||
pub mod safety;
|
pub mod safety;
|
||||||
|
|||||||
68
backend/src/crawler/nav.rs
Normal file
68
backend/src/crawler/nav.rs
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
//! Page navigation helpers — wrap `chromiumoxide` `wait_for_navigation`
|
||||||
|
//! with a timeout so a hung TLS handshake or a page that never fires
|
||||||
|
//! `load` cannot wedge a worker (or the cron metadata pass) forever.
|
||||||
|
//!
|
||||||
|
//! [`NAV_TIMEOUT`] is the global budget. Callers in the crawler use
|
||||||
|
//! [`wait_for_nav`] to get back a typed error so transient timeouts can
|
||||||
|
//! be reported separately from underlying CDP errors.
|
||||||
|
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use chromiumoxide::error::CdpError;
|
||||||
|
use chromiumoxide::Page;
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
/// Maximum wall-clock time we'll wait for a single page navigation. A
|
||||||
|
/// healthy Chromium reaches `load` in well under a second on the target
|
||||||
|
/// site; a 30-second cap is generous enough for slow TLS handshakes on
|
||||||
|
/// the first request after a fresh process while still catching real
|
||||||
|
/// hangs before they wedge the daemon.
|
||||||
|
pub const NAV_TIMEOUT: Duration = Duration::from_secs(30);
|
||||||
|
|
||||||
|
/// Outcome of a timed-out navigation. `Timeout` is the transient signal
|
||||||
|
/// callers translate into a retry-friendly error
|
||||||
|
/// ([`crate::crawler::detect::PageError::Transient`] in the source path,
|
||||||
|
/// a context'd anyhow elsewhere). `Cdp` carries the underlying
|
||||||
|
/// chromiumoxide error unchanged.
|
||||||
|
#[derive(Debug, Error)]
|
||||||
|
pub enum NavError {
|
||||||
|
#[error("navigation timed out after {0:?}")]
|
||||||
|
Timeout(Duration),
|
||||||
|
#[error(transparent)]
|
||||||
|
Cdp(#[from] CdpError),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Wait for the page's next navigation to complete, capped at
|
||||||
|
/// [`NAV_TIMEOUT`]. Replaces bare `page.wait_for_navigation().await`
|
||||||
|
/// throughout the crawler.
|
||||||
|
pub async fn wait_for_nav(page: &Page) -> Result<(), NavError> {
|
||||||
|
match tokio::time::timeout(NAV_TIMEOUT, page.wait_for_navigation()).await {
|
||||||
|
Err(_elapsed) => Err(NavError::Timeout(NAV_TIMEOUT)),
|
||||||
|
Ok(Err(e)) => Err(NavError::Cdp(e)),
|
||||||
|
Ok(Ok(_)) => Ok(()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use std::future::pending;
|
||||||
|
|
||||||
|
/// Sanity-check the timeout pattern used by [`wait_for_nav`]: a
|
||||||
|
/// future that never resolves must yield `Elapsed` within the
|
||||||
|
/// configured budget. We can't easily stand up a real `Page` in a
|
||||||
|
/// unit test, so we assert the underlying primitive behaves the way
|
||||||
|
/// the helper depends on.
|
||||||
|
#[tokio::test(flavor = "current_thread", start_paused = true)]
|
||||||
|
async fn timeout_elapses_on_a_future_that_never_resolves() {
|
||||||
|
let result =
|
||||||
|
tokio::time::timeout(Duration::from_millis(50), pending::<()>()).await;
|
||||||
|
assert!(result.is_err(), "expected Elapsed on a hung future");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn nav_error_timeout_message_includes_duration() {
|
||||||
|
let e = NavError::Timeout(Duration::from_secs(30));
|
||||||
|
assert_eq!(e.to_string(), "navigation timed out after 30s");
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -203,7 +203,9 @@ async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<
|
|||||||
.new_page(probe_url)
|
.new_page(probe_url)
|
||||||
.await
|
.await
|
||||||
.with_context(|| format!("open probe page {probe_url}"))?;
|
.with_context(|| format!("open probe page {probe_url}"))?;
|
||||||
page.wait_for_navigation().await.context("wait for nav on probe")?;
|
crate::crawler::nav::wait_for_nav(&page)
|
||||||
|
.await
|
||||||
|
.context("wait for nav on probe")?;
|
||||||
let html = page.content().await.context("read probe html")?;
|
let html = page.content().await.context("read probe html")?;
|
||||||
page.close().await.ok();
|
page.close().await.ok();
|
||||||
Ok(html)
|
Ok(html)
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ use super::{
|
|||||||
use crate::crawler::detect::{
|
use crate::crawler::detect::{
|
||||||
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
|
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
|
||||||
};
|
};
|
||||||
|
use crate::crawler::nav::{wait_for_nav, NavError};
|
||||||
|
|
||||||
/// `sources.id` value for this Source impl. Exposed as a const so the
|
/// `sources.id` value for this Source impl. Exposed as a const so the
|
||||||
/// daemon can look up per-source state (e.g. the recovery flag) before
|
/// daemon can look up per-source state (e.g. the recovery flag) before
|
||||||
@@ -216,9 +217,17 @@ async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError
|
|||||||
.new_page(url)
|
.new_page(url)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
|
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
|
||||||
page.wait_for_navigation()
|
match wait_for_nav(&page).await {
|
||||||
.await
|
Ok(()) => {}
|
||||||
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
|
Err(NavError::Timeout(_)) => {
|
||||||
|
page.close().await.ok();
|
||||||
|
return Err(PageError::transient("nav timeout"));
|
||||||
|
}
|
||||||
|
Err(NavError::Cdp(e)) => {
|
||||||
|
page.close().await.ok();
|
||||||
|
return Err(PageError::Other(anyhow::Error::from(e)));
|
||||||
|
}
|
||||||
|
}
|
||||||
// Stopgap until we wait on a specific selector per page type —
|
// Stopgap until we wait on a specific selector per page type —
|
||||||
// gives any post-load JS a beat to finish injecting content.
|
// gives any post-load JS a beat to finish injecting content.
|
||||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "mangalord-frontend",
|
"name": "mangalord-frontend",
|
||||||
"version": "0.36.0",
|
"version": "0.36.1",
|
||||||
"private": true,
|
"private": true,
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
|||||||
Reference in New Issue
Block a user