fix(crawler): wrap wait_for_navigation in 30s timeout (0.36.1)

A hung TLS handshake or a page that never fires its load event could
wedge a worker (or the cron metadata pass) indefinitely, because
chromiumoxide imposes no navigation timeout of its own.

New crawler::nav::wait_for_nav caps each navigation at NAV_TIMEOUT
(30s) and returns a typed NavError so timeouts surface as transient
(retryable) errors. Wired at the three navigation sites:
- source::target::navigate (catalog/detail/pagination)
- content::sync_chapter_content (chapter reader)
- session::fetch_probe_html (session probe)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-30 18:10:51 +02:00
parent 9f56f283d4
commit e2bd1462ba
8 changed files with 90 additions and 8 deletions

View File

@@ -21,6 +21,7 @@ use super::{
use crate::crawler::detect::{
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
};
use crate::crawler::nav::{wait_for_nav, NavError};
/// `sources.id` value for this Source impl. Exposed as a const so the
/// daemon can look up per-source state (e.g. the recovery flag) before
@@ -216,9 +217,17 @@ async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError
.new_page(url)
.await
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
page.wait_for_navigation()
.await
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
match wait_for_nav(&page).await {
Ok(()) => {}
Err(NavError::Timeout(_)) => {
page.close().await.ok();
return Err(PageError::transient("nav timeout"));
}
Err(NavError::Cdp(e)) => {
page.close().await.ok();
return Err(PageError::Other(anyhow::Error::from(e)));
}
}
// Stopgap until we wait on a specific selector per page type —
// gives any post-load JS a beat to finish injecting content.
tokio::time::sleep(Duration::from_secs(1)).await;