fix(crawler): wrap wait_for_navigation in 30s timeout (0.36.1)
A hung TLS handshake or a page that never fires load could wedge a worker (or the cron metadata pass) indefinitely — chromiumoxide imposes no navigation timeout of its own.

New crawler::nav::wait_for_nav caps each navigation at NAV_TIMEOUT (30s) and returns a typed NavError so timeouts surface as transient (retryable) errors.

Wired at the three navigation sites:
- source::target::navigate (catalog/detail/pagination)
- content::sync_chapter_content (chapter reader)
- session::fetch_probe_html (session probe)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,7 @@ use super::{
|
||||
use crate::crawler::detect::{
|
||||
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
|
||||
};
|
||||
use crate::crawler::nav::{wait_for_nav, NavError};
|
||||
|
||||
/// `sources.id` value for this Source impl. Exposed as a const so the
|
||||
/// daemon can look up per-source state (e.g. the recovery flag) before
|
||||
@@ -216,9 +217,17 @@ async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError
|
||||
.new_page(url)
|
||||
.await
|
||||
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
|
||||
page.wait_for_navigation()
|
||||
.await
|
||||
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
|
||||
match wait_for_nav(&page).await {
|
||||
Ok(()) => {}
|
||||
Err(NavError::Timeout(_)) => {
|
||||
page.close().await.ok();
|
||||
return Err(PageError::transient("nav timeout"));
|
||||
}
|
||||
Err(NavError::Cdp(e)) => {
|
||||
page.close().await.ok();
|
||||
return Err(PageError::Other(anyhow::Error::from(e)));
|
||||
}
|
||||
}
|
||||
// Stopgap until we wait on a specific selector per page type —
|
||||
// gives any post-load JS a beat to finish injecting content.
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
|
||||
Reference in New Issue
Block a user