fix(crawler): wait for page marker instead of fixed 1s sleep (0.36.2)
A chromium snapshot taken between the wrapper-render and row-render phases let parse_chapter_list return Ok(vec![]) for a manga that actually has chapters — the soft-drop branch in sync_manga_chapters then flipped every existing chapter to dropped_at. Add wait_for_selector to crawler::nav. navigate() now takes a CSS marker matching the most-specific element the downstream parser will look for (one of LIST_PAGE_MARKER / DETAIL_PAGE_CHAPTERS_MARKER / DETAIL_PAGE_LAYOUT_MARKER). The wait is best-effort and capped by SELECTOR_TIMEOUT (10s); a legitimately empty page can still pass through because the parser's #chapter_table sentinel and the universal broken-page body check stay in force. Same pattern wired at the reader nav (a#pic_container) and probe nav (#logo), replacing the implicit assumption that the post-load JS had finished within 1 second. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -114,6 +114,16 @@ pub async fn sync_chapter_content(
|
||||
crate::crawler::nav::wait_for_nav(&page)
|
||||
.await
|
||||
.context("wait for chapter nav")?;
|
||||
// Best-effort wait for the reader marker — same partial-render
|
||||
// race that bit the chapter-list parser can hit here. Timeout is
|
||||
// not an error; the chapter probe + parser sentinels still catch
|
||||
// real failures.
|
||||
let _ = crate::crawler::nav::wait_for_selector(
|
||||
&page,
|
||||
"a#pic_container",
|
||||
crate::crawler::nav::SELECTOR_TIMEOUT,
|
||||
)
|
||||
.await;
|
||||
|
||||
let html = page.content().await.context("read chapter html")?;
|
||||
page.close().await.ok();
|
||||
|
||||
@@ -43,6 +43,48 @@ pub async fn wait_for_nav(page: &Page) -> Result<(), NavError> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Poll interval for [`wait_for_selector`]. 100ms is fast enough that a
|
||||
/// page rendering in 200ms isn't held back noticeably, and slow enough
|
||||
/// not to spam CDP with `find_element` calls on a page that's actually
|
||||
/// taking its time.
|
||||
const SELECTOR_POLL_INTERVAL: Duration = Duration::from_millis(100);
|
||||
|
||||
/// Wait until `selector` matches at least one element on `page`, or
|
||||
/// `timeout` elapses. Used after a navigation to confirm a page-type-
|
||||
/// specific marker is in the DOM before parsing — replaces the fixed
|
||||
/// post-nav sleep that previously masked partial-render races.
|
||||
///
|
||||
/// chromiumoxide 0.7.0 has no built-in `wait_for_selector`, so we poll
|
||||
/// `find_element` at [`SELECTOR_POLL_INTERVAL`] until success or budget
|
||||
/// exhaustion. A failed `find_element` is *not* an error here — it just
|
||||
/// means "not yet" — we only surface an error once the overall
|
||||
/// `timeout` is up.
|
||||
pub async fn wait_for_selector(
|
||||
page: &Page,
|
||||
selector: &str,
|
||||
timeout: Duration,
|
||||
) -> Result<(), NavError> {
|
||||
let deadline = tokio::time::Instant::now() + timeout;
|
||||
loop {
|
||||
if page.find_element(selector).await.is_ok() {
|
||||
return Ok(());
|
||||
}
|
||||
if tokio::time::Instant::now() >= deadline {
|
||||
return Err(NavError::Timeout(timeout));
|
||||
}
|
||||
let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
|
||||
let sleep_for = SELECTOR_POLL_INTERVAL.min(remaining);
|
||||
tokio::time::sleep(sleep_for).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-page-type budget for [`wait_for_selector`]. Shorter than
|
||||
/// [`NAV_TIMEOUT`] because by the time we're waiting on a selector, the
|
||||
/// page has already responded — we're only absorbing post-load JS
|
||||
/// finishing its row injection, which on a healthy site takes well
|
||||
/// under a second.
|
||||
pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -65,4 +107,28 @@ mod tests {
|
||||
let e = NavError::Timeout(Duration::from_secs(30));
|
||||
assert_eq!(e.to_string(), "navigation timed out after 30s");
|
||||
}
|
||||
|
||||
/// Same sanity check as [`timeout_elapses_on_a_future_that_never_resolves`],
|
||||
/// but for the [`wait_for_selector`] polling pattern: the loop must
|
||||
/// surrender on `Elapsed` rather than spinning past the deadline.
|
||||
#[tokio::test(flavor = "current_thread", start_paused = true)]
|
||||
async fn selector_polling_pattern_surrenders_at_deadline() {
|
||||
let timeout = Duration::from_millis(300);
|
||||
let start = tokio::time::Instant::now();
|
||||
let deadline = start + timeout;
|
||||
// Simulate find_element forever returning "not found".
|
||||
let mut polls = 0u32;
|
||||
let result: Result<(), NavError> = loop {
|
||||
polls += 1;
|
||||
if tokio::time::Instant::now() >= deadline {
|
||||
break Err(NavError::Timeout(timeout));
|
||||
}
|
||||
tokio::time::sleep(SELECTOR_POLL_INTERVAL).await;
|
||||
};
|
||||
assert!(matches!(result, Err(NavError::Timeout(_))));
|
||||
// 300ms / 100ms poll interval ≈ 3 iterations plus the final check
|
||||
// that breaks out. Allow some slack since the first poll happens
|
||||
// before any sleep.
|
||||
assert!(polls >= 3, "expected at least 3 poll iterations, got {polls}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -206,6 +206,15 @@ async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<
|
||||
crate::crawler::nav::wait_for_nav(&page)
|
||||
.await
|
||||
.context("wait for nav on probe")?;
|
||||
// Best-effort wait for the layout marker. Timeout is fine — the
|
||||
// probe classifier handles a missing `#logo` as Transient anyway,
|
||||
// and the verify loop retries on Transient.
|
||||
let _ = crate::crawler::nav::wait_for_selector(
|
||||
&page,
|
||||
"#logo",
|
||||
crate::crawler::nav::SELECTOR_TIMEOUT,
|
||||
)
|
||||
.await;
|
||||
let html = page.content().await.context("read probe html")?;
|
||||
page.close().await.ok();
|
||||
Ok(html)
|
||||
|
||||
@@ -21,7 +21,7 @@ use super::{
|
||||
use crate::crawler::detect::{
|
||||
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
|
||||
};
|
||||
use crate::crawler::nav::{wait_for_nav, NavError};
|
||||
use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT};
|
||||
|
||||
/// `sources.id` value for this Source impl. Exposed as a const so the
|
||||
/// daemon can look up per-source state (e.g. the recovery flag) before
|
||||
@@ -80,7 +80,9 @@ impl Source for TargetSource {
|
||||
// page would otherwise abort the whole walk before we've even
|
||||
// started.
|
||||
let first_html = retry_on_transient(
|
||||
|| async { navigate(ctx, self.base_url.as_str()).await },
|
||||
|| async {
|
||||
navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
|
||||
},
|
||||
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
||||
PAGE_TRANSIENT_RETRY_DELAY,
|
||||
)
|
||||
@@ -109,7 +111,17 @@ impl Source for TargetSource {
|
||||
ctx: &FetchContext<'_>,
|
||||
r: &SourceMangaRef,
|
||||
) -> anyhow::Result<SourceManga> {
|
||||
let html = navigate(ctx, r.url.as_str()).await?;
|
||||
// When we'll parse the chapter table, wait for at least one
|
||||
// chapter row to appear — that's the marker most sensitive to
|
||||
// the post-load JS partial-render race. When we won't, fall
|
||||
// back to the layout-level `#logo` so we still wait for the
|
||||
// page to settle.
|
||||
let marker = if self.parse_chapters {
|
||||
DETAIL_PAGE_CHAPTERS_MARKER
|
||||
} else {
|
||||
DETAIL_PAGE_LAYOUT_MARKER
|
||||
};
|
||||
let html = navigate(ctx, r.url.as_str(), marker).await?;
|
||||
// Convert PageError → anyhow::Error via `?`. PageError stays
|
||||
// downcastable from the wrapped anyhow::Error so the pipeline
|
||||
// can still recognize Transient via `error.downcast_ref::<PageError>()`.
|
||||
@@ -177,7 +189,12 @@ impl DiscoverWalk for TargetSourceWalker {
|
||||
None => {
|
||||
retry_on_transient(
|
||||
|| async {
|
||||
let html = navigate(ctx, self.base_url.as_str()).await?;
|
||||
let html = navigate(
|
||||
ctx,
|
||||
self.base_url.as_str(),
|
||||
LIST_PAGE_MARKER,
|
||||
)
|
||||
.await?;
|
||||
let doc = scraper::Html::parse_document(&html);
|
||||
parse_manga_list_from(&doc)
|
||||
},
|
||||
@@ -191,7 +208,7 @@ impl DiscoverWalk for TargetSourceWalker {
|
||||
retry_on_transient(
|
||||
|| async {
|
||||
let url = page_url(&self.base_url, page_num);
|
||||
let html = navigate(ctx, &url).await?;
|
||||
let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?;
|
||||
let doc = scraper::Html::parse_document(&html);
|
||||
parse_manga_list_from(&doc)
|
||||
},
|
||||
@@ -205,12 +222,32 @@ impl DiscoverWalk for TargetSourceWalker {
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-page-type markers used by `navigate`'s post-navigation wait.
|
||||
/// Each is the most specific element the parser will later look for —
|
||||
/// waiting on it closes the partial-render race (e.g. `#chapter_table`
|
||||
/// wrapper present but rows still being injected by post-load JS) that
|
||||
/// the old fixed 1s sleep masked. See [`navigate`].
|
||||
const LIST_PAGE_MARKER: &str = "#left_side .pic_list .updatesli";
|
||||
const DETAIL_PAGE_CHAPTERS_MARKER: &str = "#chapter_table td h4 a.chico";
|
||||
const DETAIL_PAGE_LAYOUT_MARKER: &str = "#logo";
|
||||
|
||||
/// Single point of rate-limited navigation. Every Source request goes
|
||||
/// through here, so the per-host limiter map is the only knob that
|
||||
/// controls per-origin RPS. Also the choke point for transient-page
|
||||
/// detection — every fetched body is screened by
|
||||
/// [`classify_navigate_html`] before being handed to a selector.
|
||||
async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError> {
|
||||
///
|
||||
/// `marker` is a CSS selector the caller expects to find on the loaded
|
||||
/// page. The wait is best-effort: a timeout is **not** an error
|
||||
/// (legitimately-empty pages may never render the marker), it just
|
||||
/// caps how long we'll hold for post-load JS to finish injecting
|
||||
/// content. The parser's own sentinels and the universal broken-page
|
||||
/// body check still catch real failures.
|
||||
async fn navigate(
|
||||
ctx: &FetchContext<'_>,
|
||||
url: &str,
|
||||
marker: &str,
|
||||
) -> Result<String, PageError> {
|
||||
ctx.rate.wait_for(url).await?;
|
||||
let page = ctx
|
||||
.browser
|
||||
@@ -228,9 +265,9 @@ async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError
|
||||
return Err(PageError::Other(anyhow::Error::from(e)));
|
||||
}
|
||||
}
|
||||
// Stopgap until we wait on a specific selector per page type —
|
||||
// gives any post-load JS a beat to finish injecting content.
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
// Best-effort wait for the page-type marker. We deliberately
|
||||
// discard a timeout here — see fn-level doc.
|
||||
let _ = wait_for_selector(&page, marker, SELECTOR_TIMEOUT).await;
|
||||
let html = page
|
||||
.content()
|
||||
.await
|
||||
|
||||
Reference in New Issue
Block a user