fix(crawler): wait for page marker instead of fixed 1s sleep (0.36.2)

A chromium snapshot taken between the wrapper-render and row-render
phases let parse_chapter_list return Ok(vec![]) for a manga that
actually has chapters — the soft-drop branch in sync_manga_chapters
then flipped every existing chapter to dropped_at.

Add wait_for_selector to crawler::nav. navigate() now takes a CSS
marker matching the most-specific element the downstream parser will
look for (one of LIST_PAGE_MARKER / DETAIL_PAGE_CHAPTERS_MARKER /
DETAIL_PAGE_LAYOUT_MARKER). The wait is best-effort and capped by
SELECTOR_TIMEOUT (10s); a legitimately empty page can still pass
through because the parser's #chapter_table sentinel and the
universal broken-page body check stay in force.

Same pattern wired at the reader nav (a#pic_container) and probe
nav (#logo), replacing the implicit assumption that the post-load
JS had finished within 1 second.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-30 18:29:38 +02:00
parent e2bd1462ba
commit 8e0b638e3f
7 changed files with 134 additions and 12 deletions

2
backend/Cargo.lock generated
View File

@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "mangalord"
version = "0.36.1"
version = "0.36.2"
dependencies = [
"anyhow",
"argon2",

View File

@@ -1,6 +1,6 @@
[package]
name = "mangalord"
version = "0.36.1"
version = "0.36.2"
edition = "2021"
default-run = "mangalord"

View File

@@ -114,6 +114,16 @@ pub async fn sync_chapter_content(
crate::crawler::nav::wait_for_nav(&page)
.await
.context("wait for chapter nav")?;
// Best-effort wait for the reader marker — same partial-render
// race that bit the chapter-list parser can hit here. Timeout is
// not an error; the chapter probe + parser sentinels still catch
// real failures.
let _ = crate::crawler::nav::wait_for_selector(
&page,
"a#pic_container",
crate::crawler::nav::SELECTOR_TIMEOUT,
)
.await;
let html = page.content().await.context("read chapter html")?;
page.close().await.ok();

View File

@@ -43,6 +43,48 @@ pub async fn wait_for_nav(page: &Page) -> Result<(), NavError> {
}
}
/// Poll interval for [`wait_for_selector`]. 100ms is fast enough that a
/// page rendering in 200ms isn't held back noticeably, and slow enough
/// not to spam CDP with `find_element` calls on a page that's actually
/// taking its time.
const SELECTOR_POLL_INTERVAL: Duration = Duration::from_millis(100);
/// Wait until `selector` matches at least one element on `page`, or
/// `timeout` elapses. Used after a navigation to confirm a page-type-
/// specific marker is in the DOM before parsing — replaces the fixed
/// post-nav sleep that previously masked partial-render races.
///
/// chromiumoxide 0.7.0 has no built-in `wait_for_selector`, so we poll
/// `find_element` at [`SELECTOR_POLL_INTERVAL`] until success or budget
/// exhaustion. A failed `find_element` is *not* an error here — it just
/// means "not yet" — we only surface an error once the overall
/// `timeout` is up.
pub async fn wait_for_selector(
page: &Page,
selector: &str,
timeout: Duration,
) -> Result<(), NavError> {
let deadline = tokio::time::Instant::now() + timeout;
loop {
if page.find_element(selector).await.is_ok() {
return Ok(());
}
if tokio::time::Instant::now() >= deadline {
return Err(NavError::Timeout(timeout));
}
let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
let sleep_for = SELECTOR_POLL_INTERVAL.min(remaining);
tokio::time::sleep(sleep_for).await;
}
}
/// Per-page-type budget for [`wait_for_selector`]. Shorter than
/// [`NAV_TIMEOUT`] because by the time we're waiting on a selector, the
/// page has already responded — we're only absorbing post-load JS
/// finishing its row injection, which on a healthy site takes well
/// under a second.
pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10);
#[cfg(test)]
mod tests {
use super::*;
@@ -65,4 +107,28 @@ mod tests {
let e = NavError::Timeout(Duration::from_secs(30));
assert_eq!(e.to_string(), "navigation timed out after 30s");
}
/// Same sanity check as [`timeout_elapses_on_a_future_that_never_resolves`],
/// but for the [`wait_for_selector`] polling pattern: the loop must
/// surrender on `Elapsed` rather than spinning past the deadline.
#[tokio::test(flavor = "current_thread", start_paused = true)]
async fn selector_polling_pattern_surrenders_at_deadline() {
let timeout = Duration::from_millis(300);
let start = tokio::time::Instant::now();
let deadline = start + timeout;
// Simulate find_element forever returning "not found".
let mut polls = 0u32;
let result: Result<(), NavError> = loop {
polls += 1;
if tokio::time::Instant::now() >= deadline {
break Err(NavError::Timeout(timeout));
}
tokio::time::sleep(SELECTOR_POLL_INTERVAL).await;
};
assert!(matches!(result, Err(NavError::Timeout(_))));
// 300ms / 100ms poll interval ≈ 3 iterations plus the final check
// that breaks out. Allow some slack since the first poll happens
// before any sleep.
assert!(polls >= 3, "expected at least 3 poll iterations, got {polls}");
}
}

View File

@@ -206,6 +206,15 @@ async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<
crate::crawler::nav::wait_for_nav(&page)
.await
.context("wait for nav on probe")?;
// Best-effort wait for the layout marker. Timeout is fine — the
// probe classifier handles a missing `#logo` as Transient anyway,
// and the verify loop retries on Transient.
let _ = crate::crawler::nav::wait_for_selector(
&page,
"#logo",
crate::crawler::nav::SELECTOR_TIMEOUT,
)
.await;
let html = page.content().await.context("read probe html")?;
page.close().await.ok();
Ok(html)

View File

@@ -21,7 +21,7 @@ use super::{
use crate::crawler::detect::{
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
};
use crate::crawler::nav::{wait_for_nav, NavError};
use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT};
/// `sources.id` value for this Source impl. Exposed as a const so the
/// daemon can look up per-source state (e.g. the recovery flag) before
@@ -80,7 +80,9 @@ impl Source for TargetSource {
// page would otherwise abort the whole walk before we've even
// started.
let first_html = retry_on_transient(
|| async { navigate(ctx, self.base_url.as_str()).await },
|| async {
navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
},
PAGE_TRANSIENT_RETRY_ATTEMPTS,
PAGE_TRANSIENT_RETRY_DELAY,
)
@@ -109,7 +111,17 @@ impl Source for TargetSource {
ctx: &FetchContext<'_>,
r: &SourceMangaRef,
) -> anyhow::Result<SourceManga> {
let html = navigate(ctx, r.url.as_str()).await?;
// When we'll parse the chapter table, wait for at least one
// chapter row to appear — that's the marker most sensitive to
// the post-load JS partial-render race. When we won't, fall
// back to the layout-level `#logo` so we still wait for the
// page to settle.
let marker = if self.parse_chapters {
DETAIL_PAGE_CHAPTERS_MARKER
} else {
DETAIL_PAGE_LAYOUT_MARKER
};
let html = navigate(ctx, r.url.as_str(), marker).await?;
// Convert PageError → anyhow::Error via `?`. PageError stays
// downcastable from the wrapped anyhow::Error so the pipeline
// can still recognize Transient via `error.downcast_ref::<PageError>()`.
@@ -177,7 +189,12 @@ impl DiscoverWalk for TargetSourceWalker {
None => {
retry_on_transient(
|| async {
let html = navigate(ctx, self.base_url.as_str()).await?;
let html = navigate(
ctx,
self.base_url.as_str(),
LIST_PAGE_MARKER,
)
.await?;
let doc = scraper::Html::parse_document(&html);
parse_manga_list_from(&doc)
},
@@ -191,7 +208,7 @@ impl DiscoverWalk for TargetSourceWalker {
retry_on_transient(
|| async {
let url = page_url(&self.base_url, page_num);
let html = navigate(ctx, &url).await?;
let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?;
let doc = scraper::Html::parse_document(&html);
parse_manga_list_from(&doc)
},
@@ -205,12 +222,32 @@ impl DiscoverWalk for TargetSourceWalker {
}
}
/// Per-page-type markers used by `navigate`'s post-navigation wait.
/// Each is the most specific element the parser will later look for —
/// waiting on it closes the partial-render race (e.g. `#chapter_table`
/// wrapper present but rows still being injected by post-load JS) that
/// the old fixed 1s sleep masked. See [`navigate`].
const LIST_PAGE_MARKER: &str = "#left_side .pic_list .updatesli";
const DETAIL_PAGE_CHAPTERS_MARKER: &str = "#chapter_table td h4 a.chico";
const DETAIL_PAGE_LAYOUT_MARKER: &str = "#logo";
/// Single point of rate-limited navigation. Every Source request goes
/// through here, so the per-host limiter map is the only knob that
/// controls per-origin RPS. Also the choke point for transient-page
/// detection — every fetched body is screened by
/// [`classify_navigate_html`] before being handed to a selector.
async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError> {
///
/// `marker` is a CSS selector the caller expects to find on the loaded
/// page. The wait is best-effort: a timeout is **not** an error
/// (legitimately-empty pages may never render the marker), it just
/// caps how long we'll hold for post-load JS to finish injecting
/// content. The parser's own sentinels and the universal broken-page
/// body check still catch real failures.
async fn navigate(
ctx: &FetchContext<'_>,
url: &str,
marker: &str,
) -> Result<String, PageError> {
ctx.rate.wait_for(url).await?;
let page = ctx
.browser
@@ -228,9 +265,9 @@ async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError
return Err(PageError::Other(anyhow::Error::from(e)));
}
}
// Stopgap until we wait on a specific selector per page type
// gives any post-load JS a beat to finish injecting content.
tokio::time::sleep(Duration::from_secs(1)).await;
// Best-effort wait for the page-type marker. We deliberately
// discard a timeout here — see fn-level doc.
let _ = wait_for_selector(&page, marker, SELECTOR_TIMEOUT).await;
let html = page
.content()
.await