fix(crawler): wait for page marker instead of fixed 1s sleep (0.36.2)

A chromium snapshot taken between the wrapper-render and row-render phases let parse_chapter_list return Ok(vec![]) for a manga that actually has chapters — the soft-drop branch in sync_manga_chapters then flipped every existing chapter to dropped_at. Add wait_for_selector to crawler::nav. navigate() now takes a CSS marker matching the most-specific element the downstream parser will look for (one of LIST_PAGE_MARKER / DETAIL_PAGE_CHAPTERS_MARKER / DETAIL_PAGE_LAYOUT_MARKER). The wait is best-effort and capped by SELECTOR_TIMEOUT (10s); a legitimately empty page can still pass through because the parser's #chapter_table sentinel and the universal broken-page body check stay in force. Same pattern wired at the reader nav (a#pic_container) and probe nav (#logo), replacing the implicit assumption that the post-load JS had finished within 1 second. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-30 18:29:38 +02:00
parent e2bd1462ba
commit 8e0b638e3f
7 changed files with 134 additions and 12 deletions
--- a/backend/Cargo.lock
+++ b/backend/Cargo.lock
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"

 [[package]]
 name = "mangalord"
-version = "0.36.1"
+version = "0.36.2"
 dependencies = [
 "anyhow",
 "argon2",
--- a/backend/Cargo.toml
+++ b/backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "mangalord"
-version = "0.36.1"
+version = "0.36.2"
 edition = "2021"
 default-run = "mangalord"

--- a/backend/src/crawler/content.rs
+++ b/backend/src/crawler/content.rs
@@ -114,6 +114,16 @@ pub async fn sync_chapter_content(
    crate::crawler::nav::wait_for_nav(&page)
        .await
        .context("wait for chapter nav")?;
+    // Best-effort wait for the reader marker — same partial-render
+    // race that bit the chapter-list parser can hit here. Timeout is
+    // not an error; the chapter probe + parser sentinels still catch
+    // real failures.
+    let _ = crate::crawler::nav::wait_for_selector(
+        &page,
+        "a#pic_container",
+        crate::crawler::nav::SELECTOR_TIMEOUT,
+    )
+    .await;

    let html = page.content().await.context("read chapter html")?;
    page.close().await.ok();
--- a/backend/src/crawler/nav.rs
+++ b/backend/src/crawler/nav.rs
@@ -43,6 +43,48 @@ pub async fn wait_for_nav(page: &Page) -> Result<(), NavError> {
    }
 }

+/// Poll interval for [`wait_for_selector`]. 100ms is fast enough that a
+/// page rendering in 200ms isn't held back noticeably, and slow enough
+/// not to spam CDP with `find_element` calls on a page that's actually
+/// taking its time.
+const SELECTOR_POLL_INTERVAL: Duration = Duration::from_millis(100);
+
+/// Wait until `selector` matches at least one element on `page`, or
+/// `timeout` elapses. Used after a navigation to confirm a page-type-
+/// specific marker is in the DOM before parsing — replaces the fixed
+/// post-nav sleep that previously masked partial-render races.
+///
+/// chromiumoxide 0.7.0 has no built-in `wait_for_selector`, so we poll
+/// `find_element` at [`SELECTOR_POLL_INTERVAL`] until success or budget
+/// exhaustion. A failed `find_element` is *not* an error here — it just
+/// means "not yet" — we only surface an error once the overall
+/// `timeout` is up.
+pub async fn wait_for_selector(
+    page: &Page,
+    selector: &str,
+    timeout: Duration,
+) -> Result<(), NavError> {
+    let deadline = tokio::time::Instant::now() + timeout;
+    loop {
+        if page.find_element(selector).await.is_ok() {
+            return Ok(());
+        }
+        if tokio::time::Instant::now() >= deadline {
+            return Err(NavError::Timeout(timeout));
+        }
+        let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
+        let sleep_for = SELECTOR_POLL_INTERVAL.min(remaining);
+        tokio::time::sleep(sleep_for).await;
+    }
+}
+
+/// Per-page-type budget for [`wait_for_selector`]. Shorter than
+/// [`NAV_TIMEOUT`] because by the time we're waiting on a selector, the
+/// page has already responded — we're only absorbing post-load JS
+/// finishing its row injection, which on a healthy site takes well
+/// under a second.
+pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10);
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -65,4 +107,28 @@ mod tests {
        let e = NavError::Timeout(Duration::from_secs(30));
        assert_eq!(e.to_string(), "navigation timed out after 30s");
    }
+
+    /// Same sanity check as [`timeout_elapses_on_a_future_that_never_resolves`],
+    /// but for the [`wait_for_selector`] polling pattern: the loop must
+    /// surrender on `Elapsed` rather than spinning past the deadline.
+    #[tokio::test(flavor = "current_thread", start_paused = true)]
+    async fn selector_polling_pattern_surrenders_at_deadline() {
+        let timeout = Duration::from_millis(300);
+        let start = tokio::time::Instant::now();
+        let deadline = start + timeout;
+        // Simulate find_element forever returning "not found".
+        let mut polls = 0u32;
+        let result: Result<(), NavError> = loop {
+            polls += 1;
+            if tokio::time::Instant::now() >= deadline {
+                break Err(NavError::Timeout(timeout));
+            }
+            tokio::time::sleep(SELECTOR_POLL_INTERVAL).await;
+        };
+        assert!(matches!(result, Err(NavError::Timeout(_))));
+        // 300ms / 100ms poll interval ≈ 3 iterations plus the final check
+        // that breaks out. Allow some slack since the first poll happens
+        // before any sleep.
+        assert!(polls >= 3, "expected at least 3 poll iterations, got {polls}");
+    }
 }
--- a/backend/src/crawler/session.rs
+++ b/backend/src/crawler/session.rs
@@ -206,6 +206,15 @@ async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<
    crate::crawler::nav::wait_for_nav(&page)
        .await
        .context("wait for nav on probe")?;
+    // Best-effort wait for the layout marker. Timeout is fine — the
+    // probe classifier handles a missing `#logo` as Transient anyway,
+    // and the verify loop retries on Transient.
+    let _ = crate::crawler::nav::wait_for_selector(
+        &page,
+        "#logo",
+        crate::crawler::nav::SELECTOR_TIMEOUT,
+    )
+    .await;
    let html = page.content().await.context("read probe html")?;
    page.close().await.ok();
    Ok(html)
--- a/backend/src/crawler/source/target.rs
+++ b/backend/src/crawler/source/target.rs
@@ -21,7 +21,7 @@ use super::{
 use crate::crawler::detect::{
    has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
 };
-use crate::crawler::nav::{wait_for_nav, NavError};
+use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT};

 /// `sources.id` value for this Source impl. Exposed as a const so the
 /// daemon can look up per-source state (e.g. the recovery flag) before
@@ -80,7 +80,9 @@ impl Source for TargetSource {
        // page would otherwise abort the whole walk before we've even
        // started.
        let first_html = retry_on_transient(
-            || async { navigate(ctx, self.base_url.as_str()).await },
+            || async {
+                navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
+            },
            PAGE_TRANSIENT_RETRY_ATTEMPTS,
            PAGE_TRANSIENT_RETRY_DELAY,
        )
@@ -109,7 +111,17 @@ impl Source for TargetSource {
        ctx: &FetchContext<'_>,
        r: &SourceMangaRef,
    ) -> anyhow::Result<SourceManga> {
-        let html = navigate(ctx, r.url.as_str()).await?;
+        // When we'll parse the chapter table, wait for at least one
+        // chapter row to appear — that's the marker most sensitive to
+        // the post-load JS partial-render race. When we won't, fall
+        // back to the layout-level `#logo` so we still wait for the
+        // page to settle.
+        let marker = if self.parse_chapters {
+            DETAIL_PAGE_CHAPTERS_MARKER
+        } else {
+            DETAIL_PAGE_LAYOUT_MARKER
+        };
+        let html = navigate(ctx, r.url.as_str(), marker).await?;
        // Convert PageError → anyhow::Error via `?`. PageError stays
        // downcastable from the wrapped anyhow::Error so the pipeline
        // can still recognize Transient via `error.downcast_ref::<PageError>()`.
@@ -177,7 +189,12 @@ impl DiscoverWalk for TargetSourceWalker {
                None => {
                    retry_on_transient(
                        || async {
-                            let html = navigate(ctx, self.base_url.as_str()).await?;
+                            let html = navigate(
+                                ctx,
+                                self.base_url.as_str(),
+                                LIST_PAGE_MARKER,
+                            )
+                            .await?;
                            let doc = scraper::Html::parse_document(&html);
                            parse_manga_list_from(&doc)
                        },
@@ -191,7 +208,7 @@ impl DiscoverWalk for TargetSourceWalker {
            retry_on_transient(
                || async {
                    let url = page_url(&self.base_url, page_num);
-                    let html = navigate(ctx, &url).await?;
+                    let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?;
                    let doc = scraper::Html::parse_document(&html);
                    parse_manga_list_from(&doc)
                },
@@ -205,12 +222,32 @@ impl DiscoverWalk for TargetSourceWalker {
    }
 }

+/// Per-page-type markers used by `navigate`'s post-navigation wait.
+/// Each is the most specific element the parser will later look for —
+/// waiting on it closes the partial-render race (e.g. `#chapter_table`
+/// wrapper present but rows still being injected by post-load JS) that
+/// the old fixed 1s sleep masked. See [`navigate`].
+const LIST_PAGE_MARKER: &str = "#left_side .pic_list .updatesli";
+const DETAIL_PAGE_CHAPTERS_MARKER: &str = "#chapter_table td h4 a.chico";
+const DETAIL_PAGE_LAYOUT_MARKER: &str = "#logo";
+
 /// Single point of rate-limited navigation. Every Source request goes
 /// through here, so the per-host limiter map is the only knob that
 /// controls per-origin RPS. Also the choke point for transient-page
 /// detection — every fetched body is screened by
 /// [`classify_navigate_html`] before being handed to a selector.
-async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError> {
+///
+/// `marker` is a CSS selector the caller expects to find on the loaded
+/// page. The wait is best-effort: a timeout is **not** an error
+/// (legitimately-empty pages may never render the marker), it just
+/// caps how long we'll hold for post-load JS to finish injecting
+/// content. The parser's own sentinels and the universal broken-page
+/// body check still catch real failures.
+async fn navigate(
+    ctx: &FetchContext<'_>,
+    url: &str,
+    marker: &str,
+) -> Result<String, PageError> {
    ctx.rate.wait_for(url).await?;
    let page = ctx
        .browser
@@ -228,9 +265,9 @@ async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError
            return Err(PageError::Other(anyhow::Error::from(e)));
        }
    }
-    // Stopgap until we wait on a specific selector per page type —
-    // gives any post-load JS a beat to finish injecting content.
-    tokio::time::sleep(Duration::from_secs(1)).await;
+    // Best-effort wait for the page-type marker. We deliberately
+    // discard a timeout here — see fn-level doc.
+    let _ = wait_for_selector(&page, marker, SELECTOR_TIMEOUT).await;
    let html = page
        .content()
        .await