fix(crawler): wait for page marker instead of fixed 1s sleep (0.36.2)

A Chromium snapshot taken between the wrapper-render and row-render phases let parse_chapter_list return Ok(vec![]) for a manga that actually has chapters — the soft-drop branch in sync_manga_chapters then set dropped_at on every existing chapter. Add wait_for_selector to crawler::nav. navigate() now takes a CSS marker matching the most-specific element the downstream parser will look for (one of LIST_PAGE_MARKER / DETAIL_PAGE_CHAPTERS_MARKER / DETAIL_PAGE_LAYOUT_MARKER). The wait is best-effort and capped by SELECTOR_TIMEOUT (10s); a legitimately empty page can still pass through because the parser's #chapter_table sentinel and the universal broken-page body check stay in force. The same pattern is wired into the reader nav (a#pic_container) and the probe nav (#logo), replacing the implicit assumption that the post-load JS had finished within 1 second. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-30 18:29:38 +02:00
parent e2bd1462ba
commit 8e0b638e3f
7 changed files with 134 additions and 12 deletions
--- a/backend/src/crawler/source/target.rs
+++ b/backend/src/crawler/source/target.rs
@@ -21,7 +21,7 @@ use super::{
 use crate::crawler::detect::{
    has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
 };
-use crate::crawler::nav::{wait_for_nav, NavError};
+use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT};

 /// `sources.id` value for this Source impl. Exposed as a const so the
 /// daemon can look up per-source state (e.g. the recovery flag) before
@@ -80,7 +80,9 @@ impl Source for TargetSource {
        // page would otherwise abort the whole walk before we've even
        // started.
        let first_html = retry_on_transient(
-            || async { navigate(ctx, self.base_url.as_str()).await },
+            || async {
+                navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
+            },
            PAGE_TRANSIENT_RETRY_ATTEMPTS,
            PAGE_TRANSIENT_RETRY_DELAY,
        )
@@ -109,7 +111,17 @@ impl Source for TargetSource {
        ctx: &FetchContext<'_>,
        r: &SourceMangaRef,
    ) -> anyhow::Result<SourceManga> {
-        let html = navigate(ctx, r.url.as_str()).await?;
+        // When we'll parse the chapter table, wait for at least one
+        // chapter row to appear — that's the marker most sensitive to
+        // the post-load JS partial-render race. When we won't, fall
+        // back to the layout-level `#logo` so we still wait for the
+        // page to settle.
+        let marker = if self.parse_chapters {
+            DETAIL_PAGE_CHAPTERS_MARKER
+        } else {
+            DETAIL_PAGE_LAYOUT_MARKER
+        };
+        let html = navigate(ctx, r.url.as_str(), marker).await?;
        // Convert PageError → anyhow::Error via `?`. PageError stays
        // downcastable from the wrapped anyhow::Error so the pipeline
        // can still recognize Transient via `error.downcast_ref::<PageError>()`.
@@ -177,7 +189,12 @@ impl DiscoverWalk for TargetSourceWalker {
                None => {
                    retry_on_transient(
                        || async {
-                            let html = navigate(ctx, self.base_url.as_str()).await?;
+                            let html = navigate(
+                                ctx,
+                                self.base_url.as_str(),
+                                LIST_PAGE_MARKER,
+                            )
+                            .await?;
                            let doc = scraper::Html::parse_document(&html);
                            parse_manga_list_from(&doc)
                        },
@@ -191,7 +208,7 @@ impl DiscoverWalk for TargetSourceWalker {
            retry_on_transient(
                || async {
                    let url = page_url(&self.base_url, page_num);
-                    let html = navigate(ctx, &url).await?;
+                    let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?;
                    let doc = scraper::Html::parse_document(&html);
                    parse_manga_list_from(&doc)
                },
@@ -205,12 +222,32 @@ impl DiscoverWalk for TargetSourceWalker {
    }
 }

+/// Per-page-type markers used by `navigate`'s post-navigation wait.
+/// Each is the most specific element the parser will later look for —
+/// waiting on it closes the partial-render race (e.g. `#chapter_table`
+/// wrapper present but rows still being injected by post-load JS) that
+/// the old fixed 1s sleep masked. See [`navigate`].
+const LIST_PAGE_MARKER: &str = "#left_side .pic_list .updatesli";
+const DETAIL_PAGE_CHAPTERS_MARKER: &str = "#chapter_table td h4 a.chico";
+const DETAIL_PAGE_LAYOUT_MARKER: &str = "#logo";
+
 /// Single point of rate-limited navigation. Every Source request goes
 /// through here, so the per-host limiter map is the only knob that
 /// controls per-origin RPS. Also the choke point for transient-page
 /// detection — every fetched body is screened by
 /// [`classify_navigate_html`] before being handed to a selector.
-async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError> {
+///
+/// `marker` is a CSS selector the caller expects to find on the loaded
+/// page. The wait is best-effort: a timeout is **not** an error
+/// (legitimately-empty pages may never render the marker), it just
+/// caps how long we'll hold for post-load JS to finish injecting
+/// content. The parser's own sentinels and the universal broken-page
+/// body check still catch real failures.
+async fn navigate(
+    ctx: &FetchContext<'_>,
+    url: &str,
+    marker: &str,
+) -> Result<String, PageError> {
    ctx.rate.wait_for(url).await?;
    let page = ctx
        .browser
@@ -228,9 +265,9 @@ async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError
            return Err(PageError::Other(anyhow::Error::from(e)));
        }
    }
-    // Stopgap until we wait on a specific selector per page type —
-    // gives any post-load JS a beat to finish injecting content.
-    tokio::time::sleep(Duration::from_secs(1)).await;
+    // Best-effort wait for the page-type marker. The result is
+    // deliberately discarded, timeout included — see fn-level doc.
+    let _ = wait_for_selector(&page, marker, SELECTOR_TIMEOUT).await;
    let html = page
        .content()
        .await