diff --git a/backend/Cargo.lock b/backend/Cargo.lock index 1507bfd..a7ca47d 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.36.1" +version = "0.36.2" dependencies = [ "anyhow", "argon2", diff --git a/backend/Cargo.toml b/backend/Cargo.toml index 69d73ee..a5dac77 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.36.1" +version = "0.36.2" edition = "2021" default-run = "mangalord" diff --git a/backend/src/crawler/content.rs b/backend/src/crawler/content.rs index c08e727..7843561 100644 --- a/backend/src/crawler/content.rs +++ b/backend/src/crawler/content.rs @@ -114,6 +114,16 @@ pub async fn sync_chapter_content( crate::crawler::nav::wait_for_nav(&page) .await .context("wait for chapter nav")?; + // Best-effort wait for the reader marker — same partial-render + // race that bit the chapter-list parser can hit here. Timeout is + // not an error; the chapter probe + parser sentinels still catch + // real failures. + let _ = crate::crawler::nav::wait_for_selector( + &page, + "a#pic_container", + crate::crawler::nav::SELECTOR_TIMEOUT, + ) + .await; let html = page.content().await.context("read chapter html")?; page.close().await.ok(); diff --git a/backend/src/crawler/nav.rs b/backend/src/crawler/nav.rs index 24ff0e0..e5fcd07 100644 --- a/backend/src/crawler/nav.rs +++ b/backend/src/crawler/nav.rs @@ -43,6 +43,48 @@ pub async fn wait_for_nav(page: &Page) -> Result<(), NavError> { } } +/// Poll interval for [`wait_for_selector`]. 100ms is fast enough that a +/// page rendering in 200ms isn't held back noticeably, and slow enough +/// not to spam CDP with `find_element` calls on a page that's actually +/// taking its time. +const SELECTOR_POLL_INTERVAL: Duration = Duration::from_millis(100); + +/// Wait until `selector` matches at least one element on `page`, or +/// `timeout` elapses. Used after a navigation to confirm a page-type- +/// specific marker is in the DOM before parsing — replaces the fixed +/// post-nav sleep that previously masked partial-render races. +/// +/// chromiumoxide 0.7.0 has no built-in `wait_for_selector`, so we poll +/// `find_element` at [`SELECTOR_POLL_INTERVAL`] until success or budget +/// exhaustion. A failed `find_element` is *not* an error here — it just +/// means "not yet" — we only surface an error once the overall +/// `timeout` is up. +pub async fn wait_for_selector( + page: &Page, + selector: &str, + timeout: Duration, +) -> Result<(), NavError> { + let deadline = tokio::time::Instant::now() + timeout; + loop { + if page.find_element(selector).await.is_ok() { + return Ok(()); + } + if tokio::time::Instant::now() >= deadline { + return Err(NavError::Timeout(timeout)); + } + let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); + let sleep_for = SELECTOR_POLL_INTERVAL.min(remaining); + tokio::time::sleep(sleep_for).await; + } +} + +/// Per-page-type budget for [`wait_for_selector`]. Shorter than +/// [`NAV_TIMEOUT`] because by the time we're waiting on a selector, the +/// page has already responded — we're only absorbing post-load JS +/// finishing its row injection, which on a healthy site takes well +/// under a second. +pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10); + #[cfg(test)] mod tests { use super::*; @@ -65,4 +107,28 @@ mod tests { let e = NavError::Timeout(Duration::from_secs(30)); assert_eq!(e.to_string(), "navigation timed out after 30s"); } + + /// Same sanity check as [`timeout_elapses_on_a_future_that_never_resolves`], + /// but for the [`wait_for_selector`] polling pattern: the loop must + /// surrender on `Elapsed` rather than spinning past the deadline. + #[tokio::test(flavor = "current_thread", start_paused = true)] + async fn selector_polling_pattern_surrenders_at_deadline() { + let timeout = Duration::from_millis(300); + let start = tokio::time::Instant::now(); + let deadline = start + timeout; + // Simulate find_element forever returning "not found". + let mut polls = 0u32; + let result: Result<(), NavError> = loop { + polls += 1; + if tokio::time::Instant::now() >= deadline { + break Err(NavError::Timeout(timeout)); + } + tokio::time::sleep(SELECTOR_POLL_INTERVAL).await; + }; + assert!(matches!(result, Err(NavError::Timeout(_)))); + // 300ms / 100ms poll interval ≈ 3 iterations plus the final check + // that breaks out. Allow some slack since the first poll happens + // before any sleep. + assert!(polls >= 3, "expected at least 3 poll iterations, got {polls}"); + } } diff --git a/backend/src/crawler/session.rs b/backend/src/crawler/session.rs index d222d43..060b4ae 100644 --- a/backend/src/crawler/session.rs +++ b/backend/src/crawler/session.rs @@ -206,6 +206,15 @@ async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result< crate::crawler::nav::wait_for_nav(&page) .await .context("wait for nav on probe")?; + // Best-effort wait for the layout marker. Timeout is fine — the + // probe classifier handles a missing `#logo` as Transient anyway, + // and the verify loop retries on Transient. + let _ = crate::crawler::nav::wait_for_selector( + &page, + "#logo", + crate::crawler::nav::SELECTOR_TIMEOUT, + ) + .await; let html = page.content().await.context("read probe html")?; page.close().await.ok(); Ok(html) diff --git a/backend/src/crawler/source/target.rs b/backend/src/crawler/source/target.rs index 426ed3c..1ae00fc 100644 --- a/backend/src/crawler/source/target.rs +++ b/backend/src/crawler/source/target.rs @@ -21,7 +21,7 @@ use super::{ use crate::crawler::detect::{ has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError, }; -use crate::crawler::nav::{wait_for_nav, NavError}; +use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT}; /// `sources.id` value for this Source impl. Exposed as a const so the /// daemon can look up per-source state (e.g. the recovery flag) before @@ -80,7 +80,9 @@ impl Source for TargetSource { // page would otherwise abort the whole walk before we've even // started. let first_html = retry_on_transient( - || async { navigate(ctx, self.base_url.as_str()).await }, + || async { + navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await + }, PAGE_TRANSIENT_RETRY_ATTEMPTS, PAGE_TRANSIENT_RETRY_DELAY, ) @@ -109,7 +111,17 @@ impl Source for TargetSource { ctx: &FetchContext<'_>, r: &SourceMangaRef, ) -> anyhow::Result { - let html = navigate(ctx, r.url.as_str()).await?; + // When we'll parse the chapter table, wait for at least one + // chapter row to appear — that's the marker most sensitive to + // the post-load JS partial-render race. When we won't, fall + // back to the layout-level `#logo` so we still wait for the + // page to settle. + let marker = if self.parse_chapters { + DETAIL_PAGE_CHAPTERS_MARKER + } else { + DETAIL_PAGE_LAYOUT_MARKER + }; + let html = navigate(ctx, r.url.as_str(), marker).await?; // Convert PageError → anyhow::Error via `?`. PageError stays // downcastable from the wrapped anyhow::Error so the pipeline // can still recognize Transient via `error.downcast_ref::()`. @@ -177,7 +189,12 @@ impl DiscoverWalk for TargetSourceWalker { None => { retry_on_transient( || async { - let html = navigate(ctx, self.base_url.as_str()).await?; + let html = navigate( + ctx, + self.base_url.as_str(), + LIST_PAGE_MARKER, + ) + .await?; let doc = scraper::Html::parse_document(&html); parse_manga_list_from(&doc) }, @@ -191,7 +208,7 @@ impl DiscoverWalk for TargetSourceWalker { retry_on_transient( || async { let url = page_url(&self.base_url, page_num); - let html = navigate(ctx, &url).await?; + let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?; let doc = scraper::Html::parse_document(&html); parse_manga_list_from(&doc) }, @@ -205,12 +222,32 @@ impl DiscoverWalk for TargetSourceWalker { } } +/// Per-page-type markers used by `navigate`'s post-navigation wait. +/// Each is the most specific element the parser will later look for — +/// waiting on it closes the partial-render race (e.g. `#chapter_table` +/// wrapper present but rows still being injected by post-load JS) that +/// the old fixed 1s sleep masked. See [`navigate`]. +const LIST_PAGE_MARKER: &str = "#left_side .pic_list .updatesli"; +const DETAIL_PAGE_CHAPTERS_MARKER: &str = "#chapter_table td h4 a.chico"; +const DETAIL_PAGE_LAYOUT_MARKER: &str = "#logo"; + /// Single point of rate-limited navigation. Every Source request goes /// through here, so the per-host limiter map is the only knob that /// controls per-origin RPS. Also the choke point for transient-page /// detection — every fetched body is screened by /// [`classify_navigate_html`] before being handed to a selector. -async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result { +/// +/// `marker` is a CSS selector the caller expects to find on the loaded +/// page. The wait is best-effort: a timeout is **not** an error +/// (legitimately-empty pages may never render the marker), it just +/// caps how long we'll hold for post-load JS to finish injecting +/// content. The parser's own sentinels and the universal broken-page +/// body check still catch real failures. +async fn navigate( + ctx: &FetchContext<'_>, + url: &str, + marker: &str, +) -> Result { ctx.rate.wait_for(url).await?; let page = ctx .browser @@ -228,9 +265,9 @@ async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result