fix(crawler): wait for page marker instead of fixed 1s sleep (0.36.2)
A chromium snapshot taken between the wrapper-render and row-render phases let parse_chapter_list return Ok(vec![]) for a manga that actually has chapters — the soft-drop branch in sync_manga_chapters then flipped every existing chapter to dropped_at. Add wait_for_selector to crawler::nav. navigate() now takes a CSS marker matching the most-specific element the downstream parser will look for (one of LIST_PAGE_MARKER / DETAIL_PAGE_CHAPTERS_MARKER / DETAIL_PAGE_LAYOUT_MARKER). The wait is best-effort and capped by SELECTOR_TIMEOUT (10s); a legitimately empty page can still pass through because the parser's #chapter_table sentinel and the universal broken-page body check stay in force. Same pattern wired at the reader nav (a#pic_container) and probe nav (#logo), replacing the implicit assumption that the post-load JS had finished within 1 second. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.36.1"
|
version = "0.36.2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"argon2",
|
"argon2",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.36.1"
|
version = "0.36.2"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
default-run = "mangalord"
|
default-run = "mangalord"
|
||||||
|
|
||||||
|
|||||||
@@ -114,6 +114,16 @@ pub async fn sync_chapter_content(
|
|||||||
crate::crawler::nav::wait_for_nav(&page)
|
crate::crawler::nav::wait_for_nav(&page)
|
||||||
.await
|
.await
|
||||||
.context("wait for chapter nav")?;
|
.context("wait for chapter nav")?;
|
||||||
|
// Best-effort wait for the reader marker — same partial-render
|
||||||
|
// race that bit the chapter-list parser can hit here. Timeout is
|
||||||
|
// not an error; the chapter probe + parser sentinels still catch
|
||||||
|
// real failures.
|
||||||
|
let _ = crate::crawler::nav::wait_for_selector(
|
||||||
|
&page,
|
||||||
|
"a#pic_container",
|
||||||
|
crate::crawler::nav::SELECTOR_TIMEOUT,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
let html = page.content().await.context("read chapter html")?;
|
let html = page.content().await.context("read chapter html")?;
|
||||||
page.close().await.ok();
|
page.close().await.ok();
|
||||||
|
|||||||
@@ -43,6 +43,48 @@ pub async fn wait_for_nav(page: &Page) -> Result<(), NavError> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Poll interval for [`wait_for_selector`]. 100ms is fast enough that a
|
||||||
|
/// page rendering in 200ms isn't held back noticeably, and slow enough
|
||||||
|
/// not to spam CDP with `find_element` calls on a page that's actually
|
||||||
|
/// taking its time.
|
||||||
|
const SELECTOR_POLL_INTERVAL: Duration = Duration::from_millis(100);
|
||||||
|
|
||||||
|
/// Wait until `selector` matches at least one element on `page`, or
|
||||||
|
/// `timeout` elapses. Used after a navigation to confirm a page-type-
|
||||||
|
/// specific marker is in the DOM before parsing — replaces the fixed
|
||||||
|
/// post-nav sleep that previously masked partial-render races.
|
||||||
|
///
|
||||||
|
/// chromiumoxide 0.7.0 has no built-in `wait_for_selector`, so we poll
|
||||||
|
/// `find_element` at [`SELECTOR_POLL_INTERVAL`] until success or budget
|
||||||
|
/// exhaustion. A failed `find_element` is *not* an error here — it just
|
||||||
|
/// means "not yet" — we only surface an error once the overall
|
||||||
|
/// `timeout` is up.
|
||||||
|
pub async fn wait_for_selector(
|
||||||
|
page: &Page,
|
||||||
|
selector: &str,
|
||||||
|
timeout: Duration,
|
||||||
|
) -> Result<(), NavError> {
|
||||||
|
let deadline = tokio::time::Instant::now() + timeout;
|
||||||
|
loop {
|
||||||
|
if page.find_element(selector).await.is_ok() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
if tokio::time::Instant::now() >= deadline {
|
||||||
|
return Err(NavError::Timeout(timeout));
|
||||||
|
}
|
||||||
|
let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
|
||||||
|
let sleep_for = SELECTOR_POLL_INTERVAL.min(remaining);
|
||||||
|
tokio::time::sleep(sleep_for).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Per-page-type budget for [`wait_for_selector`]. Shorter than
|
||||||
|
/// [`NAV_TIMEOUT`] because by the time we're waiting on a selector, the
|
||||||
|
/// page has already responded — we're only absorbing post-load JS
|
||||||
|
/// finishing its row injection, which on a healthy site takes well
|
||||||
|
/// under a second.
|
||||||
|
pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10);
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
@@ -65,4 +107,28 @@ mod tests {
|
|||||||
let e = NavError::Timeout(Duration::from_secs(30));
|
let e = NavError::Timeout(Duration::from_secs(30));
|
||||||
assert_eq!(e.to_string(), "navigation timed out after 30s");
|
assert_eq!(e.to_string(), "navigation timed out after 30s");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Same sanity check as [`timeout_elapses_on_a_future_that_never_resolves`],
|
||||||
|
/// but for the [`wait_for_selector`] polling pattern: the loop must
|
||||||
|
/// surrender on `Elapsed` rather than spinning past the deadline.
|
||||||
|
#[tokio::test(flavor = "current_thread", start_paused = true)]
|
||||||
|
async fn selector_polling_pattern_surrenders_at_deadline() {
|
||||||
|
let timeout = Duration::from_millis(300);
|
||||||
|
let start = tokio::time::Instant::now();
|
||||||
|
let deadline = start + timeout;
|
||||||
|
// Simulate find_element forever returning "not found".
|
||||||
|
let mut polls = 0u32;
|
||||||
|
let result: Result<(), NavError> = loop {
|
||||||
|
polls += 1;
|
||||||
|
if tokio::time::Instant::now() >= deadline {
|
||||||
|
break Err(NavError::Timeout(timeout));
|
||||||
|
}
|
||||||
|
tokio::time::sleep(SELECTOR_POLL_INTERVAL).await;
|
||||||
|
};
|
||||||
|
assert!(matches!(result, Err(NavError::Timeout(_))));
|
||||||
|
// 300ms / 100ms poll interval ≈ 3 iterations plus the final check
|
||||||
|
// that breaks out. Allow some slack since the first poll happens
|
||||||
|
// before any sleep.
|
||||||
|
assert!(polls >= 3, "expected at least 3 poll iterations, got {polls}");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -206,6 +206,15 @@ async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<
|
|||||||
crate::crawler::nav::wait_for_nav(&page)
|
crate::crawler::nav::wait_for_nav(&page)
|
||||||
.await
|
.await
|
||||||
.context("wait for nav on probe")?;
|
.context("wait for nav on probe")?;
|
||||||
|
// Best-effort wait for the layout marker. Timeout is fine — the
|
||||||
|
// probe classifier handles a missing `#logo` as Transient anyway,
|
||||||
|
// and the verify loop retries on Transient.
|
||||||
|
let _ = crate::crawler::nav::wait_for_selector(
|
||||||
|
&page,
|
||||||
|
"#logo",
|
||||||
|
crate::crawler::nav::SELECTOR_TIMEOUT,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
let html = page.content().await.context("read probe html")?;
|
let html = page.content().await.context("read probe html")?;
|
||||||
page.close().await.ok();
|
page.close().await.ok();
|
||||||
Ok(html)
|
Ok(html)
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ use super::{
|
|||||||
use crate::crawler::detect::{
|
use crate::crawler::detect::{
|
||||||
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
|
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
|
||||||
};
|
};
|
||||||
use crate::crawler::nav::{wait_for_nav, NavError};
|
use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT};
|
||||||
|
|
||||||
/// `sources.id` value for this Source impl. Exposed as a const so the
|
/// `sources.id` value for this Source impl. Exposed as a const so the
|
||||||
/// daemon can look up per-source state (e.g. the recovery flag) before
|
/// daemon can look up per-source state (e.g. the recovery flag) before
|
||||||
@@ -80,7 +80,9 @@ impl Source for TargetSource {
|
|||||||
// page would otherwise abort the whole walk before we've even
|
// page would otherwise abort the whole walk before we've even
|
||||||
// started.
|
// started.
|
||||||
let first_html = retry_on_transient(
|
let first_html = retry_on_transient(
|
||||||
|| async { navigate(ctx, self.base_url.as_str()).await },
|
|| async {
|
||||||
|
navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
|
||||||
|
},
|
||||||
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
||||||
PAGE_TRANSIENT_RETRY_DELAY,
|
PAGE_TRANSIENT_RETRY_DELAY,
|
||||||
)
|
)
|
||||||
@@ -109,7 +111,17 @@ impl Source for TargetSource {
|
|||||||
ctx: &FetchContext<'_>,
|
ctx: &FetchContext<'_>,
|
||||||
r: &SourceMangaRef,
|
r: &SourceMangaRef,
|
||||||
) -> anyhow::Result<SourceManga> {
|
) -> anyhow::Result<SourceManga> {
|
||||||
let html = navigate(ctx, r.url.as_str()).await?;
|
// When we'll parse the chapter table, wait for at least one
|
||||||
|
// chapter row to appear — that's the marker most sensitive to
|
||||||
|
// the post-load JS partial-render race. When we won't, fall
|
||||||
|
// back to the layout-level `#logo` so we still wait for the
|
||||||
|
// page to settle.
|
||||||
|
let marker = if self.parse_chapters {
|
||||||
|
DETAIL_PAGE_CHAPTERS_MARKER
|
||||||
|
} else {
|
||||||
|
DETAIL_PAGE_LAYOUT_MARKER
|
||||||
|
};
|
||||||
|
let html = navigate(ctx, r.url.as_str(), marker).await?;
|
||||||
// Convert PageError → anyhow::Error via `?`. PageError stays
|
// Convert PageError → anyhow::Error via `?`. PageError stays
|
||||||
// downcastable from the wrapped anyhow::Error so the pipeline
|
// downcastable from the wrapped anyhow::Error so the pipeline
|
||||||
// can still recognize Transient via `error.downcast_ref::<PageError>()`.
|
// can still recognize Transient via `error.downcast_ref::<PageError>()`.
|
||||||
@@ -177,7 +189,12 @@ impl DiscoverWalk for TargetSourceWalker {
|
|||||||
None => {
|
None => {
|
||||||
retry_on_transient(
|
retry_on_transient(
|
||||||
|| async {
|
|| async {
|
||||||
let html = navigate(ctx, self.base_url.as_str()).await?;
|
let html = navigate(
|
||||||
|
ctx,
|
||||||
|
self.base_url.as_str(),
|
||||||
|
LIST_PAGE_MARKER,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
let doc = scraper::Html::parse_document(&html);
|
let doc = scraper::Html::parse_document(&html);
|
||||||
parse_manga_list_from(&doc)
|
parse_manga_list_from(&doc)
|
||||||
},
|
},
|
||||||
@@ -191,7 +208,7 @@ impl DiscoverWalk for TargetSourceWalker {
|
|||||||
retry_on_transient(
|
retry_on_transient(
|
||||||
|| async {
|
|| async {
|
||||||
let url = page_url(&self.base_url, page_num);
|
let url = page_url(&self.base_url, page_num);
|
||||||
let html = navigate(ctx, &url).await?;
|
let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?;
|
||||||
let doc = scraper::Html::parse_document(&html);
|
let doc = scraper::Html::parse_document(&html);
|
||||||
parse_manga_list_from(&doc)
|
parse_manga_list_from(&doc)
|
||||||
},
|
},
|
||||||
@@ -205,12 +222,32 @@ impl DiscoverWalk for TargetSourceWalker {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Per-page-type markers used by `navigate`'s post-navigation wait.
|
||||||
|
/// Each is the most specific element the parser will later look for —
|
||||||
|
/// waiting on it closes the partial-render race (e.g. `#chapter_table`
|
||||||
|
/// wrapper present but rows still being injected by post-load JS) that
|
||||||
|
/// the old fixed 1s sleep masked. See [`navigate`].
|
||||||
|
const LIST_PAGE_MARKER: &str = "#left_side .pic_list .updatesli";
|
||||||
|
const DETAIL_PAGE_CHAPTERS_MARKER: &str = "#chapter_table td h4 a.chico";
|
||||||
|
const DETAIL_PAGE_LAYOUT_MARKER: &str = "#logo";
|
||||||
|
|
||||||
/// Single point of rate-limited navigation. Every Source request goes
|
/// Single point of rate-limited navigation. Every Source request goes
|
||||||
/// through here, so the per-host limiter map is the only knob that
|
/// through here, so the per-host limiter map is the only knob that
|
||||||
/// controls per-origin RPS. Also the choke point for transient-page
|
/// controls per-origin RPS. Also the choke point for transient-page
|
||||||
/// detection — every fetched body is screened by
|
/// detection — every fetched body is screened by
|
||||||
/// [`classify_navigate_html`] before being handed to a selector.
|
/// [`classify_navigate_html`] before being handed to a selector.
|
||||||
async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError> {
|
///
|
||||||
|
/// `marker` is a CSS selector the caller expects to find on the loaded
|
||||||
|
/// page. The wait is best-effort: a timeout is **not** an error
|
||||||
|
/// (legitimately-empty pages may never render the marker), it just
|
||||||
|
/// caps how long we'll hold for post-load JS to finish injecting
|
||||||
|
/// content. The parser's own sentinels and the universal broken-page
|
||||||
|
/// body check still catch real failures.
|
||||||
|
async fn navigate(
|
||||||
|
ctx: &FetchContext<'_>,
|
||||||
|
url: &str,
|
||||||
|
marker: &str,
|
||||||
|
) -> Result<String, PageError> {
|
||||||
ctx.rate.wait_for(url).await?;
|
ctx.rate.wait_for(url).await?;
|
||||||
let page = ctx
|
let page = ctx
|
||||||
.browser
|
.browser
|
||||||
@@ -228,9 +265,9 @@ async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError
|
|||||||
return Err(PageError::Other(anyhow::Error::from(e)));
|
return Err(PageError::Other(anyhow::Error::from(e)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Stopgap until we wait on a specific selector per page type —
|
// Best-effort wait for the page-type marker. We deliberately
|
||||||
// gives any post-load JS a beat to finish injecting content.
|
// discard a timeout here — see fn-level doc.
|
||||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
let _ = wait_for_selector(&page, marker, SELECTOR_TIMEOUT).await;
|
||||||
let html = page
|
let html = page
|
||||||
.content()
|
.content()
|
||||||
.await
|
.await
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "mangalord-frontend",
|
"name": "mangalord-frontend",
|
||||||
"version": "0.36.1",
|
"version": "0.36.2",
|
||||||
"private": true,
|
"private": true,
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
|||||||
Reference in New Issue
Block a user