fix(crawler): wait for page marker instead of fixed 1s sleep (0.36.2)

A chromium snapshot taken between the wrapper-render and row-render
phases let parse_chapter_list return Ok(vec![]) for a manga that
actually has chapters — the soft-drop branch in sync_manga_chapters
then marked every existing chapter as dropped (set dropped_at).

Add wait_for_selector to crawler::nav. navigate() now takes a CSS
marker matching the most-specific element the downstream parser will
look for (one of LIST_PAGE_MARKER / DETAIL_PAGE_CHAPTERS_MARKER /
DETAIL_PAGE_LAYOUT_MARKER). The wait is best-effort and capped by
SELECTOR_TIMEOUT (10s): a timeout is not treated as an error, so a
legitimately empty page still passes through, while the parser's
#chapter_table sentinel and the universal broken-page body check stay
in force to catch real failures.

Same pattern wired at the reader nav (a#pic_container) and probe
nav (#logo), replacing the implicit assumption that the post-load
JS had finished within 1 second.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-30 18:29:38 +02:00
parent e2bd1462ba
commit 8e0b638e3f
7 changed files with 134 additions and 12 deletions

View File

@@ -21,7 +21,7 @@ use super::{
use crate::crawler::detect::{
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
};
use crate::crawler::nav::{wait_for_nav, NavError};
use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT};
/// `sources.id` value for this Source impl. Exposed as a const so the
/// daemon can look up per-source state (e.g. the recovery flag) before
@@ -80,7 +80,9 @@ impl Source for TargetSource {
// page would otherwise abort the whole walk before we've even
// started.
let first_html = retry_on_transient(
|| async { navigate(ctx, self.base_url.as_str()).await },
|| async {
navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
},
PAGE_TRANSIENT_RETRY_ATTEMPTS,
PAGE_TRANSIENT_RETRY_DELAY,
)
@@ -109,7 +111,17 @@ impl Source for TargetSource {
ctx: &FetchContext<'_>,
r: &SourceMangaRef,
) -> anyhow::Result<SourceManga> {
let html = navigate(ctx, r.url.as_str()).await?;
// When we'll parse the chapter table, wait for at least one
// chapter row to appear — that's the marker most sensitive to
// the post-load JS partial-render race. When we won't, fall
// back to the layout-level `#logo` so we still wait for the
// page to settle.
let marker = if self.parse_chapters {
DETAIL_PAGE_CHAPTERS_MARKER
} else {
DETAIL_PAGE_LAYOUT_MARKER
};
let html = navigate(ctx, r.url.as_str(), marker).await?;
// Convert PageError → anyhow::Error via `?`. PageError stays
// downcastable from the wrapped anyhow::Error so the pipeline
// can still recognize Transient via `error.downcast_ref::<PageError>()`.
@@ -177,7 +189,12 @@ impl DiscoverWalk for TargetSourceWalker {
None => {
retry_on_transient(
|| async {
let html = navigate(ctx, self.base_url.as_str()).await?;
let html = navigate(
ctx,
self.base_url.as_str(),
LIST_PAGE_MARKER,
)
.await?;
let doc = scraper::Html::parse_document(&html);
parse_manga_list_from(&doc)
},
@@ -191,7 +208,7 @@ impl DiscoverWalk for TargetSourceWalker {
retry_on_transient(
|| async {
let url = page_url(&self.base_url, page_num);
let html = navigate(ctx, &url).await?;
let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?;
let doc = scraper::Html::parse_document(&html);
parse_manga_list_from(&doc)
},
@@ -205,12 +222,32 @@ impl DiscoverWalk for TargetSourceWalker {
}
}
/// Per-page-type markers used by `navigate`'s post-navigation wait.
/// Each is the most specific element the parser will later look for —
/// waiting on it closes the partial-render race (e.g. `#chapter_table`
/// wrapper present but rows still being injected by post-load JS) that
/// the old fixed 1s sleep masked. See [`navigate`].
///
/// `LIST_PAGE_MARKER` is the marker for list pages: it is passed for the
/// base-URL navigation and for every paginated list-page navigation.
const LIST_PAGE_MARKER: &str = "#left_side .pic_list .updatesli";
/// Marker for detail pages whose chapter table will be parsed. Waiting
/// on it holds until at least one chapter row has appeared, which is the
/// element most sensitive to the post-load JS partial-render race.
const DETAIL_PAGE_CHAPTERS_MARKER: &str = "#chapter_table td h4 a.chico";
/// Layout-level fallback marker for detail pages whose chapters will not
/// be parsed, so the navigation still waits for the page to settle.
const DETAIL_PAGE_LAYOUT_MARKER: &str = "#logo";
/// Single point of rate-limited navigation. Every Source request goes
/// through here, so the per-host limiter map is the only knob that
/// controls per-origin RPS. Also the choke point for transient-page
/// detection — every fetched body is screened by
/// [`classify_navigate_html`] before being handed to a selector.
async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError> {
///
/// `marker` is a CSS selector the caller expects to find on the loaded
/// page. The wait is best-effort: a timeout is **not** an error
/// (legitimately-empty pages may never render the marker), it just
/// caps how long we'll hold for post-load JS to finish injecting
/// content. The parser's own sentinels and the universal broken-page
/// body check still catch real failures.
async fn navigate(
ctx: &FetchContext<'_>,
url: &str,
marker: &str,
) -> Result<String, PageError> {
ctx.rate.wait_for(url).await?;
let page = ctx
.browser
@@ -228,9 +265,9 @@ async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError
return Err(PageError::Other(anyhow::Error::from(e)));
}
}
// Stopgap until we wait on a specific selector per page type:
// gives any post-load JS a beat to finish injecting content.
tokio::time::sleep(Duration::from_secs(1)).await;
// Best-effort wait for the page-type marker. We deliberately
// discard a timeout here — see fn-level doc.
let _ = wait_for_selector(&page, marker, SELECTOR_TIMEOUT).await;
let html = page
.content()
.await