fix(crawler): wait for page marker instead of fixed 1s sleep (0.36.2)
A Chromium snapshot taken between the wrapper-render and row-render phases let parse_chapter_list return Ok(vec![]) for a manga that actually has chapters — the soft-drop branch in sync_manga_chapters then marked every existing chapter as dropped (set dropped_at). Add wait_for_selector to crawler::nav. navigate() now takes a CSS marker matching the most specific element the downstream parser will look for (one of LIST_PAGE_MARKER / DETAIL_PAGE_CHAPTERS_MARKER / DETAIL_PAGE_LAYOUT_MARKER). The wait is best-effort and capped by SELECTOR_TIMEOUT (10s); a legitimately empty page can still pass through once the timeout elapses, because the parser's #chapter_table sentinel and the universal broken-page body check stay in force. The same pattern is wired into the reader nav (a#pic_container) and the probe nav (#logo), replacing the implicit assumption that the post-load JS had finished within 1 second. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -21,7 +21,7 @@ use super::{
|
||||
use crate::crawler::detect::{
|
||||
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
|
||||
};
|
||||
use crate::crawler::nav::{wait_for_nav, NavError};
|
||||
use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT};
|
||||
|
||||
/// `sources.id` value for this Source impl. Exposed as a const so the
|
||||
/// daemon can look up per-source state (e.g. the recovery flag) before
|
||||
@@ -80,7 +80,9 @@ impl Source for TargetSource {
|
||||
// page would otherwise abort the whole walk before we've even
|
||||
// started.
|
||||
let first_html = retry_on_transient(
|
||||
|| async { navigate(ctx, self.base_url.as_str()).await },
|
||||
|| async {
|
||||
navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
|
||||
},
|
||||
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
||||
PAGE_TRANSIENT_RETRY_DELAY,
|
||||
)
|
||||
@@ -109,7 +111,17 @@ impl Source for TargetSource {
|
||||
ctx: &FetchContext<'_>,
|
||||
r: &SourceMangaRef,
|
||||
) -> anyhow::Result<SourceManga> {
|
||||
let html = navigate(ctx, r.url.as_str()).await?;
|
||||
// When we'll parse the chapter table, wait for at least one
|
||||
// chapter row to appear — that's the marker most sensitive to
|
||||
// the post-load JS partial-render race. When we won't, fall
|
||||
// back to the layout-level `#logo` so we still wait for the
|
||||
// page to settle.
|
||||
let marker = if self.parse_chapters {
|
||||
DETAIL_PAGE_CHAPTERS_MARKER
|
||||
} else {
|
||||
DETAIL_PAGE_LAYOUT_MARKER
|
||||
};
|
||||
let html = navigate(ctx, r.url.as_str(), marker).await?;
|
||||
// Convert PageError → anyhow::Error via `?`. PageError stays
|
||||
// downcastable from the wrapped anyhow::Error so the pipeline
|
||||
// can still recognize Transient via `error.downcast_ref::<PageError>()`.
|
||||
@@ -177,7 +189,12 @@ impl DiscoverWalk for TargetSourceWalker {
|
||||
None => {
|
||||
retry_on_transient(
|
||||
|| async {
|
||||
let html = navigate(ctx, self.base_url.as_str()).await?;
|
||||
let html = navigate(
|
||||
ctx,
|
||||
self.base_url.as_str(),
|
||||
LIST_PAGE_MARKER,
|
||||
)
|
||||
.await?;
|
||||
let doc = scraper::Html::parse_document(&html);
|
||||
parse_manga_list_from(&doc)
|
||||
},
|
||||
@@ -191,7 +208,7 @@ impl DiscoverWalk for TargetSourceWalker {
|
||||
retry_on_transient(
|
||||
|| async {
|
||||
let url = page_url(&self.base_url, page_num);
|
||||
let html = navigate(ctx, &url).await?;
|
||||
let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?;
|
||||
let doc = scraper::Html::parse_document(&html);
|
||||
parse_manga_list_from(&doc)
|
||||
},
|
||||
@@ -205,12 +222,32 @@ impl DiscoverWalk for TargetSourceWalker {
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-page-type markers used by `navigate`'s post-navigation wait.
|
||||
/// Each is the most specific element the parser will later look for —
|
||||
/// waiting on it closes the partial-render race (e.g. `#chapter_table`
|
||||
/// wrapper present but rows still being injected by post-load JS) that
|
||||
/// the old fixed 1s sleep masked. See [`navigate`].
|
||||
const LIST_PAGE_MARKER: &str = "#left_side .pic_list .updatesli";
|
||||
const DETAIL_PAGE_CHAPTERS_MARKER: &str = "#chapter_table td h4 a.chico";
|
||||
const DETAIL_PAGE_LAYOUT_MARKER: &str = "#logo";
|
||||
|
||||
/// Single point of rate-limited navigation. Every Source request goes
|
||||
/// through here, so the per-host limiter map is the only knob that
|
||||
/// controls per-origin RPS. Also the choke point for transient-page
|
||||
/// detection — every fetched body is screened by
|
||||
/// [`classify_navigate_html`] before being handed to a selector.
|
||||
async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError> {
|
||||
///
|
||||
/// `marker` is a CSS selector the caller expects to find on the loaded
|
||||
/// page. The wait is best-effort: a timeout is **not** an error
|
||||
/// (legitimately-empty pages may never render the marker), it just
|
||||
/// caps how long we'll hold for post-load JS to finish injecting
|
||||
/// content. The parser's own sentinels and the universal broken-page
|
||||
/// body check still catch real failures.
|
||||
async fn navigate(
|
||||
ctx: &FetchContext<'_>,
|
||||
url: &str,
|
||||
marker: &str,
|
||||
) -> Result<String, PageError> {
|
||||
ctx.rate.wait_for(url).await?;
|
||||
let page = ctx
|
||||
.browser
|
||||
@@ -228,9 +265,9 @@ async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError
|
||||
return Err(PageError::Other(anyhow::Error::from(e)));
|
||||
}
|
||||
}
|
||||
// Stopgap until we wait on a specific selector per page type —
|
||||
// gives any post-load JS a beat to finish injecting content.
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
// Best-effort wait for the page-type marker. We deliberately
|
||||
// discard a timeout here — see fn-level doc.
|
||||
let _ = wait_for_selector(&page, marker, SELECTOR_TIMEOUT).await;
|
||||
let html = page
|
||||
.content()
|
||||
.await
|
||||
|
||||
Reference in New Issue
Block a user