From 9ff49166a52ebad886488f740115bf79c6af3fe7 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Tue, 26 May 2026 22:47:21 +0200 Subject: [PATCH] feat: transient-page detection across the crawler (0.30.0) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until now, when the target site returned its 403 "we're sorry, the request file are not found" response on a page that actually exists, selectors matched nothing and the crawler treated the page as "legitimately empty". Pagination walks silently dropped whole pages' worth of manga, fetch_manga skipped individual entries, and the startup session probe blamed PHPSESSID for what was a site hiccup. This branch adds a single detection layer that the whole pipeline routes through: - `crawler::detect`: PageError::Transient typed signal, plus two primitives (`is_broken_page_body` matches the universal 403 body; `has_logo_sentinel` asserts #logo, the site-wide header element) and a `retry_on_transient` helper that retries a closure on Transient with a small attempt budget. - `navigate()` screens every fetched body for the broken-page signature before handing it to a selector. - Parsers (`parse_manga_list_from`, `parse_manga_detail`, `parse_chapter_pages`) check their structural sentinels (#logo for full-layout pages; a#pic_container for the reader, which doesn't render #logo) and return Result<_, PageError>. Empty Vec is now reserved for genuinely empty pages. - `discover()` attempts each pagination page up to 3× (2s apart) before failing the whole Discover job — at which point the existing job system's retry/backoff takes over for longer outages. - `verify_session` is three-state: broken-page body or missing #logo → retry probe (up to 3 attempts, 2s apart); #logo present but #avatar_menu absent → genuine logout (the only state that should blame PHPSESSID); both present → ok. 
Test coverage added at the helper level: 13 unit tests for the detection module (body signature, logo sentinel, PageError, retry helper), parser-level tests for both transient and legitimately-empty inputs, and 6 unit tests for the session probe classifier. Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/Cargo.lock | 2 +- backend/Cargo.toml | 2 +- backend/src/crawler/content.rs | 36 +++- backend/src/crawler/detect.rs | 250 +++++++++++++++++++++++++++ backend/src/crawler/mod.rs | 1 + backend/src/crawler/session.rs | 175 ++++++++++++++++--- backend/src/crawler/source/target.rs | 185 +++++++++++++++++--- frontend/package.json | 2 +- 8 files changed, 594 insertions(+), 59 deletions(-) create mode 100644 backend/src/crawler/detect.rs diff --git a/backend/Cargo.lock b/backend/Cargo.lock index d22b297..f56c520 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.29.0" +version = "0.30.0" dependencies = [ "anyhow", "argon2", diff --git a/backend/Cargo.toml b/backend/Cargo.toml index 6bfc280..c31dec1 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.29.0" +version = "0.30.0" edition = "2021" default-run = "mangalord" diff --git a/backend/src/crawler/content.rs b/backend/src/crawler/content.rs index c804683..71fbfe3 100644 --- a/backend/src/crawler/content.rs +++ b/backend/src/crawler/content.rs @@ -16,6 +16,7 @@ use anyhow::Context; use sqlx::PgPool; use uuid::Uuid; +use crate::crawler::detect::PageError; use crate::crawler::rate_limit::HostRateLimiters; use crate::crawler::session; use crate::storage::Storage; @@ -23,8 +24,18 @@ use crate::storage::Storage; /// Parse the chapter page DOM and return the page images in `pageN` /// order. Filters out the loader `` and any /// `` without a numeric `id="pageN"`. 
-pub fn parse_chapter_pages(html: &str) -> Vec { +/// +/// Reader pages don't render the site's `#logo` element, so the +/// universal logo-sentinel can't apply here — instead we assert +/// `a#pic_container` is present. Its absence means the response is the +/// transient broken-page response (or a redirect to some other layout) +/// and the caller should retry. +pub fn parse_chapter_pages(html: &str) -> Result, PageError> { let doc = scraper::Html::parse_document(html); + let container_sel = scraper::Selector::parse("a#pic_container").unwrap(); + if doc.select(&container_sel).next().is_none() { + return Err(PageError::transient("reader: a#pic_container missing")); + } let sel = scraper::Selector::parse("a#pic_container img:not(.loading)").unwrap(); let mut pages: Vec = doc .select(&sel) @@ -39,7 +50,7 @@ pub fn parse_chapter_pages(html: &str) -> Vec { }) .collect(); pages.sort_by_key(|p| p.page_number); - pages + Ok(pages) } #[derive(Debug, Clone, PartialEq, Eq)] @@ -109,7 +120,8 @@ pub async fn sync_chapter_content( let html = page.content().await.context("read chapter html")?; page.close().await.ok(); - let images = parse_chapter_pages(&html); + let images = parse_chapter_pages(&html) + .with_context(|| format!("parse chapter pages at {source_url}"))?; if images.is_empty() { anyhow::bail!("no page images parsed from {source_url}"); } @@ -205,7 +217,7 @@ mod tests { "#; - let pages = parse_chapter_pages(html); + let pages = parse_chapter_pages(html).expect("parse"); assert_eq!(pages.len(), 2); assert_eq!(pages[0].page_number, 1); assert_eq!(pages[0].url, "https://cdn/1.jpg"); @@ -221,7 +233,7 @@ mod tests { "#; - let pages = parse_chapter_pages(html); + let pages = parse_chapter_pages(html).expect("parse"); assert_eq!(pages.len(), 1); assert_eq!(pages[0].page_number, 2); } @@ -235,10 +247,22 @@ mod tests { "#; - let pages = parse_chapter_pages(html); + let pages = parse_chapter_pages(html).expect("parse"); assert_eq!( pages.iter().map(|p| 
p.page_number).collect::>(), vec![9, 50, 126] ); } + + #[test] + fn parse_chapter_pages_returns_transient_when_container_missing() { + // Reader doesn't render #logo, so the universal logo sentinel + // can't be used here — a#pic_container is the reader-specific + // marker. Broken-page response trips this. + let html = "\ +

we're sorry, the request file are not found.

\ + "; + let err = parse_chapter_pages(html).expect_err("expected Transient"); + assert!(err.is_transient(), "got non-transient: {err}"); + } } diff --git a/backend/src/crawler/detect.rs b/backend/src/crawler/detect.rs new file mode 100644 index 0000000..63bf934 --- /dev/null +++ b/backend/src/crawler/detect.rs @@ -0,0 +1,250 @@ +//! Transient-page detection. +//! +//! The target site occasionally responds with a 403 + tiny "we're sorry, +//! the request file are not found" body on pages that actually exist. +//! Selectors on that body match nothing, which is indistinguishable from +//! a genuinely empty page unless we look for the broken-page markers +//! explicitly. The same shape covers full-site outages: 5xx pages, +//! Cloudflare interstitials, and "site is down" placeholders all share +//! the trait that the normal layout (`#logo` in the header) is absent. +//! +//! Helpers here are split into two signals so callers can compose them: +//! - [`is_broken_page_body`]: pattern-match on the known broken-page +//! string. Works for *any* page on the site, including the reader, +//! which doesn't render `#logo`. +//! - [`has_logo_sentinel`]: assert `#logo` is in the parsed DOM. Site- +//! structural marker — present on the manga list, manga detail, +//! chapter-list, and login probe pages. **Not** present on the reader, +//! so callers in the reader path must rely on the body signature only. +//! +//! [`PageError::Transient`] is the typed signal returned by parser and +//! navigate wrappers. Job handlers map it to "reschedule with backoff" +//! rather than the per-page silent skip the parsers used to do. + +use std::future::Future; +use std::time::Duration; + +use thiserror::Error; + +/// Universal substring of the broken-page body. The site renders the +/// exact string verbatim in a single `

`, so a case-insensitive +/// substring match is enough — we deliberately do *not* anchor to the +/// kaomoji because that part is more likely to change than the prose. +const BROKEN_PAGE_MARKER: &str = "we're sorry, the request file are not found"; + +/// Outcome of a page fetch or parse when the caller wants to +/// distinguish "site/page is transiently broken — retry later" from +/// other errors. `Transient` is the only retry-friendly variant; every +/// other failure mode stays as `anyhow::Error` and is treated as today. +#[derive(Debug, Error)] +pub enum PageError { + /// Page came back but the site signaled trouble — broken-page body + /// signature, structural sentinel missing, etc. Caller should + /// reschedule this fetch rather than treat it as data. + #[error("transient page error: {reason}")] + Transient { reason: String }, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl PageError { + pub fn transient(reason: impl Into) -> Self { + Self::Transient { reason: reason.into() } + } + + pub fn is_transient(&self) -> bool { + matches!(self, Self::Transient { .. }) + } +} + +/// Returns true when the response body matches the known broken-page +/// template. Case-insensitive substring match — small bodies (~150B) +/// make the scan trivially fast, and the broken page is always tiny so +/// false positives on a real catalog page are not a concern. +pub fn is_broken_page_body(html: &str) -> bool { + html.to_ascii_lowercase().contains(BROKEN_PAGE_MARKER) +} + +/// Returns true when the parsed document contains `#logo` — the site's +/// header logo element, present on every full-layout page and absent on +/// the broken-page response and on the reader. 
+pub fn has_logo_sentinel(doc: &scraper::Html) -> bool { + let sel = scraper::Selector::parse("#logo").expect("#logo is a valid selector"); + doc.select(&sel).next().is_some() +} + +/// Retry `op` up to `max_attempts` times whenever it returns +/// [`PageError::Transient`], sleeping `delay` between attempts. +/// Non-transient errors short-circuit immediately. Used by discover-loop +/// callers so a single broken page doesn't drop the whole walk — the +/// caller can fall back on the job system's retry/backoff once the +/// inline budget is exhausted. +pub async fn retry_on_transient( + mut op: F, + max_attempts: u32, + delay: Duration, +) -> Result +where + F: FnMut() -> Fut, + Fut: Future>, +{ + debug_assert!(max_attempts >= 1, "max_attempts must be at least 1"); + let mut attempt = 0u32; + loop { + attempt += 1; + match op().await { + Ok(v) => return Ok(v), + Err(e) if !e.is_transient() => return Err(e), + Err(e) if attempt >= max_attempts => return Err(e), + Err(e) => { + tracing::warn!( + attempt, + max_attempts, + error = %e, + "transient error; sleeping before retry" + ); + tokio::time::sleep(delay).await; + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn broken_page_body_matches_exact_template() { + let html = "\ +

we're sorry, the request file are not found. Σ(っ°Д °;)っ

\ + "; + assert!(is_broken_page_body(html)); + } + + #[test] + fn broken_page_body_is_case_insensitive() { + let html = "

WE'RE SORRY, THE REQUEST FILE ARE NOT FOUND.

"; + assert!(is_broken_page_body(html)); + } + + #[test] + fn broken_page_body_does_not_match_normal_listing() { + let html = "\ +
  • Manga A
  • Manga B
"; + assert!(!is_broken_page_body(html)); + } + + #[test] + fn broken_page_body_does_not_match_empty_string() { + assert!(!is_broken_page_body("")); + } + + #[test] + fn logo_sentinel_present_on_normal_page() { + let doc = scraper::Html::parse_document( + "
...
", + ); + assert!(has_logo_sentinel(&doc)); + } + + #[test] + fn logo_sentinel_absent_on_broken_page() { + let doc = scraper::Html::parse_document( + "\ +

we're sorry, the request file are not found.

", + ); + assert!(!has_logo_sentinel(&doc)); + } + + #[test] + fn logo_sentinel_absent_on_empty_document() { + let doc = scraper::Html::parse_document(""); + assert!(!has_logo_sentinel(&doc)); + } + + #[test] + fn page_error_transient_constructor_sets_reason() { + let e = PageError::transient("logo missing"); + assert!(e.is_transient()); + assert_eq!(e.to_string(), "transient page error: logo missing"); + } + + #[test] + fn page_error_other_is_not_transient() { + let e: PageError = anyhow::anyhow!("something else").into(); + assert!(!e.is_transient()); + } + + #[tokio::test] + async fn retry_returns_ok_after_a_transient_streak() { + let mut attempt = 0u32; + let result: Result = retry_on_transient( + || { + attempt += 1; + let n = attempt; + async move { + if n < 3 { + Err(PageError::transient("not yet")) + } else { + Ok(42) + } + } + }, + 5, + Duration::from_millis(0), + ) + .await; + assert_eq!(result.unwrap(), 42); + assert_eq!(attempt, 3); + } + + #[tokio::test] + async fn retry_gives_up_after_max_attempts_on_persistent_transient() { + let mut attempt = 0u32; + let result: Result = retry_on_transient( + || { + attempt += 1; + async { Err(PageError::transient("always")) } + }, + 3, + Duration::from_millis(0), + ) + .await; + let err = result.expect_err("expected Transient"); + assert!(err.is_transient()); + assert_eq!(attempt, 3, "retried max_attempts times, no more"); + } + + #[tokio::test] + async fn retry_does_not_retry_non_transient_errors() { + let mut attempt = 0u32; + let result: Result = retry_on_transient( + || { + attempt += 1; + async { Err(PageError::Other(anyhow::anyhow!("permanent"))) } + }, + 5, + Duration::from_millis(0), + ) + .await; + assert!(result.is_err()); + assert!(!result.unwrap_err().is_transient()); + assert_eq!(attempt, 1, "non-transient must fail immediately"); + } + + #[tokio::test] + async fn retry_returns_ok_on_first_attempt_without_sleeping() { + let mut attempt = 0u32; + let result: Result = retry_on_transient( + || { + attempt 
+= 1; + async { Ok(7) } + }, + 5, + Duration::from_secs(60), + ) + .await; + assert_eq!(result.unwrap(), 7); + assert_eq!(attempt, 1); + } +} diff --git a/backend/src/crawler/mod.rs b/backend/src/crawler/mod.rs index c50c39d..be3dcaa 100644 --- a/backend/src/crawler/mod.rs +++ b/backend/src/crawler/mod.rs @@ -17,6 +17,7 @@ pub mod browser; pub mod browser_manager; pub mod content; pub mod daemon; +pub mod detect; pub mod diff; pub mod jobs; pub mod pipeline; diff --git a/backend/src/crawler/session.rs b/backend/src/crawler/session.rs index 1b257fc..209ea5c 100644 --- a/backend/src/crawler/session.rs +++ b/backend/src/crawler/session.rs @@ -9,19 +9,39 @@ //! Two things the cookie alone doesn't give us: //! 1. The cookie value is only meaningful to the *server* — we have //! no way to predict from the value alone whether it's still valid. -//! `verify_session` does a navigation and checks for `#avatar_menu`, -//! which only renders for authenticated visitors. Bail clean at -//! startup if it's missing rather than discovering it 30 minutes -//! into a backfill. +//! `verify_session` does a navigation and inspects the probe page +//! for three outcomes: broken-page response (transient — retry the +//! probe), `#logo` present but `#avatar_menu` absent (genuine logout +//! — bail loudly), or both present (authenticated). The earlier +//! avatar-only check conflated "site is hiccuping" with "session is +//! dead" and refused to start the crawler when the site had a brief +//! 503. //! 2. The reqwest client (used for cover and chapter-image downloads) //! has its own cookie store; we seed it for the catalog host only. //! CDN hosts are deliberately *not* given the cookie — they serve //! image bytes by signed URLs and don't need it. 
+use std::time::Duration; + use anyhow::{anyhow, Context}; use chromiumoxide::browser::Browser; use chromiumoxide::cdp::browser_protocol::network::CookieParam; +use crate::crawler::detect::{has_logo_sentinel, is_broken_page_body}; + +/// Outcome of inspecting a probe-page response. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SessionProbe { + /// `#logo` present and `#avatar_menu` present — session valid. + Ok, + /// `#logo` present but `#avatar_menu` absent — site rendered the + /// normal layout for an unauthenticated visitor; refresh PHPSESSID. + Unauthenticated, + /// Broken-page body signature or `#logo` missing — site is hiccuping. + /// Caller retries the probe rather than blaming the session. + Transient, +} + /// Compute the cookie domain (e.g. `.example.com`) from a start URL. /// The leading dot makes the cookie cover every subdomain — the source /// often redirects between `www.` and other prefixes mid-crawl, and a @@ -86,34 +106,86 @@ pub async fn inject_phpsessid( Ok(()) } -/// Navigate to `probe_url` and confirm the logged-in `#avatar_menu` -/// element is present. The selector only renders for authenticated -/// visitors, so its absence is the unambiguous signal that PHPSESSID -/// is missing, expired, or revoked. +/// Three-way classification of a probe-page response. Pure over HTML so +/// it's unit-testable without a real browser. Order matters: a body +/// matching the broken-page template is `Transient` even if the page +/// happens to contain `#avatar_menu` HTML somewhere — trust the universal +/// site signal over a stray selector match. 
+pub fn classify_probe(html: &str) -> SessionProbe { + if is_broken_page_body(html) { + return SessionProbe::Transient; + } + let doc = scraper::Html::parse_document(html); + if !has_logo_sentinel(&doc) { + return SessionProbe::Transient; + } + let avatar_sel = scraper::Selector::parse("#avatar_menu").unwrap(); + if doc.select(&avatar_sel).next().is_some() { + SessionProbe::Ok + } else { + SessionProbe::Unauthenticated + } +} + +/// In-startup retry budget for the session probe. Small but non-zero — +/// startup hitting a 5-second site hiccup shouldn't fail the operator +/// with "PHPSESSID expired" when the session is actually fine. +const PROBE_MAX_ATTEMPTS: u32 = 3; +const PROBE_RETRY_DELAY: Duration = Duration::from_secs(2); + +/// Navigate to `probe_url` and classify the response. Retries the probe +/// on `Transient` outcomes (broken-page body, missing `#logo`); fails +/// fast on `Unauthenticated`; returns `Ok(())` on success. /// -/// This burns one navigation against the catalog's rate limiter. The -/// trade is worth it — failing here costs ~1s; failing 30 minutes into -/// a backfill costs 30 minutes. +/// This burns one navigation per attempt against the catalog's rate +/// limiter. The trade is worth it — failing here costs ~1s; failing 30 +/// minutes into a backfill costs 30 minutes. pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> { + let mut attempt = 0u32; + loop { + attempt += 1; + let html = fetch_probe_html(browser, probe_url).await?; + match classify_probe(&html) { + SessionProbe::Ok => { + tracing::info!(attempt, "session probe ok — #logo + #avatar_menu present"); + return Ok(()); + } + SessionProbe::Unauthenticated => { + return Err(anyhow!( + "session probe failed — #avatar_menu not present at {probe_url} \ + (page rendered the normal layout); PHPSESSID is missing, expired, \ + or revoked. Refresh CRAWLER_PHPSESSID and re-run." 
+ )); + } + SessionProbe::Transient if attempt < PROBE_MAX_ATTEMPTS => { + tracing::warn!( + attempt, + max_attempts = PROBE_MAX_ATTEMPTS, + "session probe got a transient page; retrying" + ); + tokio::time::sleep(PROBE_RETRY_DELAY).await; + } + SessionProbe::Transient => { + return Err(anyhow!( + "session probe failed — probe page at {probe_url} returned a \ + broken-page response after {PROBE_MAX_ATTEMPTS} attempts. \ + The site appears to be down or rate-limiting us; try again \ + later before refreshing CRAWLER_PHPSESSID." + )); + } + } + } +} + +async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result { let page = browser .new_page(probe_url) .await .with_context(|| format!("open probe page {probe_url}"))?; page.wait_for_navigation().await.context("wait for nav on probe")?; - // The avatar menu is rendered server-side as part of the header - // when a valid session cookie is present; absent JS is fine. - let found = page.find_element("#avatar_menu").await.is_ok(); + let html = page.content().await.context("read probe html")?; page.close().await.ok(); - if found { - tracing::info!("session probe ok — #avatar_menu present"); - Ok(()) - } else { - Err(anyhow!( - "session probe failed — #avatar_menu not present at {probe_url}; \ - PHPSESSID is missing, expired, or revoked. Refresh CRAWLER_PHPSESSID \ - and re-run." - )) - } + Ok(html) } #[cfg(test)] @@ -158,4 +230,59 @@ mod tests { fn registrable_domain_returns_none_for_garbage() { assert!(registrable_domain("not a url").is_none()); } + + #[test] + fn classify_probe_ok_when_logo_and_avatar_present() { + let html = r#" +
+ "#; + assert_eq!(classify_probe(html), SessionProbe::Ok); + } + + #[test] + fn classify_probe_unauth_when_logo_present_but_avatar_absent() { + // Real "logged out" response: site layout renders fine, just no + // avatar widget. This is the only state that should blame the + // session cookie. + let html = r#" +
+
Please log in.
+ "#; + assert_eq!(classify_probe(html), SessionProbe::Unauthenticated); + } + + #[test] + fn classify_probe_transient_on_broken_page_body() { + let html = "\ +

we're sorry, the request file are not found.

\ + "; + assert_eq!(classify_probe(html), SessionProbe::Transient); + } + + #[test] + fn classify_probe_transient_when_logo_missing() { + // No broken-body marker, but no site layout either — treat as + // transient (could be a Cloudflare interstitial, a 5xx page, + // etc.) rather than blaming the session. + let html = "

Service Unavailable

"; + assert_eq!(classify_probe(html), SessionProbe::Transient); + } + + #[test] + fn classify_probe_transient_on_empty_response() { + assert_eq!(classify_probe(""), SessionProbe::Transient); + } + + #[test] + fn classify_probe_trusts_broken_body_over_stray_avatar_match() { + // Defensive: if a broken-page body somehow contains an + // #avatar_menu element (e.g. an unrelated debug page on the + // same template), the body signature still wins. + let html = r#" +

we're sorry, the request file are not found.

+ +
+ "#; + assert_eq!(classify_probe(html), SessionProbe::Transient); + } } diff --git a/backend/src/crawler/source/target.rs b/backend/src/crawler/source/target.rs index 17e2b3f..9288bee 100644 --- a/backend/src/crawler/source/target.rs +++ b/backend/src/crawler/source/target.rs @@ -17,6 +17,16 @@ use super::{ DiscoverMode, FetchContext, Source, SourceChapter, SourceChapterRef, SourceManga, SourceMangaRef, }; +use crate::crawler::detect::{ + has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError, +}; + +/// In-loop retry budget for transient pages encountered during a single +/// `discover` walk. Bounded small because the job system itself retries +/// the whole `Discover` job on failure — these inline retries only need +/// to absorb a brief site hiccup mid-walk. +const PAGE_TRANSIENT_RETRY_ATTEMPTS: u32 = 3; +const PAGE_TRANSIENT_RETRY_DELAY: Duration = Duration::from_secs(2); pub struct TargetSource { base_url: String, @@ -60,9 +70,15 @@ impl Source for TargetSource { max_results: Option, ) -> anyhow::Result> { // Always visit page 1 first because that's the only way to - // discover `last_page`. We cache the HTML so we don't have to - // re-navigate when the iteration reaches page 1 again. - let first_html = navigate(ctx, self.base_url.as_str()).await?; + // discover `last_page`. Retry it on transient — a broken first + // page would otherwise abort the whole walk before we've even + // started. + let first_html = retry_on_transient( + || async { navigate(ctx, self.base_url.as_str()).await }, + PAGE_TRANSIENT_RETRY_ATTEMPTS, + PAGE_TRANSIENT_RETRY_DELAY, + ) + .await?; let last_page = { let doc = scraper::Html::parse_document(&first_html); parse_last_page(&doc) @@ -87,14 +103,25 @@ impl Source for TargetSource { let mut all = Vec::new(); for page_num in order { - let html = if page_num == 1 { - first_html.clone() + // Page 1 is already cached from the last_page probe — reuse + // it rather than navigating twice. 
Every other page goes + // through the retry helper so a single broken page mid-walk + // doesn't silently drop its mangas from the result. + let mut page_refs = if page_num == 1 { + let doc = scraper::Html::parse_document(&first_html); + parse_manga_list_from(&doc)? } else { - navigate(ctx, &page_url(&self.base_url, page_num)).await? - }; - let mut page_refs = { - let doc = scraper::Html::parse_document(&html); - parse_manga_list_from(&doc) + retry_on_transient( + || async { + let url = page_url(&self.base_url, page_num); + let html = navigate(ctx, &url).await?; + let doc = scraper::Html::parse_document(&html); + parse_manga_list_from(&doc) + }, + PAGE_TRANSIENT_RETRY_ATTEMPTS, + PAGE_TRANSIENT_RETRY_DELAY, + ) + .await? }; if backfill { page_refs.reverse(); @@ -116,8 +143,12 @@ impl Source for TargetSource { r: &SourceMangaRef, ) -> anyhow::Result { let html = navigate(ctx, r.url.as_str()).await?; - parse_manga_detail(&html, &r.source_manga_key, self.parse_chapters) - .with_context(|| format!("parse manga detail at {}", r.url)) + // Convert PageError → anyhow::Error via `?`. PageError stays + // downcastable from the wrapped anyhow::Error so the pipeline + // can still recognize Transient via `error.downcast_ref::()`. + let manga = parse_manga_detail(&html, &r.source_manga_key, self.parse_chapters) + .with_context(|| format!("parse manga detail at {}", r.url))?; + Ok(manga) } async fn fetch_chapter_list( @@ -150,16 +181,39 @@ fn truncate_to_cap(mut buf: Vec, max: Option) -> Vec { /// Single point of rate-limited navigation. Every Source request goes /// through here, so the per-host limiter map is the only knob that -/// controls per-origin RPS. -async fn navigate(ctx: &FetchContext<'_>, url: &str) -> anyhow::Result { +/// controls per-origin RPS. Also the choke point for transient-page +/// detection — every fetched body is screened by +/// [`classify_navigate_html`] before being handed to a selector. 
+async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result { ctx.rate.wait_for(url).await?; - let page = ctx.browser.new_page(url).await?; - page.wait_for_navigation().await?; + let page = ctx + .browser + .new_page(url) + .await + .map_err(|e| PageError::Other(anyhow::Error::from(e)))?; + page.wait_for_navigation() + .await + .map_err(|e| PageError::Other(anyhow::Error::from(e)))?; // Stopgap until we wait on a specific selector per page type — // gives any post-load JS a beat to finish injecting content. tokio::time::sleep(Duration::from_secs(1)).await; - let html = page.content().await?; - page.close().await?; + let html = page + .content() + .await + .map_err(|e| PageError::Other(anyhow::Error::from(e)))?; + page.close().await.ok(); + classify_navigate_html(html) +} + +/// Classify a fetched body. The broken-page template is universal across +/// the site — every page type (list, detail, chapter list, reader) gets +/// the same `we're sorry, the request file are not found` body when the +/// server is hiccuping. Catching it here means individual parsers +/// downstream don't have to repeat the check. +fn classify_navigate_html(html: String) -> Result { + if is_broken_page_body(&html) { + return Err(PageError::transient("broken-page body signature")); + } Ok(html) } @@ -204,14 +258,23 @@ fn page_url(template_url: &str, page: i32) -> String { } #[cfg(test)] -fn parse_manga_list(html: &str) -> Vec { +fn parse_manga_list(html: &str) -> Result, PageError> { let doc = scraper::Html::parse_document(html); parse_manga_list_from(&doc) } -fn parse_manga_list_from(doc: &scraper::Html) -> Vec { +/// Parse a manga listing page. `#logo` is present on every well-formed +/// listing page on the source; its absence means the response is a +/// broken-page placeholder (transient) rather than a genuinely empty +/// listing. Empty listings (last-page tail, search with no hits) remain +/// `Ok(vec![])`. 
+fn parse_manga_list_from(doc: &scraper::Html) -> Result, PageError> { + if !has_logo_sentinel(doc) { + return Err(PageError::transient("manga list: #logo sentinel missing")); + } let sel = scraper::Selector::parse("#left_side .pic_list .updatesli span a").unwrap(); - doc.select(&sel) + Ok(doc + .select(&sel) .filter_map(|a| { let url = a.value().attr("href")?.trim().to_string(); if url.is_empty() { @@ -227,16 +290,22 @@ fn parse_manga_list_from(doc: &scraper::Html) -> Vec { url, }) }) - .collect() + .collect()) } fn parse_manga_detail( html: &str, key: &str, include_chapters: bool, -) -> anyhow::Result { +) -> Result { let doc = scraper::Html::parse_document(html); + // Sentinel first: a broken-page response will trip this before any + // anyhow context is added for missing required fields. + if !has_logo_sentinel(&doc) { + return Err(PageError::transient("manga detail: #logo sentinel missing")); + } + let title = first_text(&doc, ".w-title h1").context("missing .w-title h1")?; let summary = first_text(&doc, ".manga_summary"); let cover_url = first_attr(&doc, ".cover > img:nth-child(1)", "src"); @@ -494,6 +563,7 @@ mod tests { const LISTING_HTML: &str = r#" +
@@ -512,6 +582,7 @@ mod tests { const DETAIL_HTML: &str = r#" +

Test Manga Title

A summary of the manga.
@@ -537,7 +608,7 @@ mod tests { #[test] fn parse_manga_list_extracts_title_url_and_derives_key() { - let refs = parse_manga_list(LISTING_HTML); + let refs = parse_manga_list(LISTING_HTML).expect("parse"); assert_eq!(refs.len(), 2, "third entry has empty href and is skipped"); assert_eq!(refs[0].title, "Foo Manga"); assert_eq!(refs[0].url, "https://target.example/manga/foo"); @@ -546,6 +617,30 @@ mod tests { assert_eq!(refs[1].source_manga_key, "bar-baz"); } + #[test] + fn parse_manga_list_returns_transient_when_logo_missing() { + // Broken-page response: no #logo, no listing. Empty Vec would + // hide this as "page has no mangas"; Transient is the signal + // upstream code retries on. + let html = r#"\ +

we're sorry, the request file are not found.

\ + "#; + let err = parse_manga_list(html).expect_err("expected Transient"); + assert!(err.is_transient(), "got non-transient: {err}"); + } + + #[test] + fn parse_manga_list_ok_empty_when_logo_present_but_no_items() { + // Last page of pagination, "no results" search, etc. Legitimately + // empty must stay distinguishable from "page is broken". + let html = r#"\ +
\ +
\ + "#; + let refs = parse_manga_list(html).expect("logo present == not transient"); + assert!(refs.is_empty()); + } + #[test] fn parse_manga_detail_pulls_all_fields() { let m = parse_manga_detail(DETAIL_HTML, "test-key", true).expect("parse"); @@ -761,7 +856,9 @@ mod tests { #[test] fn missing_optional_fields_parse_to_none() { - let html = r#"

Minimal

"#; + let html = r#"\ +
\ +

Minimal

"#; let m = parse_manga_detail(html, "min", true).unwrap(); assert_eq!(m.title, "Minimal"); assert!(m.summary.is_none()); @@ -785,8 +882,44 @@ mod tests { #[test] fn parse_manga_detail_errors_on_missing_title() { - let html = "

nothing

"; + // Logo present (page is alive) — failure here is a real parse + // miss (Other), not Transient. + let html = r#"\ +
\ +

nothing

"#; let err = parse_manga_detail(html, "x", true).unwrap_err(); + assert!(!err.is_transient(), "expected Other, got Transient: {err}"); assert!(err.to_string().contains("missing .w-title h1")); } + + #[test] + fn classify_navigate_html_passes_normal_body_through() { + let body = "
\ +

content

" + .to_string(); + let out = classify_navigate_html(body.clone()).expect("ok"); + assert_eq!(out, body); + } + + #[test] + fn classify_navigate_html_returns_transient_for_broken_template() { + let body = "\ +

we're sorry, the request file are not found.

\ + " + .to_string(); + let err = classify_navigate_html(body).expect_err("expected Transient"); + assert!(err.is_transient(), "got non-transient: {err}"); + } + + #[test] + fn parse_manga_detail_returns_transient_when_logo_missing() { + // Broken-page response on a detail URL — must be reported as + // Transient so the job is retried rather than logging "missing + // .w-title h1" against a permanently-skipped manga. + let html = "\ +

we're sorry, the request file are not found.

\ + "; + let err = parse_manga_detail(html, "x", true).expect_err("expected Transient"); + assert!(err.is_transient(), "got non-transient: {err}"); + } } diff --git a/frontend/package.json b/frontend/package.json index c03f957..c28cded 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "mangalord-frontend", - "version": "0.29.0", + "version": "0.30.0", "private": true, "type": "module", "scripts": {