//! Transient-page detection.
//!
//! The target site occasionally responds with a 403 + tiny "we're sorry,
//! the request file are not found" body on pages that actually exist.
//! Selectors on that body match nothing, which is indistinguishable from
//! a genuinely empty page unless we look for the broken-page markers
//! explicitly. The same shape covers full-site outages: 5xx pages,
//! Cloudflare interstitials, and "site is down" placeholders all share
//! the trait that the normal layout (`#logo` in the header) is absent.
//!
//! Helpers here are split into two signals so callers can compose them:
//! - [`is_broken_page_body`]: pattern-match on the known broken-page
//!   string. Works for *any* page on the site, including the reader,
//!   which doesn't render `#logo`.
//! - [`has_logo_sentinel`]: assert `#logo` is in the parsed DOM. Site-
//!   structural marker — present on the manga list, manga detail,
//!   chapter-list, and login probe pages. **Not** present on the reader,
//!   so callers in the reader path must rely on the body signature only.
//!
//! [`PageError::Transient`] is the typed signal returned by parser and
//! navigate wrappers. Job handlers map it to "reschedule with backoff"
//! rather than the per-page silent skip the parsers used to do.

use std::future::Future;
use std::sync::OnceLock;
use std::time::Duration;

use thiserror::Error;

/// Universal substring of the broken-page body. The site renders the
/// exact string verbatim in a single `<p>`, so a case-insensitive
/// substring match is enough — we deliberately do *not* anchor to the
/// kaomoji because that part is more likely to change than the prose.
///
/// Must stay lowercase ASCII: [`is_broken_page_body`] lowercases only the
/// haystack, never this needle.
const BROKEN_PAGE_MARKER: &str = "we're sorry, the request file are not found";

/// Outcome of a page fetch or parse when the caller wants to
/// distinguish "site/page is transiently broken — retry later" from
/// other errors. `Transient` is the only retry-friendly variant; every
/// other failure mode stays as `anyhow::Error` and is treated as today.
#[derive(Debug, Error)]
pub enum PageError {
    /// Page came back but the site signaled trouble — broken-page body
    /// signature, structural sentinel missing, etc. Caller should
    /// reschedule this fetch rather than treat it as data.
    #[error("transient page error: {reason}")]
    Transient { reason: String },

    /// Any other failure, carried through unchanged (its `Display` and
    /// `source` are those of the wrapped error).
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

impl PageError {
    /// Builds a [`PageError::Transient`] from anything convertible into
    /// the `reason` string.
    pub fn transient(reason: impl Into<String>) -> Self {
        Self::Transient {
            reason: reason.into(),
        }
    }

    /// Returns `true` only for the [`PageError::Transient`] variant.
    pub fn is_transient(&self) -> bool {
        matches!(self, Self::Transient { .. })
    }
}

/// Returns true when the response body matches the known broken-page
/// template.
///
/// ASCII-case-insensitive substring match: the body is lowercased into a
/// temporary copy and searched for [`BROKEN_PAGE_MARKER`]. Non-ASCII
/// bytes are left untouched by the lowercasing, so surrounding text such
/// as the kaomoji never affects the result. An empty body never matches.
pub fn is_broken_page_body(html: &str) -> bool {
    let lowered = html.to_ascii_lowercase();
    lowered.contains(BROKEN_PAGE_MARKER)
}

/// Returns true when the parsed document contains `#logo` — the site's
/// header logo element, present on every full-layout page and absent on
/// the broken-page response and on the reader.
///
/// The selector is a constant, so it is parsed once on first use and
/// shared by every later call instead of being re-parsed per page.
pub fn has_logo_sentinel(doc: &scraper::Html) -> bool {
    static LOGO_SELECTOR: OnceLock<scraper::Selector> = OnceLock::new();

    let selector = LOGO_SELECTOR
        .get_or_init(|| scraper::Selector::parse("#logo").expect("#logo is a valid selector"));
    doc.select(selector).next().is_some()
}
Used by discover-loop /// callers so a single broken page doesn't drop the whole walk — the /// caller can fall back on the job system's retry/backoff once the /// inline budget is exhausted. pub async fn retry_on_transient( mut op: F, max_attempts: u32, delay: Duration, ) -> Result where F: FnMut() -> Fut, Fut: Future>, { debug_assert!(max_attempts >= 1, "max_attempts must be at least 1"); let mut attempt = 0u32; loop { attempt += 1; match op().await { Ok(v) => return Ok(v), Err(e) if !e.is_transient() => return Err(e), Err(e) if attempt >= max_attempts => return Err(e), Err(e) => { tracing::warn!( attempt, max_attempts, error = %e, "transient error; sleeping before retry" ); tokio::time::sleep(delay).await; } } } } #[cfg(test)] mod tests { use super::*; #[test] fn broken_page_body_matches_exact_template() { let html = "\

we're sorry, the request file are not found. Σ(っ°Д °;)っ

\ "; assert!(is_broken_page_body(html)); } #[test] fn broken_page_body_is_case_insensitive() { let html = "

WE'RE SORRY, THE REQUEST FILE ARE NOT FOUND.

"; assert!(is_broken_page_body(html)); } #[test] fn broken_page_body_does_not_match_normal_listing() { let html = "\ "; assert!(!is_broken_page_body(html)); } #[test] fn broken_page_body_does_not_match_empty_string() { assert!(!is_broken_page_body("")); } #[test] fn logo_sentinel_present_on_normal_page() { let doc = scraper::Html::parse_document( "
...
", ); assert!(has_logo_sentinel(&doc)); } #[test] fn logo_sentinel_absent_on_broken_page() { let doc = scraper::Html::parse_document( "\

we're sorry, the request file are not found.

", ); assert!(!has_logo_sentinel(&doc)); } #[test] fn logo_sentinel_absent_on_empty_document() { let doc = scraper::Html::parse_document(""); assert!(!has_logo_sentinel(&doc)); } #[test] fn page_error_transient_constructor_sets_reason() { let e = PageError::transient("logo missing"); assert!(e.is_transient()); assert_eq!(e.to_string(), "transient page error: logo missing"); } #[test] fn page_error_other_is_not_transient() { let e: PageError = anyhow::anyhow!("something else").into(); assert!(!e.is_transient()); } #[tokio::test] async fn retry_returns_ok_after_a_transient_streak() { let mut attempt = 0u32; let result: Result = retry_on_transient( || { attempt += 1; let n = attempt; async move { if n < 3 { Err(PageError::transient("not yet")) } else { Ok(42) } } }, 5, Duration::from_millis(0), ) .await; assert_eq!(result.unwrap(), 42); assert_eq!(attempt, 3); } #[tokio::test] async fn retry_gives_up_after_max_attempts_on_persistent_transient() { let mut attempt = 0u32; let result: Result = retry_on_transient( || { attempt += 1; async { Err(PageError::transient("always")) } }, 3, Duration::from_millis(0), ) .await; let err = result.expect_err("expected Transient"); assert!(err.is_transient()); assert_eq!(attempt, 3, "retried max_attempts times, no more"); } #[tokio::test] async fn retry_does_not_retry_non_transient_errors() { let mut attempt = 0u32; let result: Result = retry_on_transient( || { attempt += 1; async { Err(PageError::Other(anyhow::anyhow!("permanent"))) } }, 5, Duration::from_millis(0), ) .await; assert!(result.is_err()); assert!(!result.unwrap_err().is_transient()); assert_eq!(attempt, 1, "non-transient must fail immediately"); } #[tokio::test] async fn retry_returns_ok_on_first_attempt_without_sleeping() { let mut attempt = 0u32; let result: Result = retry_on_transient( || { attempt += 1; async { Ok(7) } }, 5, Duration::from_secs(60), ) .await; assert_eq!(result.unwrap(), 7); assert_eq!(attempt, 1); } }