Files
Mangalord/backend/src/crawler/detect.rs
MechaCat02 9ff49166a5 feat: transient-page detection across the crawler (0.30.0)
Until now, when the target site returned its 403 "we're sorry, the
request file are not found" response on a page that actually exists,
selectors matched nothing and the crawler treated the page as
"legitimately empty". Pagination walks silently dropped whole pages
worth of mangas, fetch_manga skipped individual entries, and the
startup session probe blamed PHPSESSID for what was a site hiccup.

This branch adds a single detection layer that the whole pipeline
routes through:

- `crawler::detect`: PageError::Transient typed signal, plus two
  primitives (`is_broken_page_body` matches the universal 403 body;
  `has_logo_sentinel` asserts #logo, the site-wide header element)
  and a `retry_on_transient` helper that retries a closure on
  Transient with a small attempt budget.
- `navigate()` screens every fetched body for the broken-page
  signature before handing it to a selector.
- Parsers (`parse_manga_list_from`, `parse_manga_detail`,
  `parse_chapter_pages`) check their structural sentinels (#logo for
  full-layout pages; a#pic_container for the reader, which doesn't
  render #logo) and return Result<_, PageError>. Empty Vec is now
  reserved for genuinely empty pages.
- `discover()` retries each pagination page up to 3× (2s apart) before
  failing the whole Discover job — at which point the existing job
  system's retry/backoff takes over for longer outages.
- `verify_session` is three-state: broken-page → retry probe;
  #logo present but #avatar_menu absent → genuine logout (the only
  state that should blame PHPSESSID); both present → ok.

Test coverage added at the helper level: 13 unit tests for the
detection module (body signature, logo sentinel, PageError, retry
helper), parser-level tests for both transient and legitimately-empty
inputs, and 6 unit tests for the session probe classifier.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 22:47:21 +02:00

251 lines
8.6 KiB
Rust

//! Transient-page detection.
//!
//! The target site occasionally responds with a 403 + tiny "we're sorry,
//! the request file are not found" body on pages that actually exist.
//! Selectors on that body match nothing, which is indistinguishable from
//! a genuinely empty page unless we look for the broken-page markers
//! explicitly. The same shape covers full-site outages: 5xx pages,
//! Cloudflare interstitials, and "site is down" placeholders all share
//! the trait that the normal layout (`#logo` in the header) is absent.
//!
//! Helpers here are split into two signals so callers can compose them:
//! - [`is_broken_page_body`]: pattern-match on the known broken-page
//! string. Works for *any* page on the site, including the reader,
//! which doesn't render `#logo`.
//! - [`has_logo_sentinel`]: assert `#logo` is in the parsed DOM. Site-
//! structural marker — present on the manga list, manga detail,
//! chapter-list, and login probe pages. **Not** present on the reader,
//! so callers in the reader path must rely on the body signature only.
//!
//! [`PageError::Transient`] is the typed signal returned by parser and
//! navigate wrappers. Job handlers map it to "reschedule with backoff"
//! rather than the per-page silent skip the parsers used to do.
use std::future::Future;
use std::time::Duration;
use thiserror::Error;
/// Universal substring of the broken-page body. The site renders the
/// exact string verbatim in a single `<p>`, so a case-insensitive
/// substring match is enough — we deliberately do *not* anchor to the
/// kaomoji because that part is more likely to change than the prose.
///
/// Stored in lowercase ASCII; `is_broken_page_body` compares it against
/// the response body ASCII-case-insensitively. The odd grammar ("file
/// are") is the site's own wording, reproduced verbatim — do not
/// "correct" it, or the marker stops matching.
const BROKEN_PAGE_MARKER: &str = "we're sorry, the request file are not found";
/// Outcome of a page fetch or parse when the caller wants to
/// distinguish "site/page is transiently broken — retry later" from
/// other errors. `Transient` is the only retry-friendly variant; every
/// other failure mode is carried as an `anyhow::Error` inside `Other`,
/// which `retry_on_transient` returns immediately without retrying.
#[derive(Debug, Error)]
pub enum PageError {
/// Page came back but the site signaled trouble — broken-page body
/// signature, structural sentinel missing, etc. Caller should
/// reschedule this fetch rather than treat it as data.
///
/// `reason` is a human-readable explanation; it is interpolated into
/// the `Display` output after the "transient page error: " prefix.
#[error("transient page error: {reason}")]
Transient { reason: String },
/// Any non-transient failure. `#[error(transparent)]` delegates the
/// message to the wrapped `anyhow::Error`, and `#[from]` generates the
/// `From<anyhow::Error>` impl so `?` and `.into()` produce this variant.
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl PageError {
pub fn transient(reason: impl Into<String>) -> Self {
Self::Transient { reason: reason.into() }
}
pub fn is_transient(&self) -> bool {
matches!(self, Self::Transient { .. })
}
}
/// Returns true when the response body contains the known broken-page
/// marker (`BROKEN_PAGE_MARKER`), compared ASCII-case-insensitively.
///
/// The body handed in may be a full-size page rather than the tiny
/// broken-page response, so the bytes are scanned in place instead of
/// allocating a lowercased copy of the whole document. Comparing raw
/// bytes is sound because the marker is pure ASCII: a window that
/// matches it can only consist of ASCII bytes, so a match can never
/// straddle a multi-byte UTF-8 sequence.
///
/// This is a plain substring test — it does not look at the body size
/// or the HTTP status, so a page that merely quotes the marker text
/// would also be reported as broken.
pub fn is_broken_page_body(html: &str) -> bool {
    let marker = BROKEN_PAGE_MARKER.as_bytes();
    // `slice::windows` panics on a zero width; an empty needle is
    // contained in every haystack, mirroring `str::contains("")`.
    marker.is_empty()
        || html
            .as_bytes()
            .windows(marker.len())
            .any(|window| window.eq_ignore_ascii_case(marker))
}
/// Reports whether the parsed document has at least one element
/// matching `#logo` — the site's header logo, which full-layout pages
/// render and which the broken-page response and the reader do not.
pub fn has_logo_sentinel(doc: &scraper::Html) -> bool {
    let logo = scraper::Selector::parse("#logo").expect("#logo is a valid selector");
    // Only existence matters, so stop at the first match.
    let mut matches = doc.select(&logo);
    matches.next().is_some()
}
/// Runs `op` until it succeeds, fails with a non-transient error, or has
/// been attempted `max_attempts` times.
///
/// Only [`PageError::Transient`] earns another attempt; each retry is
/// preceded by a warning log and a sleep of `delay`. Every other error —
/// and the transient error from the final attempt — is returned to the
/// caller unchanged, with no sleep after it. Discover-loop callers use
/// this so a single broken page doesn't drop the whole walk, and can fall
/// back on the job system's retry/backoff once this inline budget is
/// exhausted.
///
/// `max_attempts` is expected to be at least 1 (checked by a debug
/// assertion). Where that assertion is compiled out, a value of 0 still
/// results in exactly one attempt.
pub async fn retry_on_transient<F, Fut, T>(
    mut op: F,
    max_attempts: u32,
    delay: Duration,
) -> Result<T, PageError>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, PageError>>,
{
    debug_assert!(max_attempts >= 1, "max_attempts must be at least 1");
    let mut attempt: u32 = 0;
    loop {
        // Success leaves immediately; everything below handles a failure.
        let failure = match op().await {
            Ok(value) => return Ok(value),
            Err(failure) => failure,
        };
        attempt += 1;

        let budget_spent = attempt >= max_attempts;
        if budget_spent || !failure.is_transient() {
            return Err(failure);
        }

        tracing::warn!(
            attempt,
            max_attempts,
            error = %failure,
            "transient error; sleeping before retry"
        );
        tokio::time::sleep(delay).await;
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Zero-length pause so the retry tests never actually wait.
    const NO_DELAY: Duration = Duration::from_millis(0);

    // --- is_broken_page_body -------------------------------------------

    #[test]
    fn broken_page_body_matches_exact_template() {
        let body = concat!(
            "<html><head></head><body>",
            "<p>we're sorry, the request file are not found. Σ(っ°Д °;)っ</p>",
            "</body></html>",
        );
        assert!(is_broken_page_body(body));
    }

    #[test]
    fn broken_page_body_is_case_insensitive() {
        let shouted = "<p>WE'RE SORRY, THE REQUEST FILE ARE NOT FOUND.</p>";
        assert!(is_broken_page_body(shouted));
    }

    #[test]
    fn broken_page_body_does_not_match_normal_listing() {
        let listing = concat!(
            "<html><body><div id='logo'></div>",
            "<ul><li>Manga A</li><li>Manga B</li></ul></body></html>",
        );
        assert!(!is_broken_page_body(listing));
    }

    #[test]
    fn broken_page_body_does_not_match_empty_string() {
        assert!(!is_broken_page_body(""));
    }

    // --- has_logo_sentinel ---------------------------------------------

    #[test]
    fn logo_sentinel_present_on_normal_page() {
        let page = "<html><body><div id='logo'>Site</div><main>...</main></body></html>";
        let doc = scraper::Html::parse_document(page);
        assert!(has_logo_sentinel(&doc));
    }

    #[test]
    fn logo_sentinel_absent_on_broken_page() {
        let page = concat!(
            "<html><head></head><body>",
            "<p>we're sorry, the request file are not found.</p></body></html>",
        );
        let doc = scraper::Html::parse_document(page);
        assert!(!has_logo_sentinel(&doc));
    }

    #[test]
    fn logo_sentinel_absent_on_empty_document() {
        let doc = scraper::Html::parse_document("");
        assert!(!has_logo_sentinel(&doc));
    }

    // --- PageError -----------------------------------------------------

    #[test]
    fn page_error_transient_constructor_sets_reason() {
        let err = PageError::transient("logo missing");
        assert!(err.is_transient());
        assert_eq!(err.to_string(), "transient page error: logo missing");
    }

    #[test]
    fn page_error_other_is_not_transient() {
        let err = PageError::from(anyhow::anyhow!("something else"));
        assert!(!err.is_transient());
    }

    // --- retry_on_transient --------------------------------------------

    #[tokio::test]
    async fn retry_returns_ok_after_a_transient_streak() {
        let mut calls = 0u32;
        let outcome: Result<i32, PageError> = retry_on_transient(
            || {
                calls += 1;
                // Copy the counter so the future owns its own snapshot.
                let call_no = calls;
                async move {
                    if call_no >= 3 {
                        Ok(42)
                    } else {
                        Err(PageError::transient("not yet"))
                    }
                }
            },
            5,
            NO_DELAY,
        )
        .await;
        assert_eq!(outcome.unwrap(), 42);
        assert_eq!(calls, 3);
    }

    #[tokio::test]
    async fn retry_gives_up_after_max_attempts_on_persistent_transient() {
        let mut calls = 0u32;
        let outcome: Result<i32, PageError> = retry_on_transient(
            || {
                calls += 1;
                async { Err(PageError::transient("always")) }
            },
            3,
            NO_DELAY,
        )
        .await;
        let failure = outcome.expect_err("expected Transient");
        assert!(failure.is_transient());
        assert_eq!(calls, 3, "retried max_attempts times, no more");
    }

    #[tokio::test]
    async fn retry_does_not_retry_non_transient_errors() {
        let mut calls = 0u32;
        let outcome: Result<i32, PageError> = retry_on_transient(
            || {
                calls += 1;
                async { Err(PageError::Other(anyhow::anyhow!("permanent"))) }
            },
            5,
            NO_DELAY,
        )
        .await;
        let failure = outcome.expect_err("expected a non-transient error");
        assert!(!failure.is_transient());
        assert_eq!(calls, 1, "non-transient must fail immediately");
    }

    #[tokio::test]
    async fn retry_returns_ok_on_first_attempt_without_sleeping() {
        let mut calls = 0u32;
        // The deliberately long delay makes an unexpected sleep obvious:
        // the test would stall instead of finishing promptly.
        let long_delay = Duration::from_secs(60);
        let outcome: Result<i32, PageError> = retry_on_transient(
            || {
                calls += 1;
                async { Ok(7) }
            },
            5,
            long_delay,
        )
        .await;
        assert_eq!(outcome.unwrap(), 7);
        assert_eq!(calls, 1);
    }
}