feat: transient-page detection across the crawler (0.30.0)
Until now, when the target site returned its 403 "we're sorry, the request file are not found" response on a page that actually exists, selectors matched nothing and the crawler treated the page as "legitimately empty". Pagination walks silently dropped whole pages' worth of mangas, fetch_manga skipped individual entries, and the startup session probe blamed PHPSESSID for what was a site hiccup.

This branch adds a single detection layer that the whole pipeline routes through:

- `crawler::detect`: PageError::Transient typed signal, plus two primitives (`is_broken_page_body` matches the universal 403 body; `has_logo_sentinel` asserts #logo, the site-wide header element) and a `retry_on_transient` helper that retries a closure on Transient with a small attempt budget.
- `navigate()` screens every fetched body for the broken-page signature before handing it to a selector.
- Parsers (`parse_manga_list_from`, `parse_manga_detail`, `parse_chapter_pages`) check their structural sentinels (#logo for full-layout pages; a#pic_container for the reader, which doesn't render #logo) and return Result<_, PageError>. Empty Vec is now reserved for genuinely empty pages.
- `discover()` tries each pagination page up to 3 times in total (2s apart) before failing the whole Discover job — at which point the existing job system's retry/backoff takes over for longer outages.
- `verify_session` is three-state: broken-page body or #logo missing → retry probe; #logo present but #avatar_menu absent → genuine logout (the only state that should blame PHPSESSID); both present → ok.

Test coverage added at the helper level: 13 unit tests for the detection module (body signature, logo sentinel, PageError, retry helper), parser-level tests for both transient and legitimately-empty inputs, and 6 unit tests for the session probe classifier.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||
|
||||
[[package]]
|
||||
name = "mangalord"
|
||||
version = "0.29.0"
|
||||
version = "0.30.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"argon2",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "mangalord"
|
||||
version = "0.29.0"
|
||||
version = "0.30.0"
|
||||
edition = "2021"
|
||||
default-run = "mangalord"
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ use anyhow::Context;
|
||||
use sqlx::PgPool;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::crawler::detect::PageError;
|
||||
use crate::crawler::rate_limit::HostRateLimiters;
|
||||
use crate::crawler::session;
|
||||
use crate::storage::Storage;
|
||||
@@ -23,8 +24,18 @@ use crate::storage::Storage;
|
||||
/// Parse the chapter page DOM and return the page images in `pageN`
|
||||
/// order. Filters out the loader `<img class="loading">` and any
|
||||
/// `<img>` without a numeric `id="pageN"`.
|
||||
pub fn parse_chapter_pages(html: &str) -> Vec<ChapterImage> {
|
||||
///
|
||||
/// Reader pages don't render the site's `#logo` element, so the
|
||||
/// universal logo-sentinel can't apply here — instead we assert
|
||||
/// `a#pic_container` is present. Its absence means the response is the
|
||||
/// transient broken-page response (or a redirect to some other layout)
|
||||
/// and the caller should retry.
|
||||
pub fn parse_chapter_pages(html: &str) -> Result<Vec<ChapterImage>, PageError> {
|
||||
let doc = scraper::Html::parse_document(html);
|
||||
let container_sel = scraper::Selector::parse("a#pic_container").unwrap();
|
||||
if doc.select(&container_sel).next().is_none() {
|
||||
return Err(PageError::transient("reader: a#pic_container missing"));
|
||||
}
|
||||
let sel = scraper::Selector::parse("a#pic_container img:not(.loading)").unwrap();
|
||||
let mut pages: Vec<ChapterImage> = doc
|
||||
.select(&sel)
|
||||
@@ -39,7 +50,7 @@ pub fn parse_chapter_pages(html: &str) -> Vec<ChapterImage> {
|
||||
})
|
||||
.collect();
|
||||
pages.sort_by_key(|p| p.page_number);
|
||||
pages
|
||||
Ok(pages)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
@@ -109,7 +120,8 @@ pub async fn sync_chapter_content(
|
||||
let html = page.content().await.context("read chapter html")?;
|
||||
page.close().await.ok();
|
||||
|
||||
let images = parse_chapter_pages(&html);
|
||||
let images = parse_chapter_pages(&html)
|
||||
.with_context(|| format!("parse chapter pages at {source_url}"))?;
|
||||
if images.is_empty() {
|
||||
anyhow::bail!("no page images parsed from {source_url}");
|
||||
}
|
||||
@@ -205,7 +217,7 @@ mod tests {
|
||||
<img id="not-a-page" src="https://cdn/not-a-page.jpg">
|
||||
</a></body></html>
|
||||
"#;
|
||||
let pages = parse_chapter_pages(html);
|
||||
let pages = parse_chapter_pages(html).expect("parse");
|
||||
assert_eq!(pages.len(), 2);
|
||||
assert_eq!(pages[0].page_number, 1);
|
||||
assert_eq!(pages[0].url, "https://cdn/1.jpg");
|
||||
@@ -221,7 +233,7 @@ mod tests {
|
||||
<img id="page2" src="https://cdn/2.jpg">
|
||||
</a>
|
||||
"#;
|
||||
let pages = parse_chapter_pages(html);
|
||||
let pages = parse_chapter_pages(html).expect("parse");
|
||||
assert_eq!(pages.len(), 1);
|
||||
assert_eq!(pages[0].page_number, 2);
|
||||
}
|
||||
@@ -235,10 +247,22 @@ mod tests {
|
||||
<img id="page50" src="https://cdn/50.jpg">
|
||||
</a>
|
||||
"#;
|
||||
let pages = parse_chapter_pages(html);
|
||||
let pages = parse_chapter_pages(html).expect("parse");
|
||||
assert_eq!(
|
||||
pages.iter().map(|p| p.page_number).collect::<Vec<_>>(),
|
||||
vec![9, 50, 126]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_chapter_pages_returns_transient_when_container_missing() {
|
||||
// Reader doesn't render #logo, so the universal logo sentinel
|
||||
// can't be used here — a#pic_container is the reader-specific
|
||||
// marker. Broken-page response trips this.
|
||||
let html = "<html><body>\
|
||||
<p>we're sorry, the request file are not found.</p>\
|
||||
</body></html>";
|
||||
let err = parse_chapter_pages(html).expect_err("expected Transient");
|
||||
assert!(err.is_transient(), "got non-transient: {err}");
|
||||
}
|
||||
}
|
||||
|
||||
250
backend/src/crawler/detect.rs
Normal file
250
backend/src/crawler/detect.rs
Normal file
@@ -0,0 +1,250 @@
|
||||
//! Transient-page detection.
|
||||
//!
|
||||
//! The target site occasionally responds with a 403 + tiny "we're sorry,
|
||||
//! the request file are not found" body on pages that actually exist.
|
||||
//! Selectors on that body match nothing, which is indistinguishable from
|
||||
//! a genuinely empty page unless we look for the broken-page markers
|
||||
//! explicitly. The same shape covers full-site outages: 5xx pages,
|
||||
//! Cloudflare interstitials, and "site is down" placeholders all share
|
||||
//! the trait that the normal layout (`#logo` in the header) is absent.
|
||||
//!
|
||||
//! Helpers here are split into two signals so callers can compose them:
|
||||
//! - [`is_broken_page_body`]: pattern-match on the known broken-page
|
||||
//! string. Works for *any* page on the site, including the reader,
|
||||
//! which doesn't render `#logo`.
|
||||
//! - [`has_logo_sentinel`]: assert `#logo` is in the parsed DOM. Site-
|
||||
//! structural marker — present on the manga list, manga detail,
|
||||
//! chapter-list, and login probe pages. **Not** present on the reader,
|
||||
//! so callers in the reader path must rely on the body signature plus a
//! reader-specific structural marker (`a#pic_container`) instead.
|
||||
//!
|
||||
//! [`PageError::Transient`] is the typed signal returned by parser and
|
||||
//! navigate wrappers. Job handlers map it to "reschedule with backoff"
|
||||
//! rather than the per-page silent skip the parsers used to do.
|
||||
|
||||
use std::future::Future;
|
||||
use std::time::Duration;
|
||||
|
||||
use thiserror::Error;
|
||||
|
||||
/// Universal substring of the broken-page body. The site renders the
|
||||
/// exact string verbatim in a single `<p>`, so a case-insensitive
|
||||
/// substring match is enough — we deliberately do *not* anchor to the
|
||||
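/// kaomoji because that part is more likely to change than the prose.
/// Keep the value in lowercase ASCII: [`is_broken_page_body`] compares
/// case-insensitively against this text exactly as written.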
|
||||
const BROKEN_PAGE_MARKER: &str = "we're sorry, the request file are not found";
|
||||
|
||||
/// Outcome of a page fetch or parse when the caller wants to
|
||||
/// distinguish "site/page is transiently broken — retry later" from
|
||||
/// other errors. `Transient` is the only retry-friendly variant; every
|
||||
/// other failure mode stays as `anyhow::Error` and is treated as today.
|
||||
#[derive(Debug, Error)]
|
||||
pub enum PageError {
|
||||
/// Page came back but the site signaled trouble — broken-page body
|
||||
/// signature, structural sentinel missing, etc. Caller should
|
||||
/// reschedule this fetch rather than treat it as data.
|
||||
#[error("transient page error: {reason}")]
|
||||
Transient { reason: String },
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl PageError {
|
||||
pub fn transient(reason: impl Into<String>) -> Self {
|
||||
Self::Transient { reason: reason.into() }
|
||||
}
|
||||
|
||||
pub fn is_transient(&self) -> bool {
|
||||
matches!(self, Self::Transient { .. })
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true when the response body matches the known broken-page
|
||||
/// template. Case-insensitive substring match — small bodies (~150B)
|
||||
/// make the scan trivially fast, and the broken page is always tiny so
|
||||
/// false positives on a real catalog page are not a concern.
|
||||
pub fn is_broken_page_body(html: &str) -> bool {
|
||||
html.to_ascii_lowercase().contains(BROKEN_PAGE_MARKER)
|
||||
}
|
||||
|
||||
/// Returns true when the parsed document contains `#logo` — the site's
|
||||
/// header logo element, present on every full-layout page and absent on
|
||||
/// the broken-page response and on the reader.
|
||||
pub fn has_logo_sentinel(doc: &scraper::Html) -> bool {
|
||||
let sel = scraper::Selector::parse("#logo").expect("#logo is a valid selector");
|
||||
doc.select(&sel).next().is_some()
|
||||
}
|
||||
|
||||
/// Retry `op` up to `max_attempts` times whenever it returns
|
||||
/// [`PageError::Transient`], sleeping `delay` between attempts.
|
||||
/// Non-transient errors short-circuit immediately. Used by discover-loop
|
||||
/// callers so a single broken page doesn't drop the whole walk — the
|
||||
/// caller can fall back on the job system's retry/backoff once the
|
||||
/// inline budget is exhausted.
|
||||
pub async fn retry_on_transient<F, Fut, T>(
|
||||
mut op: F,
|
||||
max_attempts: u32,
|
||||
delay: Duration,
|
||||
) -> Result<T, PageError>
|
||||
where
|
||||
F: FnMut() -> Fut,
|
||||
Fut: Future<Output = Result<T, PageError>>,
|
||||
{
|
||||
debug_assert!(max_attempts >= 1, "max_attempts must be at least 1");
|
||||
let mut attempt = 0u32;
|
||||
loop {
|
||||
attempt += 1;
|
||||
match op().await {
|
||||
Ok(v) => return Ok(v),
|
||||
Err(e) if !e.is_transient() => return Err(e),
|
||||
Err(e) if attempt >= max_attempts => return Err(e),
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
attempt,
|
||||
max_attempts,
|
||||
error = %e,
|
||||
"transient error; sleeping before retry"
|
||||
);
|
||||
tokio::time::sleep(delay).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // ---- is_broken_page_body -------------------------------------------

    #[test]
    fn broken_page_body_matches_exact_template() {
        let body = "<html><head></head><body>\
                    <p>we're sorry, the request file are not found. Σ(っ°Д °;)っ</p>\
                    </body></html>";
        assert!(is_broken_page_body(body));
    }

    #[test]
    fn broken_page_body_is_case_insensitive() {
        let shouted = "<p>WE'RE SORRY, THE REQUEST FILE ARE NOT FOUND.</p>";
        assert!(is_broken_page_body(shouted));
    }

    #[test]
    fn broken_page_body_does_not_match_normal_listing() {
        let listing = "<html><body><div id='logo'></div>\
                       <ul><li>Manga A</li><li>Manga B</li></ul></body></html>";
        assert!(!is_broken_page_body(listing));
    }

    #[test]
    fn broken_page_body_does_not_match_empty_string() {
        assert!(!is_broken_page_body(""));
    }

    // ---- has_logo_sentinel ---------------------------------------------

    #[test]
    fn logo_sentinel_present_on_normal_page() {
        let page = "<html><body><div id='logo'>Site</div><main>...</main></body></html>";
        let doc = scraper::Html::parse_document(page);
        assert!(has_logo_sentinel(&doc));
    }

    #[test]
    fn logo_sentinel_absent_on_broken_page() {
        let page = "<html><head></head><body>\
                    <p>we're sorry, the request file are not found.</p></body></html>";
        let doc = scraper::Html::parse_document(page);
        assert!(!has_logo_sentinel(&doc));
    }

    #[test]
    fn logo_sentinel_absent_on_empty_document() {
        let doc = scraper::Html::parse_document("");
        assert!(!has_logo_sentinel(&doc));
    }

    // ---- PageError -----------------------------------------------------

    #[test]
    fn page_error_transient_constructor_sets_reason() {
        let err = PageError::transient("logo missing");
        assert!(err.is_transient());
        assert_eq!(err.to_string(), "transient page error: logo missing");
    }

    #[test]
    fn page_error_other_is_not_transient() {
        let err: PageError = anyhow::anyhow!("something else").into();
        assert!(!err.is_transient());
    }

    // ---- retry_on_transient --------------------------------------------

    #[tokio::test]
    async fn retry_returns_ok_after_a_transient_streak() {
        let mut calls = 0u32;
        let outcome: Result<i32, PageError> = retry_on_transient(
            || {
                calls += 1;
                let nth = calls;
                async move {
                    if nth >= 3 {
                        Ok(42)
                    } else {
                        Err(PageError::transient("not yet"))
                    }
                }
            },
            5,
            Duration::from_millis(0),
        )
        .await;
        assert_eq!(outcome.unwrap(), 42);
        assert_eq!(calls, 3);
    }

    #[tokio::test]
    async fn retry_gives_up_after_max_attempts_on_persistent_transient() {
        let mut calls = 0u32;
        let outcome: Result<i32, PageError> = retry_on_transient(
            || {
                calls += 1;
                async { Err(PageError::transient("always")) }
            },
            3,
            Duration::from_millis(0),
        )
        .await;
        let err = outcome.expect_err("expected Transient");
        assert!(err.is_transient());
        assert_eq!(calls, 3, "retried max_attempts times, no more");
    }

    #[tokio::test]
    async fn retry_does_not_retry_non_transient_errors() {
        let mut calls = 0u32;
        let outcome: Result<i32, PageError> = retry_on_transient(
            || {
                calls += 1;
                async { Err(PageError::Other(anyhow::anyhow!("permanent"))) }
            },
            5,
            Duration::from_millis(0),
        )
        .await;
        let err = outcome.expect_err("expected a non-transient error");
        assert!(!err.is_transient());
        assert_eq!(calls, 1, "non-transient must fail immediately");
    }

    #[tokio::test]
    async fn retry_returns_ok_on_first_attempt_without_sleeping() {
        // The 60s delay would stall the test if the helper slept on success.
        let mut calls = 0u32;
        let outcome: Result<i32, PageError> = retry_on_transient(
            || {
                calls += 1;
                async { Ok(7) }
            },
            5,
            Duration::from_secs(60),
        )
        .await;
        assert_eq!(outcome.unwrap(), 7);
        assert_eq!(calls, 1);
    }
}
|
||||
@@ -17,6 +17,7 @@ pub mod browser;
|
||||
pub mod browser_manager;
|
||||
pub mod content;
|
||||
pub mod daemon;
|
||||
pub mod detect;
|
||||
pub mod diff;
|
||||
pub mod jobs;
|
||||
pub mod pipeline;
|
||||
|
||||
@@ -9,19 +9,39 @@
|
||||
//! Two things the cookie alone doesn't give us:
|
||||
//! 1. The cookie value is only meaningful to the *server* — we have
|
||||
//! no way to predict from the value alone whether it's still valid.
|
||||
//! `verify_session` does a navigation and checks for `#avatar_menu`,
|
||||
//! which only renders for authenticated visitors. Bail clean at
|
||||
//! startup if it's missing rather than discovering it 30 minutes
|
||||
//! into a backfill.
|
||||
//! `verify_session` does a navigation and inspects the probe page
|
||||
//! for three outcomes: broken-page response (transient — retry the
|
||||
//! probe), `#logo` present but `#avatar_menu` absent (genuine logout
|
||||
//! — bail loudly), or both present (authenticated). The earlier
|
||||
//! avatar-only check conflated "site is hiccuping" with "session is
|
||||
//! dead" and refused to start the crawler when the site had a brief
|
||||
//! 503.
|
||||
//! 2. The reqwest client (used for cover and chapter-image downloads)
|
||||
//! has its own cookie store; we seed it for the catalog host only.
|
||||
//! CDN hosts are deliberately *not* given the cookie — they serve
|
||||
//! image bytes by signed URLs and don't need it.
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use chromiumoxide::browser::Browser;
|
||||
use chromiumoxide::cdp::browser_protocol::network::CookieParam;
|
||||
|
||||
use crate::crawler::detect::{has_logo_sentinel, is_broken_page_body};
|
||||
|
||||
/// Result of classifying the HTML of a session probe page.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SessionProbe {
    /// Both `#logo` and `#avatar_menu` are present — the session is
    /// valid.
    Ok,
    /// `#logo` is present but `#avatar_menu` is not: the site rendered
    /// its normal layout for an unauthenticated visitor, so PHPSESSID
    /// needs to be refreshed.
    Unauthenticated,
    /// The body carries the broken-page signature or `#logo` is missing
    /// — the site is hiccuping. The caller retries the probe rather than
    /// blaming the session.
    Transient,
}
|
||||
|
||||
/// Compute the cookie domain (e.g. `.example.com`) from a start URL.
|
||||
/// The leading dot makes the cookie cover every subdomain — the source
|
||||
/// often redirects between `www.` and other prefixes mid-crawl, and a
|
||||
@@ -86,34 +106,86 @@ pub async fn inject_phpsessid(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Navigate to `probe_url` and confirm the logged-in `#avatar_menu`
|
||||
/// element is present. The selector only renders for authenticated
|
||||
/// visitors, so its absence is the unambiguous signal that PHPSESSID
|
||||
/// is missing, expired, or revoked.
|
||||
/// Three-way classification of a probe-page response. Pure over HTML so
|
||||
/// it's unit-testable without a real browser. Order matters: a body
|
||||
/// matching the broken-page template is `Transient` even if the page
|
||||
/// happens to contain `#avatar_menu` HTML somewhere — trust the universal
|
||||
/// site signal over a stray selector match.
|
||||
pub fn classify_probe(html: &str) -> SessionProbe {
|
||||
if is_broken_page_body(html) {
|
||||
return SessionProbe::Transient;
|
||||
}
|
||||
let doc = scraper::Html::parse_document(html);
|
||||
if !has_logo_sentinel(&doc) {
|
||||
return SessionProbe::Transient;
|
||||
}
|
||||
let avatar_sel = scraper::Selector::parse("#avatar_menu").unwrap();
|
||||
if doc.select(&avatar_sel).next().is_some() {
|
||||
SessionProbe::Ok
|
||||
} else {
|
||||
SessionProbe::Unauthenticated
|
||||
}
|
||||
}
|
||||
|
||||
/// In-startup retry budget for the session probe. Small but non-zero —
|
||||
/// startup hitting a 5-second site hiccup shouldn't fail the operator
|
||||
/// with "PHPSESSID expired" when the session is actually fine.
|
||||
const PROBE_MAX_ATTEMPTS: u32 = 3;
|
||||
const PROBE_RETRY_DELAY: Duration = Duration::from_secs(2);
|
||||
|
||||
/// Navigate to `probe_url` and classify the response. Retries the probe
|
||||
/// on `Transient` outcomes (broken-page body, missing `#logo`); fails
|
||||
/// fast on `Unauthenticated`; returns `Ok(())` on success.
|
||||
///
|
||||
/// This burns one navigation against the catalog's rate limiter. The
|
||||
/// trade is worth it — failing here costs ~1s; failing 30 minutes into
|
||||
/// a backfill costs 30 minutes.
|
||||
/// This burns one navigation per attempt against the catalog's rate
|
||||
/// limiter. The trade is worth it — failing here costs ~1s; failing 30
|
||||
/// minutes into a backfill costs 30 minutes.
|
||||
pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> {
|
||||
let mut attempt = 0u32;
|
||||
loop {
|
||||
attempt += 1;
|
||||
let html = fetch_probe_html(browser, probe_url).await?;
|
||||
match classify_probe(&html) {
|
||||
SessionProbe::Ok => {
|
||||
tracing::info!(attempt, "session probe ok — #logo + #avatar_menu present");
|
||||
return Ok(());
|
||||
}
|
||||
SessionProbe::Unauthenticated => {
|
||||
return Err(anyhow!(
|
||||
"session probe failed — #avatar_menu not present at {probe_url} \
|
||||
(page rendered the normal layout); PHPSESSID is missing, expired, \
|
||||
or revoked. Refresh CRAWLER_PHPSESSID and re-run."
|
||||
));
|
||||
}
|
||||
SessionProbe::Transient if attempt < PROBE_MAX_ATTEMPTS => {
|
||||
tracing::warn!(
|
||||
attempt,
|
||||
max_attempts = PROBE_MAX_ATTEMPTS,
|
||||
"session probe got a transient page; retrying"
|
||||
);
|
||||
tokio::time::sleep(PROBE_RETRY_DELAY).await;
|
||||
}
|
||||
SessionProbe::Transient => {
|
||||
return Err(anyhow!(
|
||||
"session probe failed — probe page at {probe_url} returned a \
|
||||
broken-page response after {PROBE_MAX_ATTEMPTS} attempts. \
|
||||
The site appears to be down or rate-limiting us; try again \
|
||||
later before refreshing CRAWLER_PHPSESSID."
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Opens `probe_url` in a new tab, waits for the navigation to finish
/// and returns the page's HTML.
///
/// The tab is closed on every path once it has been opened — including
/// when waiting for navigation or reading the content fails — so a
/// failing probe does not leave tabs behind across retries. A failure
/// to close is ignored (best effort), as before.
///
/// NOTE(review): assumes dropping a `Page` handle does not close the tab
/// by itself — confirm against the chromiumoxide version in use.
///
/// # Errors
/// Returns an error with context when the tab cannot be opened, the
/// navigation wait fails, or the content cannot be read.
async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<String> {
    let page = browser
        .new_page(probe_url)
        .await
        .with_context(|| format!("open probe page {probe_url}"))?;

    // Run the fallible steps inside one block so the tab can be closed
    // before any error is propagated.
    let html: anyhow::Result<String> = async {
        page.wait_for_navigation()
            .await
            .context("wait for nav on probe")?;
        page.content().await.context("read probe html")
    }
    .await;

    page.close().await.ok();
    html
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -158,4 +230,59 @@ mod tests {
|
||||
fn registrable_domain_returns_none_for_garbage() {
|
||||
assert!(registrable_domain("not a url").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn classify_probe_ok_when_logo_and_avatar_present() {
|
||||
let html = r#"<html><body>
|
||||
<header><div id="logo">Target</div><div id="avatar_menu"></div></header>
|
||||
</body></html>"#;
|
||||
assert_eq!(classify_probe(html), SessionProbe::Ok);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn classify_probe_unauth_when_logo_present_but_avatar_absent() {
|
||||
// Real "logged out" response: site layout renders fine, just no
|
||||
// avatar widget. This is the only state that should blame the
|
||||
// session cookie.
|
||||
let html = r#"<html><body>
|
||||
<header><div id="logo">Target</div></header>
|
||||
<main>Please log in.</main>
|
||||
</body></html>"#;
|
||||
assert_eq!(classify_probe(html), SessionProbe::Unauthenticated);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn classify_probe_transient_on_broken_page_body() {
|
||||
let html = "<html><body>\
|
||||
<p>we're sorry, the request file are not found.</p>\
|
||||
</body></html>";
|
||||
assert_eq!(classify_probe(html), SessionProbe::Transient);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn classify_probe_transient_when_logo_missing() {
|
||||
// No broken-body marker, but no site layout either — treat as
|
||||
// transient (could be a Cloudflare interstitial, a 5xx page,
|
||||
// etc.) rather than blaming the session.
|
||||
let html = "<html><body><h1>Service Unavailable</h1></body></html>";
|
||||
assert_eq!(classify_probe(html), SessionProbe::Transient);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn classify_probe_transient_on_empty_response() {
|
||||
assert_eq!(classify_probe(""), SessionProbe::Transient);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn classify_probe_trusts_broken_body_over_stray_avatar_match() {
|
||||
// Defensive: if a broken-page body somehow contains an
|
||||
// #avatar_menu element (e.g. an unrelated debug page on the
|
||||
// same template), the body signature still wins.
|
||||
let html = r#"<html><body>
|
||||
<p>we're sorry, the request file are not found.</p>
|
||||
<div id="logo"></div>
|
||||
<div id="avatar_menu"></div>
|
||||
</body></html>"#;
|
||||
assert_eq!(classify_probe(html), SessionProbe::Transient);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,6 +17,16 @@ use super::{
|
||||
DiscoverMode, FetchContext, Source, SourceChapter, SourceChapterRef, SourceManga,
|
||||
SourceMangaRef,
|
||||
};
|
||||
use crate::crawler::detect::{
|
||||
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
|
||||
};
|
||||
|
||||
/// In-loop retry budget for transient pages encountered during a single
|
||||
/// `discover` walk. Bounded small because the job system itself retries
|
||||
/// the whole `Discover` job on failure — these inline retries only need
|
||||
/// to absorb a brief site hiccup mid-walk.
|
||||
const PAGE_TRANSIENT_RETRY_ATTEMPTS: u32 = 3;
|
||||
const PAGE_TRANSIENT_RETRY_DELAY: Duration = Duration::from_secs(2);
|
||||
|
||||
pub struct TargetSource {
|
||||
base_url: String,
|
||||
@@ -60,9 +70,15 @@ impl Source for TargetSource {
|
||||
max_results: Option<usize>,
|
||||
) -> anyhow::Result<Vec<SourceMangaRef>> {
|
||||
// Always visit page 1 first because that's the only way to
|
||||
// discover `last_page`. We cache the HTML so we don't have to
|
||||
// re-navigate when the iteration reaches page 1 again.
|
||||
let first_html = navigate(ctx, self.base_url.as_str()).await?;
|
||||
// discover `last_page`. Retry it on transient — a broken first
|
||||
// page would otherwise abort the whole walk before we've even
|
||||
// started.
|
||||
let first_html = retry_on_transient(
|
||||
|| async { navigate(ctx, self.base_url.as_str()).await },
|
||||
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
||||
PAGE_TRANSIENT_RETRY_DELAY,
|
||||
)
|
||||
.await?;
|
||||
let last_page = {
|
||||
let doc = scraper::Html::parse_document(&first_html);
|
||||
parse_last_page(&doc)
|
||||
@@ -87,14 +103,25 @@ impl Source for TargetSource {
|
||||
|
||||
let mut all = Vec::new();
|
||||
for page_num in order {
|
||||
let html = if page_num == 1 {
|
||||
first_html.clone()
|
||||
// Page 1 is already cached from the last_page probe — reuse
|
||||
// it rather than navigating twice. Every other page goes
|
||||
// through the retry helper so a single broken page mid-walk
|
||||
// doesn't silently drop its mangas from the result.
|
||||
let mut page_refs = if page_num == 1 {
|
||||
let doc = scraper::Html::parse_document(&first_html);
|
||||
parse_manga_list_from(&doc)?
|
||||
} else {
|
||||
navigate(ctx, &page_url(&self.base_url, page_num)).await?
|
||||
};
|
||||
let mut page_refs = {
|
||||
let doc = scraper::Html::parse_document(&html);
|
||||
parse_manga_list_from(&doc)
|
||||
retry_on_transient(
|
||||
|| async {
|
||||
let url = page_url(&self.base_url, page_num);
|
||||
let html = navigate(ctx, &url).await?;
|
||||
let doc = scraper::Html::parse_document(&html);
|
||||
parse_manga_list_from(&doc)
|
||||
},
|
||||
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
||||
PAGE_TRANSIENT_RETRY_DELAY,
|
||||
)
|
||||
.await?
|
||||
};
|
||||
if backfill {
|
||||
page_refs.reverse();
|
||||
@@ -116,8 +143,12 @@ impl Source for TargetSource {
|
||||
r: &SourceMangaRef,
|
||||
) -> anyhow::Result<SourceManga> {
|
||||
let html = navigate(ctx, r.url.as_str()).await?;
|
||||
parse_manga_detail(&html, &r.source_manga_key, self.parse_chapters)
|
||||
.with_context(|| format!("parse manga detail at {}", r.url))
|
||||
// Convert PageError → anyhow::Error via `?`. PageError stays
|
||||
// downcastable from the wrapped anyhow::Error so the pipeline
|
||||
// can still recognize Transient via `error.downcast_ref::<PageError>()`.
|
||||
let manga = parse_manga_detail(&html, &r.source_manga_key, self.parse_chapters)
|
||||
.with_context(|| format!("parse manga detail at {}", r.url))?;
|
||||
Ok(manga)
|
||||
}
|
||||
|
||||
async fn fetch_chapter_list(
|
||||
@@ -150,16 +181,39 @@ fn truncate_to_cap<T>(mut buf: Vec<T>, max: Option<usize>) -> Vec<T> {
|
||||
|
||||
/// Single point of rate-limited navigation. Every Source request goes
|
||||
/// through here, so the per-host limiter map is the only knob that
|
||||
/// controls per-origin RPS.
|
||||
async fn navigate(ctx: &FetchContext<'_>, url: &str) -> anyhow::Result<String> {
|
||||
/// controls per-origin RPS. Also the choke point for transient-page
|
||||
/// detection — every fetched body is screened by
|
||||
/// [`classify_navigate_html`] before being handed to a selector.
|
||||
async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError> {
|
||||
ctx.rate.wait_for(url).await?;
|
||||
let page = ctx.browser.new_page(url).await?;
|
||||
page.wait_for_navigation().await?;
|
||||
let page = ctx
|
||||
.browser
|
||||
.new_page(url)
|
||||
.await
|
||||
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
|
||||
page.wait_for_navigation()
|
||||
.await
|
||||
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
|
||||
// Stopgap until we wait on a specific selector per page type —
|
||||
// gives any post-load JS a beat to finish injecting content.
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
let html = page.content().await?;
|
||||
page.close().await?;
|
||||
let html = page
|
||||
.content()
|
||||
.await
|
||||
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
|
||||
page.close().await.ok();
|
||||
classify_navigate_html(html)
|
||||
}
|
||||
|
||||
/// Classify a fetched body. The broken-page template is universal across
|
||||
/// the site — every page type (list, detail, chapter list, reader) gets
|
||||
/// the same `we're sorry, the request file are not found` body when the
|
||||
/// server is hiccuping. Catching it here means individual parsers
|
||||
/// downstream don't have to repeat the check.
|
||||
fn classify_navigate_html(html: String) -> Result<String, PageError> {
|
||||
if is_broken_page_body(&html) {
|
||||
return Err(PageError::transient("broken-page body signature"));
|
||||
}
|
||||
Ok(html)
|
||||
}
|
||||
|
||||
@@ -204,14 +258,23 @@ fn page_url(template_url: &str, page: i32) -> String {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn parse_manga_list(html: &str) -> Vec<SourceMangaRef> {
|
||||
fn parse_manga_list(html: &str) -> Result<Vec<SourceMangaRef>, PageError> {
|
||||
let doc = scraper::Html::parse_document(html);
|
||||
parse_manga_list_from(&doc)
|
||||
}
|
||||
|
||||
fn parse_manga_list_from(doc: &scraper::Html) -> Vec<SourceMangaRef> {
|
||||
/// Parse a manga listing page. `#logo` is present on every well-formed
|
||||
/// listing page on the source; its absence means the response is a
|
||||
/// broken-page placeholder (transient) rather than a genuinely empty
|
||||
/// listing. Empty listings (last-page tail, search with no hits) remain
|
||||
/// `Ok(vec![])`.
|
||||
fn parse_manga_list_from(doc: &scraper::Html) -> Result<Vec<SourceMangaRef>, PageError> {
|
||||
if !has_logo_sentinel(doc) {
|
||||
return Err(PageError::transient("manga list: #logo sentinel missing"));
|
||||
}
|
||||
let sel = scraper::Selector::parse("#left_side .pic_list .updatesli span a").unwrap();
|
||||
doc.select(&sel)
|
||||
Ok(doc
|
||||
.select(&sel)
|
||||
.filter_map(|a| {
|
||||
let url = a.value().attr("href")?.trim().to_string();
|
||||
if url.is_empty() {
|
||||
@@ -227,16 +290,22 @@ fn parse_manga_list_from(doc: &scraper::Html) -> Vec<SourceMangaRef> {
|
||||
url,
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
.collect())
|
||||
}
|
||||
|
||||
fn parse_manga_detail(
|
||||
html: &str,
|
||||
key: &str,
|
||||
include_chapters: bool,
|
||||
) -> anyhow::Result<SourceManga> {
|
||||
) -> Result<SourceManga, PageError> {
|
||||
let doc = scraper::Html::parse_document(html);
|
||||
|
||||
// Sentinel first: a broken-page response will trip this before any
|
||||
// anyhow context is added for missing required fields.
|
||||
if !has_logo_sentinel(&doc) {
|
||||
return Err(PageError::transient("manga detail: #logo sentinel missing"));
|
||||
}
|
||||
|
||||
let title = first_text(&doc, ".w-title h1").context("missing .w-title h1")?;
|
||||
let summary = first_text(&doc, ".manga_summary");
|
||||
let cover_url = first_attr(&doc, ".cover > img:nth-child(1)", "src");
|
||||
@@ -494,6 +563,7 @@ mod tests {
|
||||
|
||||
const LISTING_HTML: &str = r#"
|
||||
<html><body>
|
||||
<header><div id="logo">Target</div></header>
|
||||
<div id="left_side">
|
||||
<div class="pic_list">
|
||||
<div class="updatesli">
|
||||
@@ -512,6 +582,7 @@ mod tests {
|
||||
|
||||
const DETAIL_HTML: &str = r#"
|
||||
<html><body>
|
||||
<header><div id="logo">Target</div></header>
|
||||
<div class="w-title"><h1>Test Manga Title</h1></div>
|
||||
<div class="cover"><img src="/cover.jpg"><img src="/extra-not-cover.jpg"></div>
|
||||
<div class="manga_summary">A summary of the manga.</div>
|
||||
@@ -537,7 +608,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_manga_list_extracts_title_url_and_derives_key() {
|
||||
let refs = parse_manga_list(LISTING_HTML);
|
||||
let refs = parse_manga_list(LISTING_HTML).expect("parse");
|
||||
assert_eq!(refs.len(), 2, "third entry has empty href and is skipped");
|
||||
assert_eq!(refs[0].title, "Foo Manga");
|
||||
assert_eq!(refs[0].url, "https://target.example/manga/foo");
|
||||
@@ -546,6 +617,30 @@ mod tests {
|
||||
assert_eq!(refs[1].source_manga_key, "bar-baz");
|
||||
}
|
||||
|
||||
#[test]
fn parse_manga_list_returns_transient_when_logo_missing() {
    // Broken-page response: no #logo, no listing. Empty Vec would
    // hide this as "page has no mangas"; Transient is the signal
    // upstream code retries on.
    //
    // The fixture is a plain multi-line raw string: raw strings do not
    // process escapes, so a trailing `\` would end up in the markup as a
    // literal backslash instead of joining the lines.
    let html = r#"<html><body>
        <p>we're sorry, the request file are not found.</p>
        </body></html>"#;
    let err = parse_manga_list(html).expect_err("expected Transient");
    assert!(err.is_transient(), "got non-transient: {err}");
}
|
||||
|
||||
#[test]
fn parse_manga_list_ok_empty_when_logo_present_but_no_items() {
    // Last page of pagination, "no results" search, etc. Legitimately
    // empty must stay distinguishable from "page is broken".
    //
    // The fixture is a plain multi-line raw string: raw strings do not
    // process escapes, so a trailing `\` would end up in the markup as a
    // literal backslash instead of joining the lines.
    let html = r#"<html><body>
        <header><div id="logo">Target</div></header>
        <div id="left_side"><div class="pic_list"></div></div>
        </body></html>"#;
    let refs = parse_manga_list(html).expect("logo present == not transient");
    assert!(refs.is_empty());
}
|
||||
|
||||
#[test]
|
||||
fn parse_manga_detail_pulls_all_fields() {
|
||||
let m = parse_manga_detail(DETAIL_HTML, "test-key", true).expect("parse");
|
||||
@@ -761,7 +856,9 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn missing_optional_fields_parse_to_none() {
|
||||
let html = r#"<html><body><div class="w-title"><h1>Minimal</h1></div></body></html>"#;
|
||||
let html = r#"<html><body>\
|
||||
<header><div id="logo">Target</div></header>\
|
||||
<div class="w-title"><h1>Minimal</h1></div></body></html>"#;
|
||||
let m = parse_manga_detail(html, "min", true).unwrap();
|
||||
assert_eq!(m.title, "Minimal");
|
||||
assert!(m.summary.is_none());
|
||||
@@ -785,8 +882,44 @@ mod tests {
|
||||
|
||||
#[test]
fn parse_manga_detail_errors_on_missing_title() {
    // Logo present (page is alive) — failure here is a real parse
    // miss (Other), not Transient.
    //
    // The fixture is a plain multi-line raw string: raw strings do not
    // process escapes, so a trailing `\` would end up in the markup as a
    // literal backslash instead of joining the lines.
    let html = r#"<html><body>
        <header><div id="logo">Target</div></header>
        <p>nothing</p></body></html>"#;
    let err = parse_manga_detail(html, "x", true).unwrap_err();
    assert!(!err.is_transient(), "expected Other, got Transient: {err}");
    assert!(err.to_string().contains("missing .w-title h1"));
}
|
||||
|
||||
#[test]
fn classify_navigate_html_passes_normal_body_through() {
    // A body without the broken-page signature must come back exactly
    // as it went in.
    let original = String::from(
        "<html><body><header><div id='logo'>Target</div></header>\
         <p>content</p></body></html>",
    );
    let returned = classify_navigate_html(original.clone()).expect("ok");
    assert_eq!(returned, original);
}
|
||||
|
||||
#[test]
fn classify_navigate_html_returns_transient_for_broken_template() {
    // The placeholder body must be rejected, and the rejection must be
    // flagged as transient.
    let broken = String::from(
        "<html><head></head><body>\
         <p>we're sorry, the request file are not found.</p>\
         </body></html>",
    );
    let outcome = classify_navigate_html(broken);
    let err = outcome.expect_err("expected Transient");
    assert!(err.is_transient(), "got non-transient: {err}");
}
|
||||
|
||||
#[test]
fn parse_manga_detail_returns_transient_when_logo_missing() {
    // A detail URL answered with the broken-page placeholder has to
    // surface as Transient, so the job gets retried instead of logging
    // "missing .w-title h1" against a manga that is then skipped for good.
    let broken_page = "<html><body>\
         <p>we're sorry, the request file are not found.</p>\
         </body></html>";
    let outcome = parse_manga_detail(broken_page, "x", true);
    let err = outcome.expect_err("expected Transient");
    assert!(err.is_transient(), "got non-transient: {err}");
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user