feat: transient-page detection across the crawler (0.30.0)
Until now, when the target site returned its 403 "we're sorry, the request file are not found" response on a page that actually exists, selectors matched nothing and the crawler treated the page as "legitimately empty". Pagination walks silently dropped whole pages' worth of mangas, fetch_manga skipped individual entries, and the startup session probe blamed PHPSESSID for what was a site hiccup.

This branch adds a single detection layer that the whole pipeline routes through:

- `crawler::detect`: PageError::Transient typed signal, plus two primitives (`is_broken_page_body` matches the universal 403 body; `has_logo_sentinel` asserts #logo, the site-wide header element) and a `retry_on_transient` helper that retries a closure on Transient with a small attempt budget.
- `navigate()` screens every fetched body for the broken-page signature before handing it to a selector.
- Parsers (`parse_manga_list_from`, `parse_manga_detail`, `parse_chapter_pages`) check their structural sentinels (#logo for full-layout pages; a#pic_container for the reader, which doesn't render #logo) and return Result<_, PageError>. Empty Vec is now reserved for genuinely empty pages.
- `discover()` retries each pagination page up to 3× (2s apart) before failing the whole Discover job — at which point the existing job system's retry/backoff takes over for longer outages.
- `verify_session` is three-state: broken-page → retry probe; #logo present but #avatar_menu absent → genuine logout (the only state that should blame PHPSESSID); both present → ok.

Test coverage added at the helper level: 13 unit tests for the detection module (body signature, logo sentinel, PageError, retry helper), parser-level tests for both transient and legitimately-empty inputs, and 6 unit tests for the session probe classifier.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@ use anyhow::Context;
|
||||
use sqlx::PgPool;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::crawler::detect::PageError;
|
||||
use crate::crawler::rate_limit::HostRateLimiters;
|
||||
use crate::crawler::session;
|
||||
use crate::storage::Storage;
|
||||
@@ -23,8 +24,18 @@ use crate::storage::Storage;
|
||||
/// Parse the chapter page DOM and return the page images in `pageN`
|
||||
/// order. Filters out the loader `<img class="loading">` and any
|
||||
/// `<img>` without a numeric `id="pageN"`.
|
||||
pub fn parse_chapter_pages(html: &str) -> Vec<ChapterImage> {
|
||||
///
|
||||
/// Reader pages don't render the site's `#logo` element, so the
|
||||
/// universal logo-sentinel can't apply here — instead we assert
|
||||
/// `a#pic_container` is present. Its absence means the response is the
|
||||
/// transient broken-page response (or a redirect to some other layout)
|
||||
/// and the caller should retry.
|
||||
pub fn parse_chapter_pages(html: &str) -> Result<Vec<ChapterImage>, PageError> {
|
||||
let doc = scraper::Html::parse_document(html);
|
||||
let container_sel = scraper::Selector::parse("a#pic_container").unwrap();
|
||||
if doc.select(&container_sel).next().is_none() {
|
||||
return Err(PageError::transient("reader: a#pic_container missing"));
|
||||
}
|
||||
let sel = scraper::Selector::parse("a#pic_container img:not(.loading)").unwrap();
|
||||
let mut pages: Vec<ChapterImage> = doc
|
||||
.select(&sel)
|
||||
@@ -39,7 +50,7 @@ pub fn parse_chapter_pages(html: &str) -> Vec<ChapterImage> {
|
||||
})
|
||||
.collect();
|
||||
pages.sort_by_key(|p| p.page_number);
|
||||
pages
|
||||
Ok(pages)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
@@ -109,7 +120,8 @@ pub async fn sync_chapter_content(
|
||||
let html = page.content().await.context("read chapter html")?;
|
||||
page.close().await.ok();
|
||||
|
||||
let images = parse_chapter_pages(&html);
|
||||
let images = parse_chapter_pages(&html)
|
||||
.with_context(|| format!("parse chapter pages at {source_url}"))?;
|
||||
if images.is_empty() {
|
||||
anyhow::bail!("no page images parsed from {source_url}");
|
||||
}
|
||||
@@ -205,7 +217,7 @@ mod tests {
|
||||
<img id="not-a-page" src="https://cdn/not-a-page.jpg">
|
||||
</a></body></html>
|
||||
"#;
|
||||
let pages = parse_chapter_pages(html);
|
||||
let pages = parse_chapter_pages(html).expect("parse");
|
||||
assert_eq!(pages.len(), 2);
|
||||
assert_eq!(pages[0].page_number, 1);
|
||||
assert_eq!(pages[0].url, "https://cdn/1.jpg");
|
||||
@@ -221,7 +233,7 @@ mod tests {
|
||||
<img id="page2" src="https://cdn/2.jpg">
|
||||
</a>
|
||||
"#;
|
||||
let pages = parse_chapter_pages(html);
|
||||
let pages = parse_chapter_pages(html).expect("parse");
|
||||
assert_eq!(pages.len(), 1);
|
||||
assert_eq!(pages[0].page_number, 2);
|
||||
}
|
||||
@@ -235,10 +247,22 @@ mod tests {
|
||||
<img id="page50" src="https://cdn/50.jpg">
|
||||
</a>
|
||||
"#;
|
||||
let pages = parse_chapter_pages(html);
|
||||
let pages = parse_chapter_pages(html).expect("parse");
|
||||
assert_eq!(
|
||||
pages.iter().map(|p| p.page_number).collect::<Vec<_>>(),
|
||||
vec![9, 50, 126]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_chapter_pages_returns_transient_when_container_missing() {
|
||||
// Reader doesn't render #logo, so the universal logo sentinel
|
||||
// can't be used here — a#pic_container is the reader-specific
|
||||
// marker. Broken-page response trips this.
|
||||
let html = "<html><body>\
|
||||
<p>we're sorry, the request file are not found.</p>\
|
||||
</body></html>";
|
||||
let err = parse_chapter_pages(html).expect_err("expected Transient");
|
||||
assert!(err.is_transient(), "got non-transient: {err}");
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user