feat: transient-page detection across the crawler (0.30.0)

Until now, when the target site returned its 403 "we're sorry, the request file are not found" response on a page that actually exists, selectors matched nothing and the crawler treated the page as "legitimately empty". Pagination walks silently dropped whole pages' worth of manga, fetch_manga skipped individual entries, and the startup session probe blamed PHPSESSID for what was a site hiccup.

This branch adds a single detection layer that the whole pipeline routes through:

- `crawler::detect`: PageError::Transient typed signal, plus two primitives (`is_broken_page_body` matches the universal 403 body; `has_logo_sentinel` asserts #logo, the site-wide header element) and a `retry_on_transient` helper that retries a closure on Transient with a small attempt budget.
- `navigate()` screens every fetched body for the broken-page signature before handing it to a selector.
- Parsers (`parse_manga_list_from`, `parse_manga_detail`, `parse_chapter_pages`) check their structural sentinels (#logo for full-layout pages; a#pic_container for the reader, which doesn't render #logo) and return Result<_, PageError>. Empty Vec is now reserved for genuinely empty pages.
- `discover()` retries each pagination page up to 3× (2s apart) before failing the whole Discover job — at which point the existing job system's retry/backoff takes over for longer outages.
- `verify_session` is three-state: broken-page → retry probe; #logo present but #avatar_menu absent → genuine logout (the only state that should blame PHPSESSID); both present → ok.

Test coverage added at the helper level: 13 unit tests for the detection module (body signature, logo sentinel, PageError, retry helper), parser-level tests for both transient and legitimately-empty inputs, and 6 unit tests for the session probe classifier.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 22:47:21 +02:00
parent b845d88766
commit 9ff49166a5
8 changed files with 594 additions and 59 deletions
--- a/backend/src/crawler/content.rs
+++ b/backend/src/crawler/content.rs
@@ -16,6 +16,7 @@ use anyhow::Context;
 use sqlx::PgPool;
 use uuid::Uuid;

+use crate::crawler::detect::PageError;
 use crate::crawler::rate_limit::HostRateLimiters;
 use crate::crawler::session;
 use crate::storage::Storage;
@@ -23,8 +24,18 @@ use crate::storage::Storage;
 /// Parse the chapter page DOM and return the page images in `pageN`
 /// order. Filters out the loader `<img class="loading">` and any
 /// `<img>` without a numeric `id="pageN"`.
-pub fn parse_chapter_pages(html: &str) -> Vec<ChapterImage> {
+///
+/// Reader pages don't render the site's `#logo` element, so the
+/// universal logo-sentinel can't apply here — instead we assert
+/// `a#pic_container` is present. Its absence means the response is the
+/// transient broken-page response (or a redirect to some other layout)
+/// and the caller should retry.
+pub fn parse_chapter_pages(html: &str) -> Result<Vec<ChapterImage>, PageError> {
    let doc = scraper::Html::parse_document(html);
+    let container_sel = scraper::Selector::parse("a#pic_container").unwrap();
+    if doc.select(&container_sel).next().is_none() {
+        return Err(PageError::transient("reader: a#pic_container missing"));
+    }
    let sel = scraper::Selector::parse("a#pic_container img:not(.loading)").unwrap();
    let mut pages: Vec<ChapterImage> = doc
        .select(&sel)
@@ -39,7 +50,7 @@ pub fn parse_chapter_pages(html: &str) -> Vec<ChapterImage> {
        })
        .collect();
    pages.sort_by_key(|p| p.page_number);
-    pages
+    Ok(pages)
 }

 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -109,7 +120,8 @@ pub async fn sync_chapter_content(
    let html = page.content().await.context("read chapter html")?;
    page.close().await.ok();

-    let images = parse_chapter_pages(&html);
+    let images = parse_chapter_pages(&html)
+        .with_context(|| format!("parse chapter pages at {source_url}"))?;
    if images.is_empty() {
        anyhow::bail!("no page images parsed from {source_url}");
    }
@@ -205,7 +217,7 @@ mod tests {
            <img id="not-a-page" src="https://cdn/not-a-page.jpg">
          </a></body></html>
        "#;
-        let pages = parse_chapter_pages(html);
+        let pages = parse_chapter_pages(html).expect("parse");
        assert_eq!(pages.len(), 2);
        assert_eq!(pages[0].page_number, 1);
        assert_eq!(pages[0].url, "https://cdn/1.jpg");
@@ -221,7 +233,7 @@ mod tests {
            <img id="page2" src="https://cdn/2.jpg">
          </a>
        "#;
-        let pages = parse_chapter_pages(html);
+        let pages = parse_chapter_pages(html).expect("parse");
        assert_eq!(pages.len(), 1);
        assert_eq!(pages[0].page_number, 2);
    }
@@ -235,10 +247,22 @@ mod tests {
            <img id="page50" src="https://cdn/50.jpg">
          </a>
        "#;
-        let pages = parse_chapter_pages(html);
+        let pages = parse_chapter_pages(html).expect("parse");
        assert_eq!(
            pages.iter().map(|p| p.page_number).collect::<Vec<_>>(),
            vec![9, 50, 126]
        );
    }
+
+    #[test]
+    fn parse_chapter_pages_returns_transient_when_container_missing() {
+        // Reader doesn't render #logo, so the universal logo sentinel
+        // can't be used here — a#pic_container is the reader-specific
+        // marker. Broken-page response trips this.
+        let html = "<html><body>\
+            <p>we're sorry, the request file are not found.</p>\
+            </body></html>";
+        let err = parse_chapter_pages(html).expect_err("expected Transient");
+        assert!(err.is_transient(), "got non-transient: {err}");
+    }
 }