feat: transient-page detection across the crawler (0.30.0)

Until now, when the target site returned its 403 "we're sorry, the request file are not found" response on a page that actually exists, selectors matched nothing and the crawler treated the page as "legitimately empty". Pagination walks silently dropped whole pages worth of mangas, fetch_manga skipped individual entries, and the startup session probe blamed PHPSESSID for what was a site hiccup.

This branch adds a single detection layer that the whole pipeline routes through:

- `crawler::detect`: PageError::Transient typed signal, plus two primitives (`is_broken_page_body` matches the universal 403 body; `has_logo_sentinel` asserts #logo, the site-wide header element) and a `retry_on_transient` helper that retries a closure on Transient with a small attempt budget.
- `navigate()` screens every fetched body for the broken-page signature before handing it to a selector.
- Parsers (`parse_manga_list_from`, `parse_manga_detail`, `parse_chapter_pages`) check their structural sentinels (#logo for full-layout pages; a#pic_container for the reader, which doesn't render #logo) and return Result<_, PageError>. Empty Vec is now reserved for genuinely empty pages.
- `discover()` retries each pagination page up to 3× (2s apart) before failing the whole Discover job — at which point the existing job system's retry/backoff takes over for longer outages.
- `verify_session` is three-state: broken-page → retry probe; #logo present but #avatar_menu absent → genuine logout (the only state that should blame PHPSESSID); both present → ok.

Test coverage added at the helper level: 13 unit tests for the detection module (body signature, logo sentinel, PageError, retry helper), parser-level tests for both transient and legitimately-empty inputs, and 6 unit tests for the session probe classifier.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 22:47:21 +02:00
parent b845d88766
commit 9ff49166a5
8 changed files with 594 additions and 59 deletions
--- a/backend/src/crawler/source/target.rs
+++ b/backend/src/crawler/source/target.rs
@@ -17,6 +17,16 @@ use super::{
    DiscoverMode, FetchContext, Source, SourceChapter, SourceChapterRef, SourceManga,
    SourceMangaRef,
 };
+use crate::crawler::detect::{
+    has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
+};
+
+/// In-loop retry budget for transient pages encountered during a single
+/// `discover` walk. Bounded small because the job system itself retries
+/// the whole `Discover` job on failure — these inline retries only need
+/// to absorb a brief site hiccup mid-walk.
+const PAGE_TRANSIENT_RETRY_ATTEMPTS: u32 = 3;
+const PAGE_TRANSIENT_RETRY_DELAY: Duration = Duration::from_secs(2);

 pub struct TargetSource {
    base_url: String,
@@ -60,9 +70,15 @@ impl Source for TargetSource {
        max_results: Option<usize>,
    ) -> anyhow::Result<Vec<SourceMangaRef>> {
        // Always visit page 1 first because that's the only way to
-        // discover `last_page`. We cache the HTML so we don't have to
-        // re-navigate when the iteration reaches page 1 again.
-        let first_html = navigate(ctx, self.base_url.as_str()).await?;
+        // discover `last_page`. Retry it on transient — a broken first
+        // page would otherwise abort the whole walk before we've even
+        // started.
+        let first_html = retry_on_transient(
+            || async { navigate(ctx, self.base_url.as_str()).await },
+            PAGE_TRANSIENT_RETRY_ATTEMPTS,
+            PAGE_TRANSIENT_RETRY_DELAY,
+        )
+        .await?;
        let last_page = {
            let doc = scraper::Html::parse_document(&first_html);
            parse_last_page(&doc)
@@ -87,14 +103,25 @@ impl Source for TargetSource {

        let mut all = Vec::new();
        for page_num in order {
-            let html = if page_num == 1 {
-                first_html.clone()
+            // Page 1 is already cached from the last_page probe — reuse
+            // it rather than navigating twice. Every other page goes
+            // through the retry helper so a single broken page mid-walk
+            // doesn't silently drop its mangas from the result.
+            let mut page_refs = if page_num == 1 {
+                let doc = scraper::Html::parse_document(&first_html);
+                parse_manga_list_from(&doc)?
            } else {
-                navigate(ctx, &page_url(&self.base_url, page_num)).await?
-            };
-            let mut page_refs = {
-                let doc = scraper::Html::parse_document(&html);
-                parse_manga_list_from(&doc)
+                retry_on_transient(
+                    || async {
+                        let url = page_url(&self.base_url, page_num);
+                        let html = navigate(ctx, &url).await?;
+                        let doc = scraper::Html::parse_document(&html);
+                        parse_manga_list_from(&doc)
+                    },
+                    PAGE_TRANSIENT_RETRY_ATTEMPTS,
+                    PAGE_TRANSIENT_RETRY_DELAY,
+                )
+                .await?
            };
            if backfill {
                page_refs.reverse();
@@ -116,8 +143,12 @@ impl Source for TargetSource {
        r: &SourceMangaRef,
    ) -> anyhow::Result<SourceManga> {
        let html = navigate(ctx, r.url.as_str()).await?;
-        parse_manga_detail(&html, &r.source_manga_key, self.parse_chapters)
-            .with_context(|| format!("parse manga detail at {}", r.url))
+        // `with_context` wraps the PageError in an anyhow::Error. PageError
+        // stays downcastable from the wrapped anyhow::Error so the pipeline
+        // can still recognize Transient via `error.downcast_ref::<PageError>()`.
+        let manga = parse_manga_detail(&html, &r.source_manga_key, self.parse_chapters)
+            .with_context(|| format!("parse manga detail at {}", r.url))?;
+        Ok(manga)
    }

    async fn fetch_chapter_list(
@@ -150,16 +181,39 @@ fn truncate_to_cap<T>(mut buf: Vec<T>, max: Option<usize>) -> Vec<T> {

 /// Single point of rate-limited navigation. Every Source request goes
 /// through here, so the per-host limiter map is the only knob that
-/// controls per-origin RPS.
-async fn navigate(ctx: &FetchContext<'_>, url: &str) -> anyhow::Result<String> {
+/// controls per-origin RPS. Also the choke point for transient-page
+/// detection — every fetched body is screened by
+/// [`classify_navigate_html`] before being handed to a selector.
+async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError> {
    ctx.rate.wait_for(url).await?;
-    let page = ctx.browser.new_page(url).await?;
-    page.wait_for_navigation().await?;
+    let page = ctx
+        .browser
+        .new_page(url)
+        .await
+        .map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
+    page.wait_for_navigation()
+        .await
+        .map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
    // Stopgap until we wait on a specific selector per page type —
    // gives any post-load JS a beat to finish injecting content.
    tokio::time::sleep(Duration::from_secs(1)).await;
-    let html = page.content().await?;
-    page.close().await?;
+    let html = page
+        .content()
+        .await
+        .map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
+    page.close().await.ok();
+    classify_navigate_html(html)
+}
+
+/// Classify a fetched body. The broken-page template is universal across
+/// the site — every page type (list, detail, chapter list, reader) gets
+/// the same `we're sorry, the request file are not found` body when the
+/// server is hiccuping. Catching it here means individual parsers
+/// downstream don't have to repeat the check.
+fn classify_navigate_html(html: String) -> Result<String, PageError> {
+    if is_broken_page_body(&html) {
+        return Err(PageError::transient("broken-page body signature"));
+    }
    Ok(html)
 }

@@ -204,14 +258,23 @@ fn page_url(template_url: &str, page: i32) -> String {
 }

 #[cfg(test)]
-fn parse_manga_list(html: &str) -> Vec<SourceMangaRef> {
+fn parse_manga_list(html: &str) -> Result<Vec<SourceMangaRef>, PageError> {
    let doc = scraper::Html::parse_document(html);
    parse_manga_list_from(&doc)
 }

-fn parse_manga_list_from(doc: &scraper::Html) -> Vec<SourceMangaRef> {
+/// Parse a manga listing page. `#logo` is present on every well-formed
+/// listing page on the source; its absence means the response is a
+/// broken-page placeholder (transient) rather than a genuinely empty
+/// listing. Empty listings (last-page tail, search with no hits) remain
+/// `Ok(vec![])`.
+fn parse_manga_list_from(doc: &scraper::Html) -> Result<Vec<SourceMangaRef>, PageError> {
+    if !has_logo_sentinel(doc) {
+        return Err(PageError::transient("manga list: #logo sentinel missing"));
+    }
    let sel = scraper::Selector::parse("#left_side .pic_list .updatesli span a").unwrap();
-    doc.select(&sel)
+    Ok(doc
+        .select(&sel)
        .filter_map(|a| {
            let url = a.value().attr("href")?.trim().to_string();
            if url.is_empty() {
@@ -227,16 +290,22 @@ fn parse_manga_list_from(doc: &scraper::Html) -> Vec<SourceMangaRef> {
                url,
            })
        })
-        .collect()
+        .collect())
 }

 fn parse_manga_detail(
    html: &str,
    key: &str,
    include_chapters: bool,
-) -> anyhow::Result<SourceManga> {
+) -> Result<SourceManga, PageError> {
    let doc = scraper::Html::parse_document(html);

+    // Sentinel first: a broken-page response will trip this before any
+    // anyhow context is added for missing required fields.
+    if !has_logo_sentinel(&doc) {
+        return Err(PageError::transient("manga detail: #logo sentinel missing"));
+    }
+
    let title = first_text(&doc, ".w-title h1").context("missing .w-title h1")?;
    let summary = first_text(&doc, ".manga_summary");
    let cover_url = first_attr(&doc, ".cover > img:nth-child(1)", "src");
@@ -494,6 +563,7 @@ mod tests {

    const LISTING_HTML: &str = r#"
        <html><body>
+        <header><div id="logo">Target</div></header>
        <div id="left_side">
          <div class="pic_list">
            <div class="updatesli">
@@ -512,6 +582,7 @@ mod tests {

    const DETAIL_HTML: &str = r#"
        <html><body>
+        <header><div id="logo">Target</div></header>
        <div class="w-title"><h1>Test Manga Title</h1></div>
        <div class="cover"><img src="/cover.jpg"><img src="/extra-not-cover.jpg"></div>
        <div class="manga_summary">A summary of the manga.</div>
@@ -537,7 +608,7 @@ mod tests {

    #[test]
    fn parse_manga_list_extracts_title_url_and_derives_key() {
-        let refs = parse_manga_list(LISTING_HTML);
+        let refs = parse_manga_list(LISTING_HTML).expect("parse");
        assert_eq!(refs.len(), 2, "third entry has empty href and is skipped");
        assert_eq!(refs[0].title, "Foo Manga");
        assert_eq!(refs[0].url, "https://target.example/manga/foo");
@@ -546,6 +617,30 @@ mod tests {
        assert_eq!(refs[1].source_manga_key, "bar-baz");
    }

+    #[test]
+    fn parse_manga_list_returns_transient_when_logo_missing() {
+        // Broken-page response: no #logo, no listing. Empty Vec would
+        // hide this as "page has no mangas"; Transient is the signal
+        // upstream code retries on.
+        let html = r#"<html><body>\
+            <p>we're sorry, the request file are not found.</p>\
+            </body></html>"#;
+        let err = parse_manga_list(html).expect_err("expected Transient");
+        assert!(err.is_transient(), "got non-transient: {err}");
+    }
+
+    #[test]
+    fn parse_manga_list_ok_empty_when_logo_present_but_no_items() {
+        // Last page of pagination, "no results" search, etc. Legitimately
+        // empty must stay distinguishable from "page is broken".
+        let html = r#"<html><body>\
+            <header><div id="logo">Target</div></header>\
+            <div id="left_side"><div class="pic_list"></div></div>\
+            </body></html>"#;
+        let refs = parse_manga_list(html).expect("logo present == not transient");
+        assert!(refs.is_empty());
+    }
+
    #[test]
    fn parse_manga_detail_pulls_all_fields() {
        let m = parse_manga_detail(DETAIL_HTML, "test-key", true).expect("parse");
@@ -761,7 +856,9 @@ mod tests {

    #[test]
    fn missing_optional_fields_parse_to_none() {
-        let html = r#"<html><body><div class="w-title"><h1>Minimal</h1></div></body></html>"#;
+        let html = r#"<html><body>\
+            <header><div id="logo">Target</div></header>\
+            <div class="w-title"><h1>Minimal</h1></div></body></html>"#;
        let m = parse_manga_detail(html, "min", true).unwrap();
        assert_eq!(m.title, "Minimal");
        assert!(m.summary.is_none());
@@ -785,8 +882,44 @@ mod tests {

    #[test]
    fn parse_manga_detail_errors_on_missing_title() {
-        let html = "<html><body><p>nothing</p></body></html>";
+        // Logo present (page is alive) — failure here is a real parse
+        // miss (Other), not Transient.
+        let html = r#"<html><body>\
+            <header><div id="logo">Target</div></header>\
+            <p>nothing</p></body></html>"#;
        let err = parse_manga_detail(html, "x", true).unwrap_err();
+        assert!(!err.is_transient(), "expected Other, got Transient: {err}");
        assert!(err.to_string().contains("missing .w-title h1"));
    }
+
+    #[test]
+    fn classify_navigate_html_passes_normal_body_through() {
+        let body = "<html><body><header><div id='logo'>Target</div></header>\
+                    <p>content</p></body></html>"
+            .to_string();
+        let out = classify_navigate_html(body.clone()).expect("ok");
+        assert_eq!(out, body);
+    }
+
+    #[test]
+    fn classify_navigate_html_returns_transient_for_broken_template() {
+        let body = "<html><head></head><body>\
+                    <p>we're sorry, the request file are not found.</p>\
+                    </body></html>"
+            .to_string();
+        let err = classify_navigate_html(body).expect_err("expected Transient");
+        assert!(err.is_transient(), "got non-transient: {err}");
+    }
+
+    #[test]
+    fn parse_manga_detail_returns_transient_when_logo_missing() {
+        // Broken-page response on a detail URL — must be reported as
+        // Transient so the job is retried rather than logging "missing
+        // .w-title h1" against a permanently-skipped manga.
+        let html = "<html><body>\
+            <p>we're sorry, the request file are not found.</p>\
+            </body></html>";
+        let err = parse_manga_detail(html, "x", true).expect_err("expected Transient");
+        assert!(err.is_transient(), "got non-transient: {err}");
+    }
 }