Merge remote-tracking branch 'origin/main' into feat/crawler-system-chromium

feat(crawler): CRAWLER_CHROMIUM_BINARY to use system chromium (0.45.0)
Skips the chromiumoxide fetcher when CRAWLER_CHROMIUM_BINARY is set, unblocking Linux_arm64 deployments (Raspberry Pi 5) where the fetcher's upstream snapshot bucket has no reliable build. The Dockerfile gains an INSTALL_CHROMIUM build-arg that adds chromium-headless-shell + fonts-liberation to the runtime image when set; default off so cloud/x86 images stay slim. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-31 17:44:20 +02:00 · 2026-05-31 17:18:37 +02:00
4 changed files with 100 additions and 27 deletions
--- a/backend/Cargo.lock
+++ b/backend/Cargo.lock
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"

 [[package]]
 name = "mangalord"
-version = "0.45.1"
+version = "0.45.0"
 dependencies = [
 "anyhow",
 "argon2",
--- a/backend/Cargo.toml
+++ b/backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "mangalord"
-version = "0.45.1"
+version = "0.45.0"
 edition = "2021"
 default-run = "mangalord"

--- a/backend/src/crawler/source/target.rs
+++ b/backend/src/crawler/source/target.rs
@@ -7,6 +7,7 @@
 //! (`td:has(label:contains("Author:"))`) are implemented by walking
 //! the parsed tree.

+use std::collections::VecDeque;
 use std::time::Duration;

 use anyhow::Context;
@@ -74,11 +75,10 @@ impl Source for TargetSource {
        &self,
        ctx: &FetchContext<'_>,
    ) -> anyhow::Result<Box<dyn DiscoverWalk + Send>> {
-        // Probe page 1 up front (with transient retry) for two reasons:
-        // a broken first page should abort cleanly rather than mid-walk,
-        // and the HTML is handed straight to the first `next_batch` call
-        // so the walker doesn't re-fetch it. Page count is discovered
-        // incrementally — see `TargetSourceWalker::next_batch`.
+        // Always visit page 1 first because that's the only way to
+        // discover `last_page`. Retry it on transient — a broken first
+        // page would otherwise abort the whole walk before we've even
+        // started.
        let first_html = retry_on_transient(
            || async {
                navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
@@ -87,10 +87,21 @@ impl Source for TargetSource {
            PAGE_TRANSIENT_RETRY_DELAY,
        )
        .await?;
+        let last_page = {
+            let doc = scraper::Html::parse_document(&first_html);
+            parse_last_page(&doc)
+        };
+
+        let order = build_page_order(last_page);
+        tracing::info!(
+            last_page = ?last_page,
+            page_count = order.len(),
+            "walking pagination"
+        );

        Ok(Box::new(TargetSourceWalker {
            base_url: self.base_url.clone(),
-            next_page: 1,
+            pages_remaining: order,
            first_page_html: Some(first_html),
        }))
    }
@@ -136,19 +147,24 @@ impl Source for TargetSource {
    }
 }

-/// Walker returned by [`TargetSource::discover`]. Walks pages `1..` in
-/// order, terminating as soon as a page renders cleanly with zero entries
-/// — that's the "we ran off the end of the index" signal. Page 1's HTML
-/// is cached at construction time (discover already had to fetch it for
-/// the transient probe) so the first batch doesn't re-fetch.
-///
-/// A genuinely empty `Ok(vec![])` from `parse_manga_list_from` is what
-/// stops us: the parser's `#logo` sentinel converts unrendered pages
-/// into transient errors before they reach this loop, so an empty
-/// parse result reliably means "no more entries."
+/// Build the queue of page numbers `TargetSource::discover` will walk.
+/// The site orders by `update_date DESC`, so newest-first is just the
+/// natural page order: `1..=last`.
+///
+/// Page 1 is always queued. If `last_page` is `None` (source surfaces
+/// no pagination) or is not a positive number, only page 1 is visited:
+/// `1..=last` is empty for `last < 1`, which would otherwise end the
+/// walk before the already-fetched page-1 HTML is ever yielded.
+fn build_page_order(last_page: Option<i32>) -> VecDeque<i32> {
+    match last_page {
+        Some(last) if last >= 1 => (1..=last).collect(),
+        // Unknown or nonsensical page count: fall back to page 1 alone.
+        Some(_) | None => VecDeque::from([1]),
+    }
+}
+
+/// Walker returned by [`TargetSource::discover`]. Pops one source-index
+/// page per `next_batch` call. Page 1's HTML is cached at construction
+/// time (the discover call needed it to read `last_page` anyway) so the
+/// batch covering page 1 doesn't re-fetch.
 struct TargetSourceWalker {
+    /// Listing URL, cloned from the source's `base_url` in `discover`.
    base_url: String,
-    next_page: i32,
+    /// Page numbers still to walk. `next_batch` pops from the front and
+    /// returns `Ok(None)` once the queue is empty.
+    pages_remaining: VecDeque<i32>,
+    /// Page-1 HTML fetched by `discover`; taken (not cloned) when page 1
+    /// is walked, so it is consumed at most once.
    first_page_html: Option<String>,
 }

@@ -158,11 +174,13 @@ impl DiscoverWalk for TargetSourceWalker {
        &mut self,
        ctx: &FetchContext<'_>,
    ) -> anyhow::Result<Option<Vec<SourceMangaRef>>> {
-        let page_num = self.next_page;
+        let Some(page_num) = self.pages_remaining.pop_front() else {
+            return Ok(None);
+        };
        let page_refs = if page_num == 1 {
            // Reuse the cached page-1 HTML from the initial probe. Take
-            // it (rather than clone) so a future re-entry that somehow
-            // revisits page 1 still falls back to a real fetch.
+            // it (rather than clone) so a malformed page-order queue
+            // that re-visits page 1 still falls back to a real fetch.
            match self.first_page_html.take() {
                Some(html) => {
                    let doc = scraper::Html::parse_document(&html);
@@ -200,10 +218,6 @@ impl DiscoverWalk for TargetSourceWalker {
            .await?
        };
        tracing::info!(page_num, count = page_refs.len(), "page walked");
-        if page_refs.is_empty() {
-            return Ok(None);
-        }
-        self.next_page += 1;
        Ok(Some(page_refs))
    }
 }
@@ -274,6 +288,20 @@ fn classify_navigate_html(html: String) -> Result<String, PageError> {
    Ok(html)
 }

+/// Returns the highest page number found among the listing's pagination
+/// links, or `None` when no link text parses as an integer.
+fn parse_last_page(doc: &scraper::Html) -> Option<i32> {
+    // Every anchor in the pagination bar is inspected. Labels such as
+    // "Prev" / "Next" fail the integer parse and are skipped, so the
+    // result doesn't depend on where the numeric links sit.
+    let pagination_links = scraper::Selector::parse("#left_side .pagination a").unwrap();
+    let mut highest: Option<i32> = None;
+    for anchor in doc.select(&pagination_links) {
+        let raw: String = anchor.text().collect();
+        let label = collapse_whitespace(&raw);
+        if let Ok(page) = label.parse::<i32>() {
+            highest = Some(highest.map_or(page, |seen| seen.max(page)));
+        }
+    }
+    highest
+}
+
 /// Substitutes the first `/N/` path segment with the target page
 /// number. Source impls that paginate via a different URL shape can
 /// override this — for the modeled site the segment is always present.
@@ -825,6 +853,29 @@ mod tests {
        assert_eq!(parse_chapter_number("Special"), None);
    }

+    #[test]
+    fn parse_last_page_picks_highest_pagination_link() {
+        // Mixed numeric and textual links: the largest number must win
+        // and the "Prev" / "Next" labels must be ignored.
+        let fixture = r#"
+            <div id="left_side"><div class="pagination">
+              <a href="/list/1/">Prev</a>
+              <ol>
+                <li><a href="/list/1/">1</a></li>
+                <li><a href="/list/2/">2</a></li>
+                <li><a href="/list/47/">47</a></li>
+                <li><a href="/list/2/">Next</a></li>
+              </ol>
+            </div></div>
+        "#;
+        let parsed = scraper::Html::parse_document(fixture);
+        let highest = parse_last_page(&parsed);
+        assert_eq!(highest, Some(47));
+    }
+
+    #[test]
+    fn parse_last_page_none_when_no_pagination() {
+        // A document without any pagination markup has no links to read.
+        let bare = scraper::Html::parse_document("<html></html>");
+        assert_eq!(parse_last_page(&bare), None);
+    }
+
    #[test]
    fn page_url_substitutes_numeric_path_segment() {
        assert_eq!(
@@ -973,6 +1024,28 @@ mod tests {
        assert!(err.is_transient(), "got non-transient: {err}");
    }

+    #[test]
+    fn build_page_order_is_natural_one_to_last() {
+        // The source lists by update_date DESC, so ascending page order
+        // (page 1 first, last page last) is already newest-first.
+        let pages: Vec<i32> = build_page_order(Some(3)).into_iter().collect();
+        assert_eq!(pages, [1, 2, 3]);
+    }
+
+    #[test]
+    fn build_page_order_falls_back_to_page_one_only_without_pagination() {
+        // No pagination control on the source: the queue holds page 1
+        // only, so the walk ends after a single batch.
+        let pages: Vec<i32> = build_page_order(None).into_iter().collect();
+        assert_eq!(pages, [1]);
+    }
+
+    #[test]
+    fn build_page_order_single_page_index_yields_one_entry() {
+        // A one-page index produces a queue holding just that page.
+        let pages: Vec<i32> = build_page_order(Some(1)).into_iter().collect();
+        assert_eq!(pages, [1]);
+    }
+
    #[test]
    fn parse_chapter_list_returns_transient_when_table_missing() {
        // Partial render (post-load JS hadn't injected the table, layout
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,6 +1,6 @@
 {
  "name": "mangalord-frontend",
-  "version": "0.45.1",
+  "version": "0.45.0",
  "private": true,
  "type": "module",
  "scripts": {
Author	SHA1	Message	Date
fabi	2f9037e210	Merge remote-tracking branch 'origin/main' into feat/crawler-system-chromium Some checks failed deploy / test-backend (pull_request) Failing after 6s Details deploy / test-frontend (pull_request) Failing after 33s Details deploy / build-and-push (pull_request) Has been skipped Details deploy / deploy (pull_request) Has been skipped Details	2026-05-31 17:44:20 +02:00
MechaCat02	0b5f5d1692	feat(crawler): CRAWLER_CHROMIUM_BINARY to use system chromium (0.45.0) Skips the chromiumoxide fetcher when CRAWLER_CHROMIUM_BINARY is set, unblocking Linux_arm64 deployments (Raspberry Pi 5) where the fetcher's upstream snapshot bucket has no reliable build. The Dockerfile gains an INSTALL_CHROMIUM build-arg that adds chromium-headless-shell + fonts-liberation to the runtime image when set; default off so cloud/x86 images stay slim. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>	2026-05-31 17:18:37 +02:00