From ee4594f679f007eb275457269542261e5a01f3a2 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sun, 31 May 2026 18:06:22 +0200 Subject: [PATCH] fix(crawler): walk list pages incrementally; stop on empty page (0.45.1) The pre-built `1..=parse_last_page` queue silently broke whenever the configured CRAWLER_START_URL lacked a `/N/` path segment: page_url returned the input unchanged, every "next" page re-fetched page 1, and the dedup set caught the duplicates as a flood of "skip already-seen key in this run" debug lines. The walker now increments next_page on each batch and terminates when parse_manga_list_from yields an empty list (the `#logo` sentinel still converts unrendered pages into transient errors, so an Ok(vec![]) is a real end-of-index signal). parse_last_page and build_page_order are deleted along with their unit tests; they have no callers under the new model. page_url and the page-1 HTML cache from discover() are retained as-is. Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/Cargo.lock | 2 +- backend/Cargo.toml | 2 +- backend/src/crawler/source/target.rs | 121 ++++++--------------------- frontend/package.json | 2 +- 4 files changed, 27 insertions(+), 100 deletions(-) diff --git a/backend/Cargo.lock b/backend/Cargo.lock index 920feaa..3cc273c 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.45.0" +version = "0.45.1" dependencies = [ "anyhow", "argon2", diff --git a/backend/Cargo.toml b/backend/Cargo.toml index b826569..e62860b 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.45.0" +version = "0.45.1" edition = "2021" default-run = "mangalord" diff --git a/backend/src/crawler/source/target.rs b/backend/src/crawler/source/target.rs index 1ae00fc..1be0a38 100644 --- a/backend/src/crawler/source/target.rs +++ b/backend/src/crawler/source/target.rs @@ -7,7 +7,6 @@ //! (`td:has(label:contains("Author:"))`) are implemented by walking //! the parsed tree. -use std::collections::VecDeque; use std::time::Duration; use anyhow::Context; @@ -75,10 +74,11 @@ impl Source for TargetSource { &self, ctx: &FetchContext<'_>, ) -> anyhow::Result> { - // Always visit page 1 first because that's the only way to - // discover `last_page`. Retry it on transient — a broken first - // page would otherwise abort the whole walk before we've even - // started. + // Probe page 1 up front (with transient retry) for two reasons: + // a broken first page should abort cleanly rather than mid-walk, + // and the HTML is handed straight to the first `next_batch` call + // so the walker doesn't re-fetch it. Page count is discovered + // incrementally — see `TargetSourceWalker::next_batch`. let first_html = retry_on_transient( || async { navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await @@ -87,21 +87,10 @@ impl Source for TargetSource { PAGE_TRANSIENT_RETRY_DELAY, ) .await?; - let last_page = { - let doc = scraper::Html::parse_document(&first_html); - parse_last_page(&doc) - }; - - let order = build_page_order(last_page); - tracing::info!( - last_page = ?last_page, - page_count = order.len(), - "walking pagination" - ); Ok(Box::new(TargetSourceWalker { base_url: self.base_url.clone(), - pages_remaining: order, + next_page: 1, first_page_html: Some(first_html), })) } @@ -147,24 +136,19 @@ impl Source for TargetSource { } } -/// Build the queue of page numbers `TargetSource::discover` will walk. -/// The site orders by `update_date DESC`, so newest-first is just the -/// natural page order: `1..=last`. If `last_page` is unknown (source -/// surfaces no pagination) only page 1 is visited. -fn build_page_order(last_page: Option) -> VecDeque { - match last_page { - None => VecDeque::from([1]), - Some(last) => (1..=last).collect(), - } -} - -/// Walker returned by [`TargetSource::discover`]. Pops one source-index -/// page per `next_batch` call. Page 1's HTML is cached at construction -/// time (the discover call needed it to read `last_page` anyway) so the -/// batch covering page 1 doesn't re-fetch. +/// Walker returned by [`TargetSource::discover`]. Walks pages `1..` in +/// order, terminating as soon as a page renders cleanly with zero entries +/// — that's the "we ran off the end of the index" signal. Page 1's HTML +/// is cached at construction time (discover already had to fetch it for +/// the transient probe) so the first batch doesn't re-fetch. +/// +/// A genuinely empty `Ok(vec![])` from `parse_manga_list_from` is what +/// stops us: the parser's `#logo` sentinel converts unrendered pages +/// into transient errors before they reach this loop, so an empty +/// parse result reliably means "no more entries." struct TargetSourceWalker { base_url: String, - pages_remaining: VecDeque, + next_page: i32, first_page_html: Option, } @@ -174,13 +158,11 @@ impl DiscoverWalk for TargetSourceWalker { &mut self, ctx: &FetchContext<'_>, ) -> anyhow::Result>> { - let Some(page_num) = self.pages_remaining.pop_front() else { - return Ok(None); - }; + let page_num = self.next_page; let page_refs = if page_num == 1 { // Reuse the cached page-1 HTML from the initial probe. Take - // it (rather than clone) so a malformed page-order queue - // that re-visits page 1 still falls back to a real fetch. + // it (rather than clone) so a future re-entry that somehow + // revisits page 1 still falls back to a real fetch. match self.first_page_html.take() { Some(html) => { let doc = scraper::Html::parse_document(&html); @@ -218,6 +200,10 @@ impl DiscoverWalk for TargetSourceWalker { .await? }; tracing::info!(page_num, count = page_refs.len(), "page walked"); + if page_refs.is_empty() { + return Ok(None); + } + self.next_page += 1; Ok(Some(page_refs)) } } @@ -288,20 +274,6 @@ fn classify_navigate_html(html: String) -> Result { Ok(html) } -fn parse_last_page(doc: &scraper::Html) -> Option { - // Pagination links carry their page number as text. Take the - // numeric maximum so we don't depend on a specific layout (Prev, - // Next, ellipses, etc. all get filtered out by .parse). - let sel = scraper::Selector::parse("#left_side .pagination a").unwrap(); - doc.select(&sel) - .filter_map(|a| { - collapse_whitespace(&a.text().collect::()) - .parse::() - .ok() - }) - .max() -} - /// Substitutes the first `/N/` path segment with the target page /// number. Source impls that paginate via a different URL shape can /// override this — for the modeled site the segment is always present. @@ -853,29 +825,6 @@ mod tests { assert_eq!(parse_chapter_number("Special"), None); } - #[test] - fn parse_last_page_picks_highest_pagination_link() { - let html = r#" -
- "#; - let doc = scraper::Html::parse_document(html); - assert_eq!(parse_last_page(&doc), Some(47)); - } - - #[test] - fn parse_last_page_none_when_no_pagination() { - let doc = scraper::Html::parse_document(""); - assert!(parse_last_page(&doc).is_none()); - } - #[test] fn page_url_substitutes_numeric_path_segment() { assert_eq!( @@ -1024,28 +973,6 @@ mod tests { assert!(err.is_transient(), "got non-transient: {err}"); } - #[test] - fn build_page_order_is_natural_one_to_last() { - // Newest-first is just the source's natural pagination order: - // (update_date DESC) lives at page 1, oldest at the last page. - let order = build_page_order(Some(3)); - assert_eq!(Vec::from(order), vec![1, 2, 3]); - } - - #[test] - fn build_page_order_falls_back_to_page_one_only_without_pagination() { - // Source surfaced no pagination control — visit page 1 alone - // and let the walk end after one batch. - let order = build_page_order(None); - assert_eq!(Vec::from(order), vec![1]); - } - - #[test] - fn build_page_order_single_page_index_yields_one_entry() { - let order = build_page_order(Some(1)); - assert_eq!(Vec::from(order), vec![1]); - } - #[test] fn parse_chapter_list_returns_transient_when_table_missing() { // Partial render (post-load JS hadn't injected the table, layout diff --git a/frontend/package.json b/frontend/package.json index 52180a0..f110d5c 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "mangalord-frontend", - "version": "0.45.0", + "version": "0.45.1", "private": true, "type": "module", "scripts": {