From c51353ead36a498178351a7735ccadb248eea67b Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Fri, 22 May 2026 23:15:36 +0200 Subject: [PATCH] bugfix: chapter source key uses chapter id, not /pg-1/ (0.23.1) Listing links point at the reader's page 1 (`.../uu/br_chapter-N/pg-1/`). The generic `derive_key_from_url` took the last URL segment and returned `"pg-1"` for every chapter, so all parsed chapters collapsed onto a single `chapter_sources` row downstream and the first-manga chapter was the only row that survived. New `derive_chapter_key_from_url` strips a trailing `/pg-\d+/` before picking the chapter-identifying segment (`br_chapter-N` / `to_chapter-N`). Notices, hiatus rows, and duplicate-numbered chapters are preserved as distinct parser entries. The (manga_id, number) UNIQUE collapse in the chapters table is a separate, follow-up concern handled in feat/chapter-id-routing. Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/Cargo.lock | 2 +- backend/Cargo.toml | 2 +- backend/src/crawler/source/target.rs | 119 ++++++++++- .../fixtures/target/chapter_list_uu.html | 194 ++++++++++++++++++ frontend/package.json | 2 +- 5 files changed, 315 insertions(+), 4 deletions(-) create mode 100644 backend/tests/fixtures/target/chapter_list_uu.html diff --git a/backend/Cargo.lock b/backend/Cargo.lock index 0c14ad0..df2da46 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1415,7 +1415,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.23.0" +version = "0.23.1" dependencies = [ "anyhow", "argon2", diff --git a/backend/Cargo.toml b/backend/Cargo.toml index fbf29c9..4900858 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.23.0" +version = "0.23.1" edition = "2021" default-run = "mangalord" diff --git a/backend/src/crawler/source/target.rs b/backend/src/crawler/source/target.rs index 633611d..375ba69 100644 --- a/backend/src/crawler/source/target.rs +++ b/backend/src/crawler/source/target.rs @@ -334,7 +334,7 @@ fn parse_chapter_list(doc: &scraper::Html) -> Vec { let title_text = collapse_whitespace(&a.text().collect::()); let number = parse_chapter_number(&title_text).unwrap_or(0); Some(SourceChapterRef { - source_chapter_key: derive_key_from_url(&url), + source_chapter_key: derive_chapter_key_from_url(&url), number, title: (!title_text.is_empty()).then_some(title_text), url, @@ -366,6 +366,29 @@ fn derive_key_from_url(url: &str) -> String { .to_string() } +/// Chapter URLs on this source point at the reader's page 1, e.g. +/// `.../uu/br_chapter-379272/pg-1/`. The chapter identity is the +/// `br_chapter-N` (or `to_chapter-N`) segment — the `pg-\d+` segment +/// identifies a page *within* a chapter, so naively taking the last +/// path component returns `"pg-1"` for every chapter and collapses +/// them all under one source_chapter_key downstream. +fn derive_chapter_key_from_url(url: &str) -> String { + let trimmed = url.split('?').next().unwrap_or(url).trim_end_matches('/'); + let without_reader_page = match trimmed.rsplit_once('/') { + Some((prefix, last)) if is_reader_page_segment(last) => prefix, + _ => trimmed, + }; + without_reader_page + .rsplit('/') + .find(|s| !s.is_empty()) + .unwrap_or(url) + .to_string() +} + +fn is_reader_page_segment(s: &str) -> bool { + s.len() > 3 && s.starts_with("pg-") && s[3..].bytes().all(|b| b.is_ascii_digit()) +} + fn first_text(doc: &scraper::Html, sel: &str) -> Option { let s = scraper::Selector::parse(sel).ok()?; let el = doc.select(&s).next()?; @@ -577,6 +600,61 @@ mod tests { assert_eq!(strip_tag_count("Tag (a) (12)"), "Tag (a)"); } + #[test] + fn parse_chapter_list_keeps_all_chapters_with_unique_keys() { + // Real listing fixture from the target site. 15 rows: chapters + // with various Ch.N markup, one hiatus row, three "notice." rows, + // and duplicates of Ch.1 and Ch.52 from different uploaders. + // Every row must survive parsing and every chapter must have a + // distinct source_chapter_key — chapter URLs all end in `/pg-1/` + // (the reader's page-1 entry point), and a naive + // last-segment-of-URL derivation returns "pg-1" for every row, + // collapsing the whole list into one downstream chapter row. + let html = include_str!( + "../../../tests/fixtures/target/chapter_list_uu.html" + ); + let doc = scraper::Html::parse_document(html); + let chapters = parse_chapter_list(&doc); + + assert_eq!(chapters.len(), 15, "every row kept (notices/hiatus included)"); + + let mut keys: Vec<&str> = + chapters.iter().map(|c| c.source_chapter_key.as_str()).collect(); + keys.sort(); + let dupe = keys.windows(2).find(|w| w[0] == w[1]).map(|w| w[0]); + assert!(dupe.is_none(), "duplicate chapter key: {dupe:?}"); + for c in &chapters { + assert_ne!( + c.source_chapter_key, "pg-1", + "key must not be the reader-page segment: {:?}", c + ); + } + + // Latest chapter is first (source orders newest → oldest). + assert_eq!(chapters[0].number, 67); + assert_eq!(chapters[0].title.as_deref(), Some("Ch.67 : Official")); + assert_eq!(chapters[0].source_chapter_key, "br_chapter-379272"); + + // Duplicate-number chapters (different uploaders) survive as + // two rows. The (manga_id, number) UNIQUE collapse is a + // downstream schema concern handled separately. + assert_eq!( + chapters.iter().filter(|c| c.number == 52).count(), + 2, + "two Ch.52 uploads must both survive parsing" + ); + assert_eq!( + chapters.iter().filter(|c| c.number == 1).count(), + 2, + "Ch.1 Official and Ch.1 Team Hazama are both kept" + ); + + // Notices / hiatus rows have no leading digit so they parse to + // number=0. They are not filtered out. + let zero = chapters.iter().filter(|c| c.number == 0).count(); + assert!(zero >= 4, "hiatus + 3 notices kept; got {zero}"); + } + #[test] fn parse_chapter_number_grabs_first_integer_run() { assert_eq!(parse_chapter_number("Ch.1"), Some(1)); @@ -630,6 +708,45 @@ mod tests { assert_eq!(derive_key_from_url("/manga/bar"), "bar"); } + #[test] + fn derive_chapter_key_strips_trailing_reader_page_segment() { + // Listing links go to page 1 of the reader; strip /pg-\d+/. + assert_eq!( + derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-1/"), + "br_chapter-379272" + ); + assert_eq!( + derive_chapter_key_from_url(".../uu/to_chapter-13/pg-1/"), + "to_chapter-13" + ); + // Defensive: deep-link to a non-first page should still resolve + // to the same chapter identity. + assert_eq!( + derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-25/"), + "br_chapter-379272" + ); + // No reader-page suffix → behaves like derive_key_from_url. + assert_eq!( + derive_chapter_key_from_url(".../uu/br_chapter-379272/"), + "br_chapter-379272" + ); + // Query strings are stripped. + assert_eq!( + derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-1/?ref=x"), + "br_chapter-379272" + ); + // `pg-foo` is not a valid reader-page segment; treated as identity. + assert_eq!( + derive_chapter_key_from_url(".../uu/something/pg-foo/"), + "pg-foo" + ); + // Bare `pg-` (no digits) likewise not stripped. + assert_eq!( + derive_chapter_key_from_url(".../uu/something/pg-/"), + "pg-" + ); + } + #[test] fn metadata_hash_is_stable_and_field_sensitive() { let base = parse_manga_detail(DETAIL_HTML, "k", true).unwrap(); diff --git a/backend/tests/fixtures/target/chapter_list_uu.html b/backend/tests/fixtures/target/chapter_list_uu.html new file mode 100644 index 0000000..7b1bafe --- /dev/null +++ b/backend/tests/fixtures/target/chapter_list_uu.html @@ -0,0 +1,194 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

+ Ch.67 + : Official + new +

+
+ bloomingdale + May 20, 2026
+

+ hitaus. + +

+
+ bloomingdale + Jan 15, 2026
+

+ Ch.66 + : Official +

+
+ bloomingdale + Jan 10, 2026
+

+ Ch.52 + : Official +

+
+ bloomingdale + Aug 28, 2025
+

+ Ch.52 + : Official +

+
+ mina + Aug 27, 2025
+

+ Ch.10 + : Official +

+
+ bloomingdale + Jan 5, 2025
+

+ Ch.13 + : Thank you, we'll see you in the next one! +

+
Dec 30, 2024
+

+ Ch.9 + : Official +

+
+ bloomingdale + Dec 28, 2024
+

+ Ch.1 + : Official +

+
+ bloomingdale + Dec 26, 2024
+

+ Ch.12 + +

+
Dec 1, 2024
+

+ notice. + : Officials +

+
+ bloomingdale + Nov 26, 2024
+

+ Ch.11 + +

+
Nov 18, 2024
+

+ notice. + +

+
+ Izanami + Jun 21, 2024
+

+ notice. + +

+
+ bloomingdale + Sep 13, 2024
+

+ Ch.1 + : Team Hazama +

+
+ purplepandabear + Jun 16, 2024
diff --git a/frontend/package.json b/frontend/package.json index 813cb2d..c4df14d 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "mangalord-frontend", - "version": "0.23.0", + "version": "0.23.1", "private": true, "type": "module", "scripts": {