bugfix: chapter source key uses chapter id, not /pg-1/ (0.23.1)

Listing links point at the reader's page 1 (`.../uu/br_chapter-N/pg-1/`). The generic `derive_key_from_url` took the last URL segment and returned `"pg-1"` for every chapter, so all parsed chapters collapsed onto a single `chapter_sources` row downstream and only the first manga's chapter row survived. New `derive_chapter_key_from_url` strips a trailing `/pg-\d+/` before picking the chapter-identifying segment (`br_chapter-N` / `to_chapter-N`). Notices, hiatus rows, and duplicate-numbered chapters are preserved as distinct parser entries. The (manga_id, number) UNIQUE collapse in the chapters table is a separate, follow-up concern handled in feat/chapter-id-routing. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 23:15:36 +02:00
parent b1a3a4e9d3
commit c51353ead3
5 changed files with 315 additions and 4 deletions
--- a/backend/Cargo.lock
+++ b/backend/Cargo.lock
@@ -1415,7 +1415,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"

 [[package]]
 name = "mangalord"
-version = "0.23.0"
+version = "0.23.1"
 dependencies = [
 "anyhow",
 "argon2",
--- a/backend/Cargo.toml
+++ b/backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "mangalord"
-version = "0.23.0"
+version = "0.23.1"
 edition = "2021"
 default-run = "mangalord"

--- a/backend/src/crawler/source/target.rs
+++ b/backend/src/crawler/source/target.rs
@@ -334,7 +334,7 @@ fn parse_chapter_list(doc: &scraper::Html) -> Vec<SourceChapterRef> {
            let title_text = collapse_whitespace(&a.text().collect::<String>());
            let number = parse_chapter_number(&title_text).unwrap_or(0);
            Some(SourceChapterRef {
-                source_chapter_key: derive_key_from_url(&url),
+                source_chapter_key: derive_chapter_key_from_url(&url),
                number,
                title: (!title_text.is_empty()).then_some(title_text),
                url,
@@ -366,6 +366,29 @@ fn derive_key_from_url(url: &str) -> String {
        .to_string()
 }

+/// Derives the chapter key from a listing link. Links on this source point
+/// at the reader's page 1, e.g. `.../uu/br_chapter-379272/pg-1/`, where
+/// `pg-\d+` identifies a page *within* a chapter. The query string and one
+/// trailing `pg-\d+` segment are dropped, then the last non-empty segment
+/// (`br_chapter-N` / `to_chapter-N`) is returned, or `url` if none is left.
+/// Naively taking the last component gives `"pg-1"` for every chapter.
+fn derive_chapter_key_from_url(url: &str) -> String {
+    let trimmed = url.split('?').next().unwrap_or(url).trim_end_matches('/');
+    let without_reader_page = match trimmed.rsplit_once('/') {
+        Some((prefix, last)) if is_reader_page_segment(last) => prefix,
+        _ => trimmed,
+    };
+    without_reader_page
+        .rsplit('/')
+        .find(|s| !s.is_empty())
+        .unwrap_or(url)
+        .to_string()
+}
+
+fn is_reader_page_segment(s: &str) -> bool {
+    s.len() > 3 && s.starts_with("pg-") && s[3..].bytes().all(|b| b.is_ascii_digit())
+}
+
 fn first_text(doc: &scraper::Html, sel: &str) -> Option<String> {
    let s = scraper::Selector::parse(sel).ok()?;
    let el = doc.select(&s).next()?;
@@ -577,6 +600,61 @@ mod tests {
        assert_eq!(strip_tag_count("Tag (a) (12)"), "Tag (a)");
    }

+    #[test]
+    fn parse_chapter_list_keeps_all_chapters_with_unique_keys() {
+        // Real listing fixture from the target site. 15 rows: chapters
+        // with various Ch.N markup, one hiatus row, three "notice." rows,
+        // and duplicates of Ch.1 and Ch.52 from different uploaders.
+        // Every row must survive parsing and every chapter must have a
+        // distinct source_chapter_key — chapter URLs all end in `/pg-1/`
+        // (the reader's page-1 entry point), and a naive
+        // last-segment-of-URL derivation returns "pg-1" for every row,
+        // collapsing the whole list into one downstream chapter row.
+        let html = include_str!(
+            "../../../tests/fixtures/target/chapter_list_uu.html"
+        );
+        let doc = scraper::Html::parse_document(html);
+        let chapters = parse_chapter_list(&doc);
+
+        assert_eq!(chapters.len(), 15, "every row kept (notices/hiatus included)");
+
+        let mut keys: Vec<&str> =
+            chapters.iter().map(|c| c.source_chapter_key.as_str()).collect();
+        keys.sort();
+        let dupe = keys.windows(2).find(|w| w[0] == w[1]).map(|w| w[0]);
+        assert!(dupe.is_none(), "duplicate chapter key: {dupe:?}");
+        for c in &chapters {
+            assert_ne!(
+                c.source_chapter_key, "pg-1",
+                "key must not be the reader-page segment: {:?}", c
+            );
+        }
+
+        // Latest chapter is first (source orders newest → oldest).
+        assert_eq!(chapters[0].number, 67);
+        assert_eq!(chapters[0].title.as_deref(), Some("Ch.67 : Official"));
+        assert_eq!(chapters[0].source_chapter_key, "br_chapter-379272");
+
+        // Duplicate-number chapters (different uploaders) survive as
+        // two rows. The (manga_id, number) UNIQUE collapse is a
+        // downstream schema concern handled separately.
+        assert_eq!(
+            chapters.iter().filter(|c| c.number == 52).count(),
+            2,
+            "two Ch.52 uploads must both survive parsing"
+        );
+        assert_eq!(
+            chapters.iter().filter(|c| c.number == 1).count(),
+            2,
+            "Ch.1 Official and Ch.1 Team Hazama are both kept"
+        );
+
+        // Notice / hiatus titles contain no digits at all, so they fall
+        // back to number=0. They are not filtered out.
+        let zero = chapters.iter().filter(|c| c.number == 0).count();
+        assert!(zero >= 4, "hiatus + 3 notices kept; got {zero}");
+    }
+
    #[test]
    fn parse_chapter_number_grabs_first_integer_run() {
        assert_eq!(parse_chapter_number("Ch.1"), Some(1));
@@ -630,6 +708,45 @@ mod tests {
        assert_eq!(derive_key_from_url("/manga/bar"), "bar");
    }

+    #[test]
+    fn derive_chapter_key_strips_trailing_reader_page_segment() {
+        // Listing links go to page 1 of the reader; strip /pg-\d+/.
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-1/"),
+            "br_chapter-379272"
+        );
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/to_chapter-13/pg-1/"),
+            "to_chapter-13"
+        );
+        // Defensive: deep-link to a non-first page should still resolve
+        // to the same chapter identity.
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-25/"),
+            "br_chapter-379272"
+        );
+        // No reader-page suffix → the last non-empty segment is the key.
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/br_chapter-379272/"),
+            "br_chapter-379272"
+        );
+        // Query strings are stripped.
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-1/?ref=x"),
+            "br_chapter-379272"
+        );
+        // `pg-foo` is not a valid reader-page segment; it is kept as the key.
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/something/pg-foo/"),
+            "pg-foo"
+        );
+        // Bare `pg-` (no digits) likewise not stripped.
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/something/pg-/"),
+            "pg-"
+        );
+    }
+
    #[test]
    fn metadata_hash_is_stable_and_field_sensitive() {
        let base = parse_manga_detail(DETAIL_HTML, "k", true).unwrap();
--- a/backend/tests/fixtures/target/chapter_list_uu.html
+++ b/backend/tests/fixtures/target/chapter_list_uu.html
@@ -0,0 +1,194 @@
+<table class="listing" id="chapter_table">
+	<tbody>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-379272/pg-1/"><b>Ch.67</b>
+						: Official </a>
+					<b style="color:#FEFD7F;width;30px;display:inline-block;margin-left:5px">new</b>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">May 20, 2026</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-328248/pg-1/"><b>hitaus.</b>
+					</a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Jan 15, 2026</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-326351/pg-1/"><b>Ch.66</b>
+						: Official </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Jan 10, 2026</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-295078/pg-1/"><b>Ch.52</b>
+						: Official </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Aug 28, 2025</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-294815/pg-1/"><b>Ch.52</b>
+						: Official </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../4300634/upload/">mina</a>
+			</td>
+			<td class="no">Aug 27, 2025</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-249964/pg-1/"><b>Ch.10</b>
+						: Official </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Jan 5, 2025</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/to_chapter-13/pg-1/"><b>Ch.13</b>
+						: Thank you, we'll see you in the next one! </a>
+				</h4>
+			</td>
+			<td class="no"></td>
+			<td class="no">Dec 30, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-249095/pg-1/"><b>Ch.9</b>
+						: Official </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Dec 28, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-248930/pg-1/"><b>Ch.1</b>
+						: Official </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Dec 26, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/to_chapter-12/pg-1/"><b>Ch.12</b>
+					</a>
+				</h4>
+			</td>
+			<td class="no"></td>
+			<td class="no">Dec 1, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-244844/pg-1/"><b>notice.</b>
+						: Officials </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Nov 26, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/to_chapter-11/pg-1/"><b>Ch.11</b>
+					</a>
+				</h4>
+			</td>
+			<td class="no"></td>
+			<td class="no">Nov 18, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-221180/pg-1/"><b>notice.</b>
+					</a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../3781074/upload/">Izanami</a>
+			</td>
+			<td class="no">Jun 21, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-234803/pg-1/"><b>notice.</b>
+					</a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Sep 13, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-220299/pg-1/"><b>Ch.1</b>
+						: Team Hazama </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../1457681/upload/">purplepandabear</a>
+			</td>
+			<td class="no">Jun 16, 2024</td>
+		</tr>
+	</tbody>
+</table>
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,6 +1,6 @@
 {
  "name": "mangalord-frontend",
-  "version": "0.23.0",
+  "version": "0.23.1",
  "private": true,
  "type": "module",
  "scripts": {