From c51353ead36a498178351a7735ccadb248eea67b Mon Sep 17 00:00:00 2001
From: MechaCat02 <fabian@diekaulbachs.de>
Date: Fri, 22 May 2026 23:15:36 +0200
Subject: [PATCH] bugfix: chapter source key uses chapter id, not /pg-1/
 (0.23.1)

Listing links point at the reader's page 1
(`.../uu/br_chapter-N/pg-1/`). The generic `derive_key_from_url` took
the last URL segment and returned `"pg-1"` for every chapter, so all
parsed chapters collapsed onto a single `chapter_sources` row downstream
and only the first manga's chapter row survived. The new
`derive_chapter_key_from_url` strips a trailing `/pg-\d+/` before
picking the chapter-identifying segment (`br_chapter-N` / `to_chapter-N`).

Notices, hiatus rows, and duplicate-numbered chapters are preserved as
distinct parser entries. The (manga_id, number) UNIQUE collapse in the
chapters table is a separate, follow-up concern handled in
feat/chapter-id-routing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 backend/Cargo.lock                            |   2 +-
 backend/Cargo.toml                            |   2 +-
 backend/src/crawler/source/target.rs          | 119 ++++++++++-
 .../fixtures/target/chapter_list_uu.html      | 194 ++++++++++++++++++
 frontend/package.json                         |   2 +-
 5 files changed, 315 insertions(+), 4 deletions(-)
 create mode 100644 backend/tests/fixtures/target/chapter_list_uu.html
diff --git a/backend/Cargo.lock b/backend/Cargo.lock
index 0c14ad0..df2da46 100644
--- a/backend/Cargo.lock
+++ b/backend/Cargo.lock
@@ -1415,7 +1415,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
 
 [[package]]
 name = "mangalord"
-version = "0.23.0"
+version = "0.23.1"
 dependencies = [
  "anyhow",
  "argon2",
diff --git a/backend/Cargo.toml b/backend/Cargo.toml
index fbf29c9..4900858 100644
--- a/backend/Cargo.toml
+++ b/backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "mangalord"
-version = "0.23.0"
+version = "0.23.1"
 edition = "2021"
 default-run = "mangalord"
 
diff --git a/backend/src/crawler/source/target.rs b/backend/src/crawler/source/target.rs
index 633611d..375ba69 100644
--- a/backend/src/crawler/source/target.rs
+++ b/backend/src/crawler/source/target.rs
@@ -334,7 +334,7 @@ fn parse_chapter_list(doc: &scraper::Html) -> Vec<SourceChapterRef> {
             let title_text = collapse_whitespace(&a.text().collect::<String>());
             let number = parse_chapter_number(&title_text).unwrap_or(0);
             Some(SourceChapterRef {
-                source_chapter_key: derive_key_from_url(&url),
+                source_chapter_key: derive_chapter_key_from_url(&url),
                 number,
                 title: (!title_text.is_empty()).then_some(title_text),
                 url,
@@ -366,6 +366,29 @@ fn derive_key_from_url(url: &str) -> String {
         .to_string()
 }
 
+/// Derives a chapter's `source_chapter_key` from its listing URL. Those URLs point
+/// at the reader's page 1 (e.g. `.../uu/br_chapter-379272/pg-1/`), so the last path
+/// segment is `"pg-1"` for every chapter and would collapse them all under one key.
+/// Drops the query string, trailing slashes and one trailing `pg-<ASCII digits>`
+/// segment, then returns the last non-empty segment (`br_chapter-N` or
+/// `to_chapter-N`), or `url` unchanged if none is left. NOTE(review): `#fragment` kept.
+fn derive_chapter_key_from_url(url: &str) -> String {
+    let trimmed = url.split('?').next().unwrap_or(url).trim_end_matches('/');
+    let without_reader_page = match trimmed.rsplit_once('/') {
+        Some((prefix, last)) if is_reader_page_segment(last) => prefix,
+        _ => trimmed,
+    };
+    without_reader_page
+        .rsplit('/')
+        .find(|s| !s.is_empty())
+        .unwrap_or(url)
+        .to_string()
+}
+
+fn is_reader_page_segment(s: &str) -> bool {
+    s.len() > 3 && s.starts_with("pg-") && s[3..].bytes().all(|b| b.is_ascii_digit())
+}
+
 fn first_text(doc: &scraper::Html, sel: &str) -> Option<String> {
     let s = scraper::Selector::parse(sel).ok()?;
     let el = doc.select(&s).next()?;
@@ -577,6 +600,61 @@ mod tests {
         assert_eq!(strip_tag_count("Tag (a) (12)"), "Tag (a)");
     }
 
+    #[test]
+    fn parse_chapter_list_keeps_all_chapters_with_unique_keys() {
+        // Real listing fixture from the target site. 15 rows: chapters
+        // with various Ch.N markup, one hiatus row (spelled "hitaus." in
+        // the fixture), three "notice." rows, and duplicates of Ch.1 and
+        // Ch.52 from different uploaders. Every row must survive parsing
+        // with a distinct source_chapter_key — chapter URLs all end in
+        // `/pg-1/` (the reader's page-1 entry point), so a naive
+        // last-segment-of-URL derivation returns "pg-1" for every row,
+        // collapsing the whole list into one downstream chapter row.
+        let html = include_str!(
+            "../../../tests/fixtures/target/chapter_list_uu.html"
+        );
+        let doc = scraper::Html::parse_document(html);
+        let chapters = parse_chapter_list(&doc);
+
+        assert_eq!(chapters.len(), 15, "every row kept (notices/hiatus included)");
+
+        let mut keys: Vec<&str> =
+            chapters.iter().map(|c| c.source_chapter_key.as_str()).collect();
+        keys.sort();
+        let dupe = keys.windows(2).find(|w| w[0] == w[1]).map(|w| w[0]);
+        assert!(dupe.is_none(), "duplicate chapter key: {dupe:?}");
+        for c in &chapters {
+            assert_ne!(
+                c.source_chapter_key, "pg-1",
+                "key must not be the reader-page segment: {:?}", c
+            );
+        }
+
+        // The fixture's first row is the latest chapter (Ch.67, May 20, 2026).
+        assert_eq!(chapters[0].number, 67);
+        assert_eq!(chapters[0].title.as_deref(), Some("Ch.67 : Official"));
+        assert_eq!(chapters[0].source_chapter_key, "br_chapter-379272");
+
+        // Duplicate-number chapters (different uploaders) survive as
+        // two rows. The (manga_id, number) UNIQUE collapse is a
+        // downstream schema concern handled separately.
+        assert_eq!(
+            chapters.iter().filter(|c| c.number == 52).count(),
+            2,
+            "two Ch.52 uploads must both survive parsing"
+        );
+        assert_eq!(
+            chapters.iter().filter(|c| c.number == 1).count(),
+            2,
+            "Ch.1 Official and Ch.1 Team Hazama are both kept"
+        );
+
+        // Notices / hiatus rows contain no digits, so their number falls
+        // back to 0 (`unwrap_or(0)`). They are not filtered out.
+        let zero = chapters.iter().filter(|c| c.number == 0).count();
+        assert!(zero >= 4, "hiatus + 3 notices kept; got {zero}");
+    }
+
     #[test]
     fn parse_chapter_number_grabs_first_integer_run() {
         assert_eq!(parse_chapter_number("Ch.1"), Some(1));
@@ -630,6 +708,45 @@ mod tests {
         assert_eq!(derive_key_from_url("/manga/bar"), "bar");
     }
 
+    #[test]
+    fn derive_chapter_key_strips_trailing_reader_page_segment() {
+        // Listing links go to page 1 of the reader; the trailing pg-N segment is dropped.
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-1/"),
+            "br_chapter-379272"
+        );
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/to_chapter-13/pg-1/"),
+            "to_chapter-13"
+        );
+        // Defensive: deep-link to a non-first page should still resolve
+        // to the same chapter identity.
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-25/"),
+            "br_chapter-379272"
+        );
+        // No reader-page suffix → the last path segment is returned as-is.
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/br_chapter-379272/"),
+            "br_chapter-379272"
+        );
+        // Query strings are stripped before the reader-page check.
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-1/?ref=x"),
+            "br_chapter-379272"
+        );
+        // `pg-foo` has non-digit characters after `pg-`, so it is kept as the key.
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/something/pg-foo/"),
+            "pg-foo"
+        );
+        // Bare `pg-` (no digits) is likewise kept.
+        assert_eq!(
+            derive_chapter_key_from_url(".../uu/something/pg-/"),
+            "pg-"
+        );
+    }
+
     #[test]
     fn metadata_hash_is_stable_and_field_sensitive() {
         let base = parse_manga_detail(DETAIL_HTML, "k", true).unwrap();
diff --git a/backend/tests/fixtures/target/chapter_list_uu.html b/backend/tests/fixtures/target/chapter_list_uu.html
new file mode 100644
index 0000000..7b1bafe
--- /dev/null
+++ b/backend/tests/fixtures/target/chapter_list_uu.html
@@ -0,0 +1,194 @@
+<table class="listing" id="chapter_table">
+	<tbody>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-379272/pg-1/"><b>Ch.67</b>
+						: Official </a>
+					<b style="color:#FEFD7F;width;30px;display:inline-block;margin-left:5px">new</b>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">May 20, 2026</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-328248/pg-1/"><b>hitaus.</b>
+					</a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Jan 15, 2026</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-326351/pg-1/"><b>Ch.66</b>
+						: Official </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Jan 10, 2026</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-295078/pg-1/"><b>Ch.52</b>
+						: Official </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Aug 28, 2025</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-294815/pg-1/"><b>Ch.52</b>
+						: Official </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../4300634/upload/">mina</a>
+			</td>
+			<td class="no">Aug 27, 2025</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-249964/pg-1/"><b>Ch.10</b>
+						: Official </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Jan 5, 2025</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/to_chapter-13/pg-1/"><b>Ch.13</b>
+						: Thank you, we'll see you in the next one! </a>
+				</h4>
+			</td>
+			<td class="no"></td>
+			<td class="no">Dec 30, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-249095/pg-1/"><b>Ch.9</b>
+						: Official </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Dec 28, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-248930/pg-1/"><b>Ch.1</b>
+						: Official </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Dec 26, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/to_chapter-12/pg-1/"><b>Ch.12</b>
+					</a>
+				</h4>
+			</td>
+			<td class="no"></td>
+			<td class="no">Dec 1, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-244844/pg-1/"><b>notice.</b>
+						: Officials </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Nov 26, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/to_chapter-11/pg-1/"><b>Ch.11</b>
+					</a>
+				</h4>
+			</td>
+			<td class="no"></td>
+			<td class="no">Nov 18, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-221180/pg-1/"><b>notice.</b>
+					</a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../3781074/upload/">Izanami</a>
+			</td>
+			<td class="no">Jun 21, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-234803/pg-1/"><b>notice.</b>
+					</a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../2843005/upload/">bloomingdale</a>
+			</td>
+			<td class="no">Sep 13, 2024</td>
+		</tr>
+		<tr>
+			<td>
+				<h4>
+					<a class="chico"
+						href=".../uu/br_chapter-220299/pg-1/"><b>Ch.1</b>
+						: Team Hazama </a>
+				</h4>
+			</td>
+			<td class="no">
+				<a href=".../1457681/upload/">purplepandabear</a>
+			</td>
+			<td class="no">Jun 16, 2024</td>
+		</tr>
+	</tbody>
+</table>
diff --git a/frontend/package.json b/frontend/package.json
index 813cb2d..c4df14d 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,6 +1,6 @@
 {
   "name": "mangalord-frontend",
-  "version": "0.23.0",
+  "version": "0.23.1",
   "private": true,
   "type": "module",
   "scripts": {

+ + Ch.67 + : Official + new + +	+ bloomingdale +	May 20, 2026
+ + hitaus. + + +	+ bloomingdale +	Jan 15, 2026
+ + Ch.66 + : Official + +	+ bloomingdale +	Jan 10, 2026
+ + Ch.52 + : Official + +	+ bloomingdale +	Aug 28, 2025
+ + Ch.52 + : Official + +	+ mina +	Aug 27, 2025
+ + Ch.10 + : Official + +	+ bloomingdale +	Jan 5, 2025
+ + Ch.13 + : Thank you, we'll see you in the next one! + +		Dec 30, 2024
+ + Ch.9 + : Official + +	+ bloomingdale +	Dec 28, 2024
+ + Ch.1 + : Official + +	+ bloomingdale +	Dec 26, 2024
+ + Ch.12 + + +		Dec 1, 2024
+ + notice. + : Officials + +	+ bloomingdale +	Nov 26, 2024
+ + Ch.11 + + +		Nov 18, 2024
+ + notice. + + +	+ Izanami +	Jun 21, 2024
+ + notice. + + +	+ bloomingdale +	Sep 13, 2024
+ + Ch.1 + : Team Hazama + +	+ purplepandabear +	Jun 16, 2024