bugfix: chapter source key uses chapter id, not /pg-1/ (0.23.1)

Listing links point at the reader's page 1
(`.../uu/br_chapter-N/pg-1/`). The generic `derive_key_from_url` took
the last URL segment and returned `"pg-1"` for every chapter, so all
parsed chapters collapsed onto a single `chapter_sources` row downstream
and the first-manga chapter was the only row that survived. New
`derive_chapter_key_from_url` strips a trailing `/pg-\d+/` before
picking the chapter-identifying segment (`br_chapter-N` / `to_chapter-N`).

Notices, hiatus rows, and duplicate-numbered chapters are preserved as
distinct parser entries. The (manga_id, number) UNIQUE collapse in the
chapters table is a separate, follow-up concern handled in
feat/chapter-id-routing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-22 23:15:36 +02:00
parent b1a3a4e9d3
commit c51353ead3
5 changed files with 315 additions and 4 deletions

2
backend/Cargo.lock generated
View File

@@ -1415,7 +1415,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "mangalord"
version = "0.23.0"
version = "0.23.1"
dependencies = [
"anyhow",
"argon2",

View File

@@ -1,6 +1,6 @@
[package]
name = "mangalord"
version = "0.23.0"
version = "0.23.1"
edition = "2021"
default-run = "mangalord"

View File

@@ -334,7 +334,7 @@ fn parse_chapter_list(doc: &scraper::Html) -> Vec<SourceChapterRef> {
let title_text = collapse_whitespace(&a.text().collect::<String>());
let number = parse_chapter_number(&title_text).unwrap_or(0);
Some(SourceChapterRef {
source_chapter_key: derive_key_from_url(&url),
source_chapter_key: derive_chapter_key_from_url(&url),
number,
title: (!title_text.is_empty()).then_some(title_text),
url,
@@ -366,6 +366,29 @@ fn derive_key_from_url(url: &str) -> String {
.to_string()
}
/// Derives the per-chapter source key from a chapter listing URL.
///
/// Chapter URLs on this source point at the reader's page 1, e.g.
/// `.../uu/br_chapter-379272/pg-1/`. The chapter identity is the
/// `br_chapter-N` (or `to_chapter-N`) segment — the `pg-\d+` segment
/// identifies a page *within* a chapter, so naively taking the last
/// path component returns `"pg-1"` for every chapter and collapses
/// them all under one source_chapter_key downstream.
///
/// Steps:
/// 1. Drop everything from the first `?` or `#` onward, then any
///    trailing slashes.
/// 2. If the last path segment is a reader-page segment (`pg-<digits>`),
///    drop it.
/// 3. Return the last non-empty segment of what remains.
///
/// If no non-empty segment remains, `url` is returned unchanged.
fn derive_chapter_key_from_url(url: &str) -> String {
    // A fragment must be removed as well as a query string: otherwise
    // `.../pg-1/#top` would yield `"#top"` and every chapter carrying the
    // same fragment would share a single key.
    let path = url
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or(url)
        .trim_end_matches('/');

    // Only a *trailing* reader-page segment is removed; anything else in the
    // last position is taken to be the chapter identity itself.
    let chapter_path = match path.rsplit_once('/') {
        Some((parent, last)) if is_reader_page_segment(last) => parent,
        _ => path,
    };

    chapter_path
        .rsplit('/')
        .find(|segment| !segment.is_empty())
        .unwrap_or(url)
        .to_string()
}

/// Returns `true` when `s` is exactly `pg-` followed by one or more ASCII
/// digits (`pg-1`, `pg-25`). A bare `pg-` or a non-numeric suffix such as
/// `pg-foo` is not a reader-page segment.
fn is_reader_page_segment(s: &str) -> bool {
    match s.strip_prefix("pg-") {
        Some(digits) => !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit()),
        None => false,
    }
}
fn first_text(doc: &scraper::Html, sel: &str) -> Option<String> {
let s = scraper::Selector::parse(sel).ok()?;
let el = doc.select(&s).next()?;
@@ -577,6 +600,61 @@ mod tests {
assert_eq!(strip_tag_count("Tag (a) (12)"), "Tag (a)");
}
#[test]
fn parse_chapter_list_keeps_all_chapters_with_unique_keys() {
    // Fixture: a 15-row chapter listing captured from the target site.
    // It mixes ordinary `Ch.N` rows, one hiatus row, three "notice."
    // rows, and two pairs of same-numbered chapters (Ch.1 and Ch.52)
    // posted by different uploaders. Every link ends in `/pg-1/` (the
    // reader's first page), so a key taken from the last URL segment
    // would be "pg-1" for all rows and the list would collapse into a
    // single chapter downstream.
    let doc = scraper::Html::parse_document(include_str!(
        "../../../tests/fixtures/target/chapter_list_uu.html"
    ));
    let chapters = parse_chapter_list(&doc);

    // Nothing is dropped by the parser.
    assert_eq!(chapters.len(), 15, "every row kept (notices/hiatus included)");

    // Keys are pairwise distinct: after sorting, equal keys are adjacent.
    let mut keys = chapters
        .iter()
        .map(|c| c.source_chapter_key.as_str())
        .collect::<Vec<&str>>();
    keys.sort();
    let dupe = keys
        .iter()
        .zip(keys.iter().skip(1))
        .find(|(left, right)| left == right)
        .map(|(left, _)| *left);
    assert!(dupe.is_none(), "duplicate chapter key: {dupe:?}");

    // No key may be the reader-page segment itself.
    chapters.iter().for_each(|c| {
        assert_ne!(
            c.source_chapter_key, "pg-1",
            "key must not be the reader-page segment: {:?}", c
        );
    });

    // The listing is ordered newest → oldest, so row 0 is the latest chapter.
    let newest = &chapters[0];
    assert_eq!(newest.number, 67);
    assert_eq!(newest.title.as_deref(), Some("Ch.67 : Official"));
    assert_eq!(newest.source_chapter_key, "br_chapter-379272");

    // Same-numbered uploads from different uploaders stay as separate
    // parser entries; the (manga_id, number) UNIQUE collapse is a
    // downstream schema matter, not the parser's job.
    let ch52_uploads = chapters.iter().filter(|c| c.number == 52).count();
    assert_eq!(
        ch52_uploads,
        2,
        "two Ch.52 uploads must both survive parsing"
    );
    let ch1_uploads = chapters.iter().filter(|c| c.number == 1).count();
    assert_eq!(
        ch1_uploads,
        2,
        "Ch.1 Official and Ch.1 Team Hazama are both kept"
    );

    // Notice and hiatus titles carry no chapter number, so they come
    // back as number=0 and must still be present.
    let zero = chapters.iter().filter(|c| c.number == 0).count();
    assert!(zero >= 4, "hiatus + 3 notices kept; got {zero}");
}
#[test]
fn parse_chapter_number_grabs_first_integer_run() {
assert_eq!(parse_chapter_number("Ch.1"), Some(1));
@@ -630,6 +708,45 @@ mod tests {
assert_eq!(derive_key_from_url("/manga/bar"), "bar");
}
#[test]
fn derive_chapter_key_strips_trailing_reader_page_segment() {
    // (input URL, expected key) pairs.
    let cases = [
        // Listing links go to page 1 of the reader; the trailing
        // `/pg-<digits>/` is removed.
        (".../uu/br_chapter-379272/pg-1/", "br_chapter-379272"),
        (".../uu/to_chapter-13/pg-1/", "to_chapter-13"),
        // A deep link to a later page resolves to the same chapter.
        (".../uu/br_chapter-379272/pg-25/", "br_chapter-379272"),
        // Without a reader-page suffix the last segment is the key,
        // matching derive_key_from_url.
        (".../uu/br_chapter-379272/", "br_chapter-379272"),
        // The query string is ignored.
        (".../uu/br_chapter-379272/pg-1/?ref=x", "br_chapter-379272"),
        // `pg-foo` is not a reader-page segment, so it is the key.
        (".../uu/something/pg-foo/", "pg-foo"),
        // A bare `pg-` with no digits is not stripped either.
        (".../uu/something/pg-/", "pg-"),
    ];

    for (input, expected) in cases {
        // The input is echoed on failure because several cases share
        // the same expected key.
        assert_eq!(
            derive_chapter_key_from_url(input),
            expected,
            "input: {input:?}"
        );
    }
}
#[test]
fn metadata_hash_is_stable_and_field_sensitive() {
let base = parse_manga_detail(DETAIL_HTML, "k", true).unwrap();

View File

@@ -0,0 +1,194 @@
<table class="listing" id="chapter_table">
<tbody>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/br_chapter-379272/pg-1/"><b>Ch.67</b>
: Official </a>
<b style="color:#FEFD7F;width;30px;display:inline-block;margin-left:5px">new</b>
</h4>
</td>
<td class="no">
<a href=".../2843005/upload/">bloomingdale</a>
</td>
<td class="no">May 20, 2026</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/br_chapter-328248/pg-1/"><b>hitaus.</b>
</a>
</h4>
</td>
<td class="no">
<a href=".../2843005/upload/">bloomingdale</a>
</td>
<td class="no">Jan 15, 2026</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/br_chapter-326351/pg-1/"><b>Ch.66</b>
: Official </a>
</h4>
</td>
<td class="no">
<a href=".../2843005/upload/">bloomingdale</a>
</td>
<td class="no">Jan 10, 2026</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/br_chapter-295078/pg-1/"><b>Ch.52</b>
: Official </a>
</h4>
</td>
<td class="no">
<a href=".../2843005/upload/">bloomingdale</a>
</td>
<td class="no">Aug 28, 2025</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/br_chapter-294815/pg-1/"><b>Ch.52</b>
: Official </a>
</h4>
</td>
<td class="no">
<a href=".../4300634/upload/">mina</a>
</td>
<td class="no">Aug 27, 2025</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/br_chapter-249964/pg-1/"><b>Ch.10</b>
: Official </a>
</h4>
</td>
<td class="no">
<a href=".../2843005/upload/">bloomingdale</a>
</td>
<td class="no">Jan 5, 2025</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/to_chapter-13/pg-1/"><b>Ch.13</b>
: Thank you, we'll see you in the next one! </a>
</h4>
</td>
<td class="no"></td>
<td class="no">Dec 30, 2024</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/br_chapter-249095/pg-1/"><b>Ch.9</b>
: Official </a>
</h4>
</td>
<td class="no">
<a href=".../2843005/upload/">bloomingdale</a>
</td>
<td class="no">Dec 28, 2024</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/br_chapter-248930/pg-1/"><b>Ch.1</b>
: Official </a>
</h4>
</td>
<td class="no">
<a href=".../2843005/upload/">bloomingdale</a>
</td>
<td class="no">Dec 26, 2024</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/to_chapter-12/pg-1/"><b>Ch.12</b>
</a>
</h4>
</td>
<td class="no"></td>
<td class="no">Dec 1, 2024</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/br_chapter-244844/pg-1/"><b>notice.</b>
: Officials </a>
</h4>
</td>
<td class="no">
<a href=".../2843005/upload/">bloomingdale</a>
</td>
<td class="no">Nov 26, 2024</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/to_chapter-11/pg-1/"><b>Ch.11</b>
</a>
</h4>
</td>
<td class="no"></td>
<td class="no">Nov 18, 2024</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/br_chapter-221180/pg-1/"><b>notice.</b>
</a>
</h4>
</td>
<td class="no">
<a href=".../3781074/upload/">Izanami</a>
</td>
<td class="no">Jun 21, 2024</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/br_chapter-234803/pg-1/"><b>notice.</b>
</a>
</h4>
</td>
<td class="no">
<a href=".../2843005/upload/">bloomingdale</a>
</td>
<td class="no">Sep 13, 2024</td>
</tr>
<tr>
<td>
<h4>
<a class="chico"
href=".../uu/br_chapter-220299/pg-1/"><b>Ch.1</b>
: Team Hazama </a>
</h4>
</td>
<td class="no">
<a href=".../1457681/upload/">purplepandabear</a>
</td>
<td class="no">Jun 16, 2024</td>
</tr>
</tbody>
</table>

View File

@@ -1,6 +1,6 @@
{
"name": "mangalord-frontend",
"version": "0.23.0",
"version": "0.23.1",
"private": true,
"type": "module",
"scripts": {