fix(crawler): scope chapter_sources lookup per-manga (0.35.5)

chapter_sources's PRIMARY KEY was (source_id, source_chapter_key) and the lookup in sync_manga_chapters didn't constrain by manga_id, so a source whose chapter slugs aren't globally unique (e.g. "chapter-1" appearing under multiple mangas) silently attributed every collision to the first manga that synced it. The INSERT path would have conflicted on the second manga's sync. Migration 0017 drops the old PK and rekeys on (source_id, chapter_id) — the natural identity of a per-source chapter attachment — and adds an index on (source_id, source_chapter_key) for the lookup path. The repo lookup now joins chapters and filters by manga_id; the UPDATE path keys on chapter_id directly (the row's natural identifier post-migration). Test sync_chapters_isolates_colliding_keys_across_mangas pins the contract end-to-end. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-29 20:43:08 +02:00
parent 50763addcf
commit c6bb9160e3
6 changed files with 156 additions and 6 deletions
--- a/backend/Cargo.lock
+++ b/backend/Cargo.lock
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"

 [[package]]
 name = "mangalord"
-version = "0.35.4"
+version = "0.35.5"
 dependencies = [
 "anyhow",
 "argon2",
--- a/backend/Cargo.toml
+++ b/backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "mangalord"
-version = "0.35.4"
+version = "0.35.5"
 edition = "2021"
 default-run = "mangalord"

--- a/backend/migrations/0017_chapter_sources_per_manga.sql
+++ b/backend/migrations/0017_chapter_sources_per_manga.sql
@@ -0,0 +1,20 @@
+-- chapter_sources: drop the global (source_id, source_chapter_key) PK
+-- and rekey on (source_id, chapter_id).
+--
+-- The old PK assumed chapter slugs are unique per source. Sources whose
+-- chapter naming is per-manga (chapter-1, chapter-2, ...) instead of per-
+-- catalog (br_chapter-379272 with a global counter) would collide on the
+-- second manga: the INSERT would conflict on (source_id, "chapter-1") and
+-- the lookup would attribute the row to the first manga's chapter_id.
+--
+-- The new key is the natural identity of a source attachment: "this source
+-- has this chapter". An (source_id, source_chapter_key) index preserves
+-- the lookup path (find existing source row by source's identifier) but
+-- no longer enforces uniqueness — the application combines it with the
+-- chapters table's manga_id to scope the lookup per-manga.
+
+ALTER TABLE chapter_sources DROP CONSTRAINT chapter_sources_pkey;
+ALTER TABLE chapter_sources ADD PRIMARY KEY (source_id, chapter_id);
+
+CREATE INDEX chapter_sources_source_key_idx
+    ON chapter_sources (source_id, source_chapter_key);
--- a/backend/src/repo/crawler.rs
+++ b/backend/src/repo/crawler.rs
@@ -335,11 +335,23 @@ pub async fn sync_manga_chapters(
        .collect();

    for c in chapters {
+        // Lookup is constrained by manga_id (via the chapters join) so a
+        // source whose chapter slugs collide across mangas (e.g.
+        // "chapter-1" appearing under two different mangas) attributes
+        // each row to the correct manga. Migration 0017 dropped the
+        // (source_id, source_chapter_key) PK in favour of
+        // (source_id, chapter_id) for exactly this reason.
        let existing: Option<(Uuid,)> = sqlx::query_as(
-            "SELECT chapter_id FROM chapter_sources WHERE source_id = $1 AND source_chapter_key = $2",
+            "SELECT cs.chapter_id \
+               FROM chapter_sources cs \
+               JOIN chapters ch ON ch.id = cs.chapter_id \
+              WHERE cs.source_id = $1 \
+                AND cs.source_chapter_key = $2 \
+                AND ch.manga_id = $3",
        )
        .bind(source_id)
        .bind(&c.source_chapter_key)
+        .bind(manga_id)
        .fetch_optional(&mut *tx)
        .await?;

@@ -383,16 +395,19 @@ pub async fn sync_manga_chapters(
                    .bind(chapter_id)
                    .execute(&mut *tx)
                    .await?;
+                // chapter_id is now the natural per-(source, chapter)
+                // identifier — use it directly instead of re-keying on
+                // (source_id, source_chapter_key) which may not be unique.
                sqlx::query(
                    r#"
                    UPDATE chapter_sources
                       SET source_url = $1, last_seen_at = NOW(), dropped_at = NULL
-                     WHERE source_id = $2 AND source_chapter_key = $3
+                     WHERE source_id = $2 AND chapter_id = $3
                    "#,
                )
                .bind(&c.url)
                .bind(source_id)
-                .bind(&c.source_chapter_key)
+                .bind(chapter_id)
                .execute(&mut *tx)
                .await?;
                diff.refreshed += 1;
--- a/backend/tests/crawler_sync.rs
+++ b/backend/tests/crawler_sync.rs
@@ -308,6 +308,121 @@ async fn sync_chapters_keeps_duplicate_numbered_chapters_as_separate_rows(pool:
    assert_eq!(ch52_count.0, 2, "both Ch.52 uploads survive as separate rows");
 }

+#[sqlx::test(migrations = "./migrations")]
+async fn sync_chapters_isolates_colliding_keys_across_mangas(pool: PgPool) {
+    // Two mangas, both with a chapter whose source_chapter_key is
+    // "chapter-1". Pre-migration-0017 the PK enforced (source_id,
+    // source_chapter_key) globally and the lookup didn't filter by
+    // manga_id, so the second manga's sync would adopt the first manga's
+    // chapter_id (silent attribution corruption). After 0017 each manga
+    // owns its own row.
+    crawler::ensure_source(&pool, "target", "T", "https://x.example")
+        .await
+        .unwrap();
+    let m1 = sample_manga("foo", "Manga Foo", "hash-foo");
+    let m2 = sample_manga("bar", "Manga Bar", "hash-bar");
+    let up1 = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m1)
+        .await
+        .unwrap();
+    let up2 = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/bar", &m2)
+        .await
+        .unwrap();
+    assert_ne!(up1.manga_id, up2.manga_id);
+
+    let shared = vec![SourceChapterRef {
+        source_chapter_key: "chapter-1".into(),
+        number: 1,
+        title: Some("Ch.1".into()),
+        url: "https://x.example/foo/chapter-1/".into(),
+    }];
+    let diff1 = crawler::sync_manga_chapters(&pool, "target", up1.manga_id, &shared)
+        .await
+        .unwrap();
+    assert_eq!(diff1.new, 1, "manga foo: chapter inserted fresh");
+
+    // Manga bar now syncs *the same key*. Under the old schema this would
+    // either fail on PK conflict or attribute the chapter to foo. Under
+    // the new schema bar gets its own chapter row.
+    let bar_chapters = vec![SourceChapterRef {
+        source_chapter_key: "chapter-1".into(),
+        number: 1,
+        title: Some("Ch.1 (bar)".into()),
+        url: "https://x.example/bar/chapter-1/".into(),
+    }];
+    let diff2 = crawler::sync_manga_chapters(&pool, "target", up2.manga_id, &bar_chapters)
+        .await
+        .unwrap();
+    assert_eq!(
+        diff2.new, 1,
+        "manga bar: same key resolved per-manga to a fresh row"
+    );
+
+    let foo_count: (i64,) = sqlx::query_as(
+        "SELECT COUNT(*) FROM chapters WHERE manga_id = $1",
+    )
+    .bind(up1.manga_id)
+    .fetch_one(&pool)
+    .await
+    .unwrap();
+    let bar_count: (i64,) = sqlx::query_as(
+        "SELECT COUNT(*) FROM chapters WHERE manga_id = $1",
+    )
+    .bind(up2.manga_id)
+    .fetch_one(&pool)
+    .await
+    .unwrap();
+    assert_eq!(foo_count.0, 1);
+    assert_eq!(bar_count.0, 1);
+
+    let bar_title: (Option<String>,) = sqlx::query_as(
+        "SELECT title FROM chapters WHERE manga_id = $1 AND number = 1",
+    )
+    .bind(up2.manga_id)
+    .fetch_one(&pool)
+    .await
+    .unwrap();
+    assert_eq!(
+        bar_title.0.as_deref(),
+        Some("Ch.1 (bar)"),
+        "bar's chapter has bar's title, not foo's"
+    );
+
+    // A subsequent re-sync of foo with the same key correctly refreshes
+    // foo's row, not bar's.
+    let foo_resync = vec![SourceChapterRef {
+        source_chapter_key: "chapter-1".into(),
+        number: 1,
+        title: Some("Ch.1 (foo updated)".into()),
+        url: "https://x.example/foo/chapter-1/".into(),
+    }];
+    let diff_refresh = crawler::sync_manga_chapters(&pool, "target", up1.manga_id, &foo_resync)
+        .await
+        .unwrap();
+    assert_eq!(diff_refresh.refreshed, 1);
+    assert_eq!(diff_refresh.new, 0);
+
+    let foo_title: (Option<String>,) = sqlx::query_as(
+        "SELECT title FROM chapters WHERE manga_id = $1 AND number = 1",
+    )
+    .bind(up1.manga_id)
+    .fetch_one(&pool)
+    .await
+    .unwrap();
+    assert_eq!(foo_title.0.as_deref(), Some("Ch.1 (foo updated)"));
+    let bar_title_after: (Option<String>,) = sqlx::query_as(
+        "SELECT title FROM chapters WHERE manga_id = $1 AND number = 1",
+    )
+    .bind(up2.manga_id)
+    .fetch_one(&pool)
+    .await
+    .unwrap();
+    assert_eq!(
+        bar_title_after.0.as_deref(),
+        Some("Ch.1 (bar)"),
+        "bar's row is untouched by foo's refresh"
+    );
+}
+
 #[sqlx::test(migrations = "./migrations")]
 async fn mark_dropped_mangas_only_drops_unseen(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,6 +1,6 @@
 {
  "name": "mangalord-frontend",
-  "version": "0.35.4",
+  "version": "0.35.5",
  "private": true,
  "type": "module",
  "scripts": {