fix(crawler): scope chapter_sources lookup per-manga (0.35.5)

chapter_sources's PRIMARY KEY was (source_id, source_chapter_key) and the lookup in sync_manga_chapters didn't constrain by manga_id, so a source whose chapter slugs aren't globally unique (e.g. "chapter-1" appearing under multiple mangas) silently attributed every collision to the first manga that synced it. The INSERT path would have conflicted on the second manga's sync. Migration 0017 drops the old PK and rekeys on (source_id, chapter_id) — the natural identity of a per-source chapter attachment — and adds an index on (source_id, source_chapter_key) for the lookup path. The repo lookup now joins chapters and filters by manga_id; the UPDATE path keys on chapter_id directly (the row's natural identifier post-migration). Test sync_chapters_isolates_colliding_keys_across_mangas pins the contract end-to-end. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-29 20:43:08 +02:00
parent 50763addcf
commit c6bb9160e3
6 changed files with 156 additions and 6 deletions
--- a/backend/tests/crawler_sync.rs
+++ b/backend/tests/crawler_sync.rs
@@ -308,6 +308,121 @@ async fn sync_chapters_keeps_duplicate_numbered_chapters_as_separate_rows(pool:
    assert_eq!(ch52_count.0, 2, "both Ch.52 uploads survive as separate rows");
 }

+#[sqlx::test(migrations = "./migrations")]
+async fn sync_chapters_isolates_colliding_keys_across_mangas(pool: PgPool) {
+    // Regression test for migration 0017: two mangas on one source each
+    // expose a chapter whose source_chapter_key is "chapter-1". Before
+    // 0017 the PK was (source_id, source_chapter_key) and the lookup did
+    // not filter by manga_id, so the second manga's sync would adopt the
+    // first manga's chapter_id (silent attribution corruption). After
+    // 0017 each manga owns its own row.
+    crawler::ensure_source(&pool, "target", "T", "https://x.example")
+        .await
+        .unwrap();
+    let m1 = sample_manga("foo", "Manga Foo", "hash-foo");
+    let m2 = sample_manga("bar", "Manga Bar", "hash-bar");
+    let up1 = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m1)
+        .await
+        .unwrap();
+    let up2 = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/bar", &m2)
+        .await
+        .unwrap();
+    assert_ne!(up1.manga_id, up2.manga_id);
+
+    let shared = vec![SourceChapterRef {
+        source_chapter_key: "chapter-1".into(),
+        number: 1,
+        title: Some("Ch.1".into()),
+        url: "https://x.example/foo/chapter-1/".into(),
+    }];
+    let diff1 = crawler::sync_manga_chapters(&pool, "target", up1.manga_id, &shared)
+        .await
+        .unwrap();
+    assert_eq!(diff1.new, 1, "manga foo: chapter inserted fresh");
+
+    // Manga bar now syncs *the same key* with its own title and URL. Under
+    // the old schema this would either fail on PK conflict or attribute
+    // the chapter to foo; now bar must get its own chapter row.
+    let bar_chapters = vec![SourceChapterRef {
+        source_chapter_key: "chapter-1".into(),
+        number: 1,
+        title: Some("Ch.1 (bar)".into()),
+        url: "https://x.example/bar/chapter-1/".into(),
+    }];
+    let diff2 = crawler::sync_manga_chapters(&pool, "target", up2.manga_id, &bar_chapters)
+        .await
+        .unwrap();
+    assert_eq!(
+        diff2.new, 1,
+        "manga bar: same key resolved per-manga to a fresh row"
+    );
+
+    let foo_count: (i64,) = sqlx::query_as(
+        "SELECT COUNT(*) FROM chapters WHERE manga_id = $1",
+    )
+    .bind(up1.manga_id)
+    .fetch_one(&pool)
+    .await
+    .unwrap();
+    let bar_count: (i64,) = sqlx::query_as(
+        "SELECT COUNT(*) FROM chapters WHERE manga_id = $1",
+    )
+    .bind(up2.manga_id)
+    .fetch_one(&pool)
+    .await
+    .unwrap();
+    assert_eq!(foo_count.0, 1);
+    assert_eq!(bar_count.0, 1);
+
+    let bar_title: (Option<String>,) = sqlx::query_as(
+        "SELECT title FROM chapters WHERE manga_id = $1 AND number = 1",
+    )
+    .bind(up2.manga_id)
+    .fetch_one(&pool)
+    .await
+    .unwrap();
+    assert_eq!(
+        bar_title.0.as_deref(),
+        Some("Ch.1 (bar)"),
+        "bar's chapter has bar's title, not foo's"
+    );
+
+    // Re-syncing foo under the same key with a new title must refresh
+    // foo's row only (refreshed = 1, new = 0) and leave bar's title alone.
+    let foo_resync = vec![SourceChapterRef {
+        source_chapter_key: "chapter-1".into(),
+        number: 1,
+        title: Some("Ch.1 (foo updated)".into()),
+        url: "https://x.example/foo/chapter-1/".into(),
+    }];
+    let diff_refresh = crawler::sync_manga_chapters(&pool, "target", up1.manga_id, &foo_resync)
+        .await
+        .unwrap();
+    assert_eq!(diff_refresh.refreshed, 1);
+    assert_eq!(diff_refresh.new, 0);
+
+    let foo_title: (Option<String>,) = sqlx::query_as(
+        "SELECT title FROM chapters WHERE manga_id = $1 AND number = 1",
+    )
+    .bind(up1.manga_id)
+    .fetch_one(&pool)
+    .await
+    .unwrap();
+    assert_eq!(foo_title.0.as_deref(), Some("Ch.1 (foo updated)"));
+    let bar_title_after: (Option<String>,) = sqlx::query_as(
+        "SELECT title FROM chapters WHERE manga_id = $1 AND number = 1",
+    )
+    .bind(up2.manga_id)
+    .fetch_one(&pool)
+    .await
+    .unwrap();
+    assert_eq!(
+        bar_title_after.0.as_deref(),
+        Some("Ch.1 (bar)"),
+        "bar's row is untouched by foo's refresh"
+    );
+}
+
 #[sqlx::test(migrations = "./migrations")]
 async fn mark_dropped_mangas_only_drops_unseen(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")