//! Integration tests for `repo::crawler`. //! //! Each test runs against a fresh, migrated DB via `#[sqlx::test]`. //! `DATABASE_URL` must point to a Postgres where the test user can //! `CREATEDB`. use mangalord::crawler::source::{SourceChapterRef, SourceManga}; use mangalord::repo::crawler::{self, ChapterDiff, UpsertStatus}; use sqlx::PgPool; use uuid::Uuid; /// Helper to spin up a `SourceManga` fixture with a stable shape so /// each test can tweak just the fields it cares about. fn sample_manga(key: &str, title: &str, hash: &str) -> SourceManga { SourceManga { source_manga_key: key.to_string(), title: title.to_string(), alternative_titles: vec!["Alt 1".into()], authors: vec!["Author One".into()], // Action is in the seeded `genres` table; Fantasy is too. genres: vec!["Action".into(), "Fantasy".into()], tags: vec!["popular".into()], status: Some("ongoing".into()), summary: Some("Sample summary.".into()), cover_url: Some("/cover.jpg".into()), chapters: vec![], metadata_hash: hash.to_string(), } } #[sqlx::test(migrations = "./migrations")] async fn ensure_source_is_idempotent(pool: PgPool) { crawler::ensure_source(&pool, "target", "Target Site", "https://x.example") .await .unwrap(); crawler::ensure_source(&pool, "target", "Target Site v2", "https://x.example") .await .unwrap(); let count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM sources WHERE id = 'target'") .fetch_one(&pool) .await .unwrap(); assert_eq!(count.0, 1); let name: (String,) = sqlx::query_as("SELECT name FROM sources WHERE id = 'target'") .fetch_one(&pool) .await .unwrap(); assert_eq!(name.0, "Target Site v2", "name updates on re-call"); } #[sqlx::test(migrations = "./migrations")] async fn first_upsert_inserts_manga_and_links_metadata(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let m = sample_manga("foo", "Foo Manga", "hash-1"); let res = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); assert_eq!(res.status, UpsertStatus::New); // mangas row created let row: (String, String, Vec) = sqlx::query_as("SELECT title, status, alt_titles FROM mangas WHERE id = $1") .bind(res.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(row.0, "Foo Manga"); assert_eq!(row.1, "ongoing"); assert_eq!(row.2, vec!["Alt 1"]); // manga_sources row links the two let link: (String, Uuid, Option) = sqlx::query_as( "SELECT source_id, manga_id, metadata_hash FROM manga_sources WHERE source_manga_key = $1", ) .bind("foo") .fetch_one(&pool) .await .unwrap(); assert_eq!(link.0, "target"); assert_eq!(link.1, res.manga_id); assert_eq!(link.2.as_deref(), Some("hash-1")); // Authors, genres, tags M2M populated let n_authors: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM manga_authors WHERE manga_id = $1") .bind(res.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(n_authors.0, 1); let n_genres: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM manga_genres WHERE manga_id = $1") .bind(res.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(n_genres.0, 2, "Action + Fantasy"); let n_tags: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM manga_tags WHERE manga_id = $1") .bind(res.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(n_tags.0, 1); } #[sqlx::test(migrations = "./migrations")] async fn second_upsert_with_same_hash_reports_unchanged(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let m = sample_manga("foo", "Foo Manga", "hash-1"); let first = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); let second = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); assert_eq!(second.status, UpsertStatus::Unchanged); assert_eq!(second.manga_id, first.manga_id); } #[sqlx::test(migrations = "./migrations")] async fn upsert_with_changed_hash_updates_fields(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let mut m = sample_manga("foo", "Foo Manga", "hash-1"); let first = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); m.title = "Foo Manga (Revised)".into(); m.status = Some("completed".into()); m.metadata_hash = "hash-2".into(); let second = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); assert_eq!(second.status, UpsertStatus::Updated); assert_eq!(second.manga_id, first.manga_id); let row: (String, String) = sqlx::query_as("SELECT title, status FROM mangas WHERE id = $1") .bind(first.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(row.0, "Foo Manga (Revised)"); assert_eq!(row.1, "completed"); } #[sqlx::test(migrations = "./migrations")] async fn sync_chapters_adds_new_refreshes_existing_and_drops_vanished(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let m = sample_manga("foo", "Foo Manga", "hash-1"); let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); let initial = vec![ SourceChapterRef { source_chapter_key: "1".into(), number: 1, title: Some("Ch.1".into()), url: "https://x.example/foo/1".into(), }, SourceChapterRef { source_chapter_key: "2".into(), number: 2, title: Some("Ch.2".into()), url: "https://x.example/foo/2".into(), }, ]; let diff = crawler::sync_manga_chapters(&pool, "target", up.manga_id, &initial) .await .unwrap(); assert_eq!( diff, ChapterDiff { new: 2, refreshed: 0, dropped: 0 } ); // Second run: keep ch1, replace ch2 with ch3 — ch2 should be dropped. let second = vec![ SourceChapterRef { source_chapter_key: "1".into(), number: 1, title: Some("Ch.1 (renamed)".into()), url: "https://x.example/foo/1".into(), }, SourceChapterRef { source_chapter_key: "3".into(), number: 3, title: Some("Ch.3".into()), url: "https://x.example/foo/3".into(), }, ]; let diff = crawler::sync_manga_chapters(&pool, "target", up.manga_id, &second) .await .unwrap(); assert_eq!( diff, ChapterDiff { new: 1, refreshed: 1, dropped: 1 } ); // Renamed title propagated to chapters.title let title: (Option,) = sqlx::query_as("SELECT c.title FROM chapters c JOIN chapter_sources cs ON cs.chapter_id = c.id WHERE cs.source_chapter_key = '1'") .fetch_one(&pool) .await .unwrap(); assert_eq!(title.0.as_deref(), Some("Ch.1 (renamed)")); // Vanished chapter is soft-dropped (row still exists, dropped_at set). let dropped: (Option>,) = sqlx::query_as("SELECT dropped_at FROM chapter_sources WHERE source_chapter_key = '2'") .fetch_one(&pool) .await .unwrap(); assert!(dropped.0.is_some(), "ch2 should be soft-dropped"); } #[sqlx::test(migrations = "./migrations")] async fn live_chapter_count_returns_zero_for_unknown_source_key(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); // No manga_sources row yet → unknown key path. Must not error and // must report zero so the partial-render guard accepts the // "brand-new manga with no chapters" case as legitimate. let n = crawler::live_chapter_count_for_source_manga(&pool, "target", "nobody") .await .unwrap(); assert_eq!(n, 0); } #[sqlx::test(migrations = "./migrations")] async fn live_chapter_count_only_counts_live_sources(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let m = sample_manga("foo", "Foo Manga", "hash-1"); let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); let chapters = vec![ SourceChapterRef { source_chapter_key: "1".into(), number: 1, title: Some("Ch.1".into()), url: "https://x.example/foo/1".into(), }, SourceChapterRef { source_chapter_key: "2".into(), number: 2, title: Some("Ch.2".into()), url: "https://x.example/foo/2".into(), }, ]; crawler::sync_manga_chapters(&pool, "target", up.manga_id, &chapters) .await .unwrap(); assert_eq!( crawler::live_chapter_count_for_source_manga(&pool, "target", "foo") .await .unwrap(), 2 ); // Soft-drop one source row — count drops by one, the row stays. sqlx::query( "UPDATE chapter_sources SET dropped_at = NOW() WHERE source_chapter_key = '2'", ) .execute(&pool) .await .unwrap(); assert_eq!( crawler::live_chapter_count_for_source_manga(&pool, "target", "foo") .await .unwrap(), 1 ); } /// Real-world sources publish multiple chapters at the same number /// (different uploaders, translator notes, re-releases). After the /// (manga_id, number) UNIQUE drop in 0013, each `SourceChapterRef` /// becomes its own `chapters` row even when the parsed number matches /// — chapter identity is now the chapter id, not the number. #[sqlx::test(migrations = "./migrations")] async fn sync_chapters_keeps_duplicate_numbered_chapters_as_separate_rows(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let m = sample_manga("foo", "Foo Manga", "hash-1"); let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); // Two distinct uploads of Ch.52 (different uploaders → different // URLs/keys, same parsed number) plus a notice/hiatus row that // parses to number=0 alongside a real chapter at number 1. let chapters = vec![ SourceChapterRef { source_chapter_key: "br_chapter-A".into(), number: 52, title: Some("Ch.52 : Official".into()), url: "https://x.example/foo/A/pg-1/".into(), }, SourceChapterRef { source_chapter_key: "br_chapter-B".into(), number: 52, title: Some("Ch.52 : Official (alt)".into()), url: "https://x.example/foo/B/pg-1/".into(), }, SourceChapterRef { source_chapter_key: "br_chapter-NOTICE".into(), number: 0, title: Some("hitaus.".into()), url: "https://x.example/foo/notice/pg-1/".into(), }, SourceChapterRef { source_chapter_key: "br_chapter-1".into(), number: 1, title: Some("Ch.1 : Official".into()), url: "https://x.example/foo/1/pg-1/".into(), }, ]; let diff = crawler::sync_manga_chapters(&pool, "target", up.manga_id, &chapters) .await .unwrap(); assert_eq!( diff, ChapterDiff { new: 4, refreshed: 0, dropped: 0 }, "every source ref yields a new chapter row" ); let rows: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM chapters WHERE manga_id = $1") .bind(up.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(rows.0, 4, "4 distinct chapter rows even with duplicate numbers"); let ch52_count: (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM chapters WHERE manga_id = $1 AND number = 52", ) .bind(up.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(ch52_count.0, 2, "both Ch.52 uploads survive as separate rows"); } #[sqlx::test(migrations = "./migrations")] async fn sync_chapters_isolates_colliding_keys_across_mangas(pool: PgPool) { // Two mangas, both with a chapter whose source_chapter_key is // "chapter-1". Pre-migration-0017 the PK enforced (source_id, // source_chapter_key) globally and the lookup didn't filter by // manga_id, so the second manga's sync would adopt the first manga's // chapter_id (silent attribution corruption). After 0017 each manga // owns its own row. crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let m1 = sample_manga("foo", "Manga Foo", "hash-foo"); let m2 = sample_manga("bar", "Manga Bar", "hash-bar"); let up1 = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m1) .await .unwrap(); let up2 = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/bar", &m2) .await .unwrap(); assert_ne!(up1.manga_id, up2.manga_id); let shared = vec![SourceChapterRef { source_chapter_key: "chapter-1".into(), number: 1, title: Some("Ch.1".into()), url: "https://x.example/foo/chapter-1/".into(), }]; let diff1 = crawler::sync_manga_chapters(&pool, "target", up1.manga_id, &shared) .await .unwrap(); assert_eq!(diff1.new, 1, "manga foo: chapter inserted fresh"); // Manga bar now syncs *the same key*. Under the old schema this would // either fail on PK conflict or attribute the chapter to foo. Under // the new schema bar gets its own chapter row. let bar_chapters = vec![SourceChapterRef { source_chapter_key: "chapter-1".into(), number: 1, title: Some("Ch.1 (bar)".into()), url: "https://x.example/bar/chapter-1/".into(), }]; let diff2 = crawler::sync_manga_chapters(&pool, "target", up2.manga_id, &bar_chapters) .await .unwrap(); assert_eq!( diff2.new, 1, "manga bar: same key resolved per-manga to a fresh row" ); let foo_count: (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM chapters WHERE manga_id = $1", ) .bind(up1.manga_id) .fetch_one(&pool) .await .unwrap(); let bar_count: (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM chapters WHERE manga_id = $1", ) .bind(up2.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(foo_count.0, 1); assert_eq!(bar_count.0, 1); let bar_title: (Option,) = sqlx::query_as( "SELECT title FROM chapters WHERE manga_id = $1 AND number = 1", ) .bind(up2.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!( bar_title.0.as_deref(), Some("Ch.1 (bar)"), "bar's chapter has bar's title, not foo's" ); // A subsequent re-sync of foo with the same key correctly refreshes // foo's row, not bar's. let foo_resync = vec![SourceChapterRef { source_chapter_key: "chapter-1".into(), number: 1, title: Some("Ch.1 (foo updated)".into()), url: "https://x.example/foo/chapter-1/".into(), }]; let diff_refresh = crawler::sync_manga_chapters(&pool, "target", up1.manga_id, &foo_resync) .await .unwrap(); assert_eq!(diff_refresh.refreshed, 1); assert_eq!(diff_refresh.new, 0); let foo_title: (Option,) = sqlx::query_as( "SELECT title FROM chapters WHERE manga_id = $1 AND number = 1", ) .bind(up1.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(foo_title.0.as_deref(), Some("Ch.1 (foo updated)")); let bar_title_after: (Option,) = sqlx::query_as( "SELECT title FROM chapters WHERE manga_id = $1 AND number = 1", ) .bind(up2.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!( bar_title_after.0.as_deref(), Some("Ch.1 (bar)"), "bar's row is untouched by foo's refresh" ); } #[sqlx::test(migrations = "./migrations")] async fn sync_chapters_serializes_concurrent_calls_for_same_manga(pool: PgPool) { // Without the per-manga advisory lock, two concurrent calls would // both read `seen_keys`, both run the drop UPDATE filtered on `NOT // (key = ANY $3)`, and the later commit could soft-drop a chapter // the earlier had just inserted. The lock makes the calls strictly // sequential per-manga: whichever runs second sees the first one's // committed chapters and treats their absence as a "dropped" signal // only if the second list legitimately omits them. // // Concretely: pre-state [A]. Call X syncs [A, B]; call Y syncs // [A, B, C]. Whatever the schedule, the final state must include // *all three* chapters because neither call legitimately omits the // other's contribution — both lists are supersets of each other's // pre-existing rows. crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let m = sample_manga("foo", "Foo Manga", "hash-1"); let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); let manga_id = up.manga_id; // Pre-state: [A]. let pre = vec![SourceChapterRef { source_chapter_key: "A".into(), number: 1, title: Some("Ch.A".into()), url: "https://x.example/foo/A".into(), }]; crawler::sync_manga_chapters(&pool, "target", manga_id, &pre) .await .unwrap(); // Two concurrent calls. Call X adds B; call Y adds B + C. Both keep // A. Their drop branches would otherwise race against each other. let list_x = vec![ SourceChapterRef { source_chapter_key: "A".into(), number: 1, title: Some("Ch.A".into()), url: "https://x.example/foo/A".into(), }, SourceChapterRef { source_chapter_key: "B".into(), number: 2, title: Some("Ch.B".into()), url: "https://x.example/foo/B".into(), }, ]; let list_y = vec![ SourceChapterRef { source_chapter_key: "A".into(), number: 1, title: Some("Ch.A".into()), url: "https://x.example/foo/A".into(), }, SourceChapterRef { source_chapter_key: "B".into(), number: 2, title: Some("Ch.B".into()), url: "https://x.example/foo/B".into(), }, SourceChapterRef { source_chapter_key: "C".into(), number: 3, title: Some("Ch.C".into()), url: "https://x.example/foo/C".into(), }, ]; let pool_x = pool.clone(); let pool_y = pool.clone(); let (rx, ry) = tokio::join!( tokio::spawn(async move { crawler::sync_manga_chapters(&pool_x, "target", manga_id, &list_x).await }), tokio::spawn(async move { crawler::sync_manga_chapters(&pool_y, "target", manga_id, &list_y).await }), ); rx.unwrap().expect("call X"); ry.unwrap().expect("call Y"); // All three keys must survive with dropped_at NULL — the lock // ensures the later call sees the earlier one's INSERTs and the // drop UPDATE finds nothing to drop. let alive: Vec = sqlx::query_scalar( "SELECT cs.source_chapter_key \ FROM chapter_sources cs \ JOIN chapters ch ON ch.id = cs.chapter_id \ WHERE ch.manga_id = $1 AND cs.dropped_at IS NULL \ ORDER BY cs.source_chapter_key", ) .bind(manga_id) .fetch_all(&pool) .await .unwrap(); assert_eq!( alive, vec!["A".to_string(), "B".to_string(), "C".to_string()], "all chapters survive concurrent syncs that both contain them" ); } #[sqlx::test(migrations = "./migrations")] async fn upsert_surfaces_cover_image_path_for_backfill_decisions(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let m = sample_manga("foo", "Foo", "h1"); // First upsert: row is brand new, no cover stored yet. let first = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); assert!(first.cover_image_path.is_none(), "new manga has no cover yet"); // Simulate cover landing in storage post-upsert. sqlx::query("UPDATE mangas SET cover_image_path = $1 WHERE id = $2") .bind("mangas/foo/cover.jpg") .bind(first.manga_id) .execute(&pool) .await .unwrap(); // Second upsert with same hash → Unchanged, but cover path is now // surfaced so the caller knows the backfill is done. let second = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); assert_eq!(second.status, UpsertStatus::Unchanged); assert_eq!( second.cover_image_path.as_deref(), Some("mangas/foo/cover.jpg") ); } #[sqlx::test(migrations = "./migrations")] async fn arbitrary_genres_from_source_get_inserted(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let mut m = sample_manga("foo", "Foo", "h"); // "Action" is seeded by migration 0009. "Webtoons" is not. m.genres = vec!["Action".into(), "Webtoons".into()]; let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); let n_genre_links: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM manga_genres WHERE manga_id = $1") .bind(up.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(n_genre_links.0, 2, "both seeded and source-added genres attach"); let webtoons: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM genres WHERE name = 'Webtoons'") .fetch_one(&pool) .await .unwrap(); assert_eq!(webtoons.0, 1, "non-seeded genre was inserted"); // Case-insensitive de-dup: a second sync with the genre re-cased // attaches the existing row, not a new one. let mut m2 = sample_manga("bar", "Bar", "h2"); m2.genres = vec!["webtoons".into()]; let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/bar", &m2) .await .unwrap(); let webtoons_count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM genres WHERE lower(name) = 'webtoons'") .fetch_one(&pool) .await .unwrap(); assert_eq!(webtoons_count.0, 1, "case-insensitive lookup reuses the existing row"); } /// User-attached tags (rows with non-NULL `added_by` in `manga_tags`) /// must survive a crawler upsert. The crawler owns source-attached tags /// (added_by IS NULL); user attachments are owned by the user who made /// them and the recurring metadata pass must not delete them. #[sqlx::test(migrations = "./migrations")] async fn sync_tags_preserves_user_attached_tags(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let m = sample_manga("foo", "Foo Manga", "hash-1"); let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); // A real user attaches a personal tag. let user = mangalord::repo::user::create(&pool, "alice", "phc-stub") .await .unwrap(); let outcome = mangalord::repo::tag::attach_to_manga(&pool, up.manga_id, "personal", user.id) .await .unwrap(); assert!(outcome.created_attachment); // Second crawler pass. Use a different metadata_hash so the upsert // takes the Updated branch, but the bug also fires on Unchanged // ticks since sync_tags runs unconditionally. let mut m2 = m.clone(); m2.metadata_hash = "hash-2".into(); m2.tags = vec!["popular".into(), "weekly".into()]; let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m2) .await .unwrap(); // The user tag must still be attached. let user_tag_rows: (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM manga_tags mt \ JOIN tags t ON t.id = mt.tag_id \ WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \ AND mt.added_by = $2", ) .bind(up.manga_id) .bind(user.id) .fetch_one(&pool) .await .unwrap(); assert_eq!( user_tag_rows.0, 1, "user-attached tag must survive a crawler upsert" ); // The source's tags should still attach as well, as crawler-owned. let source_tag_rows: (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM manga_tags mt \ JOIN tags t ON t.id = mt.tag_id \ WHERE mt.manga_id = $1 \ AND mt.added_by IS NULL \ AND lower(t.name) IN ('popular', 'weekly')", ) .bind(up.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(source_tag_rows.0, 2, "source tags re-attach on each pass"); // A subsequent pass where the source drops a previously-seen tag // must clear that crawler-owned attachment (otherwise crawler-tags // would only ever accumulate). let mut m3 = m2.clone(); m3.metadata_hash = "hash-3".into(); m3.tags = vec!["popular".into()]; let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m3) .await .unwrap(); let weekly_rows: (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM manga_tags mt \ JOIN tags t ON t.id = mt.tag_id \ WHERE mt.manga_id = $1 AND lower(t.name) = 'weekly'", ) .bind(up.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(weekly_rows.0, 0, "source-owned tag dropped by source goes away"); // And the user tag still survives that third pass. let user_tag_rows: (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM manga_tags mt \ JOIN tags t ON t.id = mt.tag_id \ WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \ AND mt.added_by = $2", ) .bind(up.manga_id) .bind(user.id) .fetch_one(&pool) .await .unwrap(); assert_eq!(user_tag_rows.0, 1); } /// `manga_tags.added_by` is `ON DELETE SET NULL` on the user FK. When /// the attaching user is deleted, their attachments become orphans /// indistinguishable from crawler-owned rows — and the crawler should /// reap them on the next pass. Pins the semantic so a future change /// can't quietly leave orphan rows lying around. #[sqlx::test(migrations = "./migrations")] async fn sync_tags_garbage_collects_orphan_user_attachments(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let m = sample_manga("foo", "Foo", "hash-1"); let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); // A user attaches "personal", then the user gets deleted. The // attachment row stays (manga_tags.manga_id FK is CASCADE on // mangas only; we never CASCADE-delete user attachments). The FK // on added_by is `ON DELETE SET NULL`, so the row's owner column // goes NULL — same shape as a crawler-owned row. let user = mangalord::repo::user::create(&pool, "bob", "phc-stub") .await .unwrap(); let _ = mangalord::repo::tag::attach_to_manga(&pool, up.manga_id, "personal", user.id) .await .unwrap(); sqlx::query("DELETE FROM users WHERE id = $1") .bind(user.id) .execute(&pool) .await .unwrap(); // Sanity: the orphan still exists post-user-delete with added_by NULL. let (orphan_rows,): (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM manga_tags mt \ JOIN tags t ON t.id = mt.tag_id \ WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \ AND mt.added_by IS NULL", ) .bind(up.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(orphan_rows, 1); // Next crawler pass — orphan should be reaped along with any // other source-owned rows that aren't in the new tag list. let mut m2 = m.clone(); m2.metadata_hash = "hash-2".into(); m2.tags = vec!["popular".into()]; let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m2) .await .unwrap(); let (orphan_rows,): (i64,) = sqlx::query_as( "SELECT COUNT(*) FROM manga_tags mt \ JOIN tags t ON t.id = mt.tag_id \ WHERE mt.manga_id = $1 AND lower(t.name) = 'personal'", ) .bind(up.manga_id) .fetch_one(&pool) .await .unwrap(); assert_eq!(orphan_rows, 0, "orphan user-attached tag should be reaped"); } // ---- list_missing_covers --------------------------------------------------- #[sqlx::test(migrations = "./migrations")] async fn list_missing_covers_only_returns_rows_without_cover(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let with_cover = sample_manga("with", "With Cover", "h1"); let without_cover = sample_manga("without", "No Cover", "h2"); let _w = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/with", &with_cover) .await .unwrap(); let nc = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/without", &without_cover) .await .unwrap(); // Manually set a cover for `with` only. sqlx::query("UPDATE mangas SET cover_image_path = 'mangas/x/cover.jpg' WHERE id = $1") .bind(_w.manga_id) .execute(&pool) .await .unwrap(); let entries = crawler::list_missing_covers(&pool, 50).await.unwrap(); assert_eq!(entries.len(), 1, "exactly the manga without a cover"); assert_eq!(entries[0].manga_id, nc.manga_id); assert_eq!(entries[0].source_manga_key, "without"); assert_eq!(entries[0].source_url, "https://x.example/without"); } #[sqlx::test(migrations = "./migrations")] async fn list_missing_covers_skips_dropped_source_rows(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let m = sample_manga("foo", "Foo", "h1"); let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); sqlx::query("UPDATE manga_sources SET dropped_at = NOW() WHERE manga_id = $1") .bind(up.manga_id) .execute(&pool) .await .unwrap(); let entries = crawler::list_missing_covers(&pool, 50).await.unwrap(); assert!( entries.is_empty(), "dropped-source mangas must not be backfilled — no live source to fetch from" ); } #[sqlx::test(migrations = "./migrations")] async fn list_missing_covers_respects_limit(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); for i in 0..5 { let key = format!("m{i}"); let url = format!("https://x.example/{key}"); let m = sample_manga(&key, &format!("M{i}"), &format!("h{i}")); let _ = crawler::upsert_manga_from_source(&pool, "target", &url, &m) .await .unwrap(); } let entries = crawler::list_missing_covers(&pool, 3).await.unwrap(); assert_eq!(entries.len(), 3, "limit caps the result set"); } #[sqlx::test(migrations = "./migrations")] async fn list_missing_covers_deduplicates_per_manga(pool: PgPool) { // A manga surfaced by two sources should produce ONE backfill // entry, not two — otherwise the per-tick cap could be eaten by // duplicates and starve other mangas. crawler::ensure_source(&pool, "src-a", "A", "https://a.example") .await .unwrap(); crawler::ensure_source(&pool, "src-b", "B", "https://b.example") .await .unwrap(); let m = sample_manga("foo", "Foo", "h1"); let up = crawler::upsert_manga_from_source(&pool, "src-a", "https://a.example/foo", &m) .await .unwrap(); // Second source attaches to the SAME manga row. sqlx::query( "INSERT INTO manga_sources (source_id, source_manga_key, manga_id, source_url) \ VALUES ($1, $2, $3, $4)", ) .bind("src-b") .bind("foo-on-b") .bind(up.manga_id) .bind("https://b.example/foo") .execute(&pool) .await .unwrap(); let entries = crawler::list_missing_covers(&pool, 50).await.unwrap(); assert_eq!(entries.len(), 1, "DISTINCT ON (m.id) collapses duplicate source rows"); } #[sqlx::test(migrations = "./migrations")] async fn re_appearing_manga_clears_dropped_at(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") .await .unwrap(); let m = sample_manga("foo", "Foo", "h1"); let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); // Drop it manually. sqlx::query( "UPDATE manga_sources SET dropped_at = NOW() WHERE source_manga_key = 'foo'", ) .execute(&pool) .await .unwrap(); // Re-upsert: the link should un-drop. let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) .await .unwrap(); let dropped: (Option>, Uuid) = sqlx::query_as( "SELECT dropped_at, manga_id FROM manga_sources WHERE source_manga_key = 'foo'", ) .fetch_one(&pool) .await .unwrap(); assert!(dropped.0.is_none()); assert_eq!(dropped.1, up.manga_id); }