Mangalord/backend/tests/crawler_sync.rs

//! Integration tests for `repo::crawler`.
//!
//! Each test runs against a fresh, migrated DB via `#[sqlx::test]`.
//! `DATABASE_URL` must point to a Postgres where the test user can
//! `CREATEDB`.

use mangalord::crawler::source::{SourceChapterRef, SourceManga};
use mangalord::repo::crawler::{self, ChapterDiff, UpsertStatus};
use mangalord::repo::chapter as chapter_repo;
use sqlx::PgPool;
use uuid::Uuid;

/// Builds a `SourceManga` fixture whose metadata is fixed except for the
/// three values passed in.
///
/// Tests that need any other field to differ override it on the returned
/// value, so every fixture starts from the same known shape.
fn sample_manga(key: &str, title: &str, hash: &str) -> SourceManga {
    SourceManga {
        // Caller-controlled identity fields.
        source_manga_key: key.to_owned(),
        title: title.to_owned(),
        metadata_hash: hash.to_owned(),
        // Fixed descriptive metadata.
        status: Some("ongoing".into()),
        summary: Some("Sample summary.".into()),
        cover_url: Some("/cover.jpg".into()),
        alternative_titles: vec!["Alt 1".into()],
        authors: vec!["Author One".into()],
        // Both "Action" and "Fantasy" are present in the seeded `genres` table.
        genres: vec!["Action".into(), "Fantasy".into()],
        tags: vec!["popular".into()],
        chapters: Vec::new(),
    }
}

/// Calling `ensure_source` twice for the same id keeps a single row and
/// leaves the most recently supplied name in place.
#[sqlx::test(migrations = "./migrations")]
async fn ensure_source_is_idempotent(pool: PgPool) {
    // Same id and base URL both times; only the display name changes.
    for display_name in ["Target Site", "Target Site v2"] {
        crawler::ensure_source(&pool, "target", display_name, "https://x.example")
            .await
            .unwrap();
    }

    let (row_count,): (i64,) = sqlx::query_as("SELECT COUNT(*) FROM sources WHERE id = 'target'")
        .fetch_one(&pool)
        .await
        .unwrap();
    assert_eq!(row_count, 1);

    let (stored_name,): (String,) = sqlx::query_as("SELECT name FROM sources WHERE id = 'target'")
        .fetch_one(&pool)
        .await
        .unwrap();
    assert_eq!(stored_name, "Target Site v2", "name updates on re-call");
}

/// The first upsert of an unseen source manga reports `New`, creates the
/// `mangas` row, links it through `manga_sources`, and fills the author,
/// genre, and tag join tables.
#[sqlx::test(migrations = "./migrations")]
async fn first_upsert_inserts_manga_and_links_metadata(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();

    let manga = sample_manga("foo", "Foo Manga", "hash-1");
    let outcome =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &manga)
            .await
            .unwrap();
    assert_eq!(outcome.status, UpsertStatus::New);
    let manga_id = outcome.manga_id;

    // The `mangas` row carries the fixture's title, status, and alt titles.
    let (title, status, alt_titles): (String, String, Vec<String>) =
        sqlx::query_as("SELECT title, status, alt_titles FROM mangas WHERE id = $1")
            .bind(manga_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(title, "Foo Manga");
    assert_eq!(status, "ongoing");
    assert_eq!(alt_titles, vec!["Alt 1"]);

    // The `manga_sources` row ties the source key to the new manga and
    // records the metadata hash.
    let (source_id, linked_manga_id, stored_hash): (String, Uuid, Option<String>) = sqlx::query_as(
        "SELECT source_id, manga_id, metadata_hash FROM manga_sources WHERE source_manga_key = $1",
    )
    .bind("foo")
    .fetch_one(&pool)
    .await
    .unwrap();
    assert_eq!(source_id, "target");
    assert_eq!(linked_manga_id, manga_id);
    assert_eq!(stored_hash.as_deref(), Some("hash-1"));

    // Many-to-many tables: one author, two genres, one tag.
    let (author_links,): (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM manga_authors WHERE manga_id = $1")
            .bind(manga_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(author_links, 1);

    let (genre_links,): (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM manga_genres WHERE manga_id = $1")
            .bind(manga_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(genre_links, 2, "Action + Fantasy");

    let (tag_links,): (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM manga_tags WHERE manga_id = $1")
            .bind(manga_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(tag_links, 1);
}

/// Upserting the identical fixture twice reports `Unchanged` the second
/// time and resolves to the same manga id.
#[sqlx::test(migrations = "./migrations")]
async fn second_upsert_with_same_hash_reports_unchanged(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();

    let manga = sample_manga("foo", "Foo Manga", "hash-1");
    let source_url = "https://x.example/foo";

    let initial = crawler::upsert_manga_from_source(&pool, "target", source_url, &manga)
        .await
        .unwrap();
    let repeat = crawler::upsert_manga_from_source(&pool, "target", source_url, &manga)
        .await
        .unwrap();

    assert_eq!(repeat.status, UpsertStatus::Unchanged);
    assert_eq!(repeat.manga_id, initial.manga_id);
}

#[sqlx::test(migrations = "./migrations")]
async fn upsert_with_changed_hash_updates_fields(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let mut m = sample_manga("foo", "Foo Manga", "hash-1");
    let first = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
        .await
        .unwrap();

    m.title = "Foo Manga (Revised)".into();
    m.status = Some("completed".into());
    m.metadata_hash = "hash-2".into();
    let second = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
        .await
        .unwrap();

    assert_eq!(second.status, UpsertStatus::Updated);
    assert_eq!(second.manga_id, first.manga_id);

    let row: (String, String) =
        sqlx::query_as("SELECT title, status FROM mangas WHERE id = $1")
            .bind(first.manga_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(row.0, "Foo Manga (Revised)");
    assert_eq!(row.1, "completed");
}

/// Two consecutive chapter syncs: the first inserts two chapters; the
/// second keeps one (with a new title), adds one, and omits one. Checks
/// the reported `ChapterDiff` each time, the propagated title, and that
/// the omitted chapter's source row is kept with `dropped_at` set.
#[sqlx::test(migrations = "./migrations")]
async fn sync_chapters_adds_new_refreshes_existing_and_drops_vanished(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let manga = sample_manga("foo", "Foo Manga", "hash-1");
    let upserted =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &manga)
            .await
            .unwrap();

    // Local fixture builder so each list below reads as one line per chapter.
    let chapter = |key: &str, number, title: &str, url: &str| SourceChapterRef {
        source_chapter_key: key.into(),
        number,
        title: Some(title.into()),
        url: url.into(),
    };

    // First run: chapters 1 and 2 are both new.
    let first_listing = vec![
        chapter("1", 1, "Ch.1", "https://x.example/foo/1"),
        chapter("2", 2, "Ch.2", "https://x.example/foo/2"),
    ];
    let first_diff =
        crawler::sync_manga_chapters(&pool, "target", upserted.manga_id, &first_listing)
            .await
            .unwrap();
    assert_eq!(
        first_diff,
        ChapterDiff {
            new: 2,
            refreshed: 0,
            dropped: 0
        }
    );

    // Second run: ch1 stays (renamed), ch3 appears, ch2 is gone from the
    // listing and should therefore be dropped.
    let second_listing = vec![
        chapter("1", 1, "Ch.1 (renamed)", "https://x.example/foo/1"),
        chapter("3", 3, "Ch.3", "https://x.example/foo/3"),
    ];
    let second_diff =
        crawler::sync_manga_chapters(&pool, "target", upserted.manga_id, &second_listing)
            .await
            .unwrap();
    assert_eq!(
        second_diff,
        ChapterDiff {
            new: 1,
            refreshed: 1,
            dropped: 1
        }
    );

    // The refreshed chapter's new title reached `chapters.title`.
    let (ch1_title,): (Option<String>,) =
        sqlx::query_as("SELECT c.title FROM chapters c JOIN chapter_sources cs ON cs.chapter_id = c.id WHERE cs.source_chapter_key = '1'")
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(ch1_title.as_deref(), Some("Ch.1 (renamed)"));

    // The vanished chapter is soft-dropped: its source row is still
    // there, with `dropped_at` populated.
    let (ch2_dropped_at,): (Option<chrono::DateTime<chrono::Utc>>,) =
        sqlx::query_as("SELECT dropped_at FROM chapter_sources WHERE source_chapter_key = '2'")
            .fetch_one(&pool)
            .await
            .unwrap();
    assert!(ch2_dropped_at.is_some(), "ch2 should be soft-dropped");
}

/// Asking for the live chapter count of a source key that has no
/// `manga_sources` row succeeds and yields zero.
#[sqlx::test(migrations = "./migrations")]
async fn live_chapter_count_returns_zero_for_unknown_source_key(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();

    // "nobody" was never upserted, so this exercises the unknown-key
    // path. It has to come back as Ok(0) rather than an error so that the
    // partial-render guard treats a brand-new manga with no chapters as
    // a legitimate case.
    let live = crawler::live_chapter_count_for_source_manga(&pool, "target", "nobody")
        .await
        .unwrap();
    assert_eq!(live, 0);
}

/// The live chapter count excludes `chapter_sources` rows whose
/// `dropped_at` is set: two synced chapters count as 2, and marking one
/// dropped brings the count to 1.
#[sqlx::test(migrations = "./migrations")]
async fn live_chapter_count_only_counts_live_sources(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let manga = sample_manga("foo", "Foo Manga", "hash-1");
    let upserted =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &manga)
            .await
            .unwrap();

    let chapter = |key: &str, number, title: &str, url: &str| SourceChapterRef {
        source_chapter_key: key.into(),
        number,
        title: Some(title.into()),
        url: url.into(),
    };
    let listing = vec![
        chapter("1", 1, "Ch.1", "https://x.example/foo/1"),
        chapter("2", 2, "Ch.2", "https://x.example/foo/2"),
    ];
    crawler::sync_manga_chapters(&pool, "target", upserted.manga_id, &listing)
        .await
        .unwrap();

    let live_before = crawler::live_chapter_count_for_source_manga(&pool, "target", "foo")
        .await
        .unwrap();
    assert_eq!(live_before, 2);

    // Mark one source row as dropped directly in the table; the row
    // itself is kept, only the live count should shrink.
    sqlx::query("UPDATE chapter_sources SET dropped_at = NOW() WHERE source_chapter_key = '2'")
        .execute(&pool)
        .await
        .unwrap();

    let live_after = crawler::live_chapter_count_for_source_manga(&pool, "target", "foo")
        .await
        .unwrap();
    assert_eq!(live_after, 1);
}

/// Sources in the wild list several chapters under one number
/// (different uploaders, translator notes, re-releases). Since the
/// (manga_id, number) UNIQUE constraint was dropped in 0013, every
/// `SourceChapterRef` gets its own `chapters` row even when the parsed
/// number collides — a chapter is identified by its id, not its number.
#[sqlx::test(migrations = "./migrations")]
async fn sync_chapters_keeps_duplicate_numbered_chapters_as_separate_rows(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let manga = sample_manga("foo", "Foo Manga", "hash-1");
    let upserted =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &manga)
            .await
            .unwrap();
    let manga_id = upserted.manga_id;

    let chapter = |key: &str, number, title: &str, url: &str| SourceChapterRef {
        source_chapter_key: key.into(),
        number,
        title: Some(title.into()),
        url: url.into(),
    };

    // Listing under test:
    //   * two uploads of Ch.52 — distinct keys and URLs, same number;
    //   * a notice/hiatus entry that parses to number 0;
    //   * an ordinary chapter at number 1.
    let listing = vec![
        chapter(
            "br_chapter-A",
            52,
            "Ch.52 : Official",
            "https://x.example/foo/A/pg-1/",
        ),
        chapter(
            "br_chapter-B",
            52,
            "Ch.52 : Official (alt)",
            "https://x.example/foo/B/pg-1/",
        ),
        chapter(
            "br_chapter-NOTICE",
            0,
            "hitaus.",
            "https://x.example/foo/notice/pg-1/",
        ),
        chapter(
            "br_chapter-1",
            1,
            "Ch.1 : Official",
            "https://x.example/foo/1/pg-1/",
        ),
    ];

    let diff = crawler::sync_manga_chapters(&pool, "target", manga_id, &listing)
        .await
        .unwrap();
    assert_eq!(
        diff,
        ChapterDiff {
            new: 4,
            refreshed: 0,
            dropped: 0
        },
        "every source ref yields a new chapter row"
    );

    let (total_rows,): (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM chapters WHERE manga_id = $1")
            .bind(manga_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(total_rows, 4, "4 distinct chapter rows even with duplicate numbers");

    let (ch52_rows,): (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM chapters WHERE manga_id = $1 AND number = 52")
            .bind(manga_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(ch52_rows, 2, "both Ch.52 uploads survive as separate rows");
}

/// Two mangas on the same source each list a chapter keyed "chapter-1".
/// Each manga must end up with its own chapter row, and refreshing one
/// manga's chapter must leave the other's untouched.
#[sqlx::test(migrations = "./migrations")]
async fn sync_chapters_isolates_colliding_keys_across_mangas(pool: PgPool) {
    // Background: before migration 0017 the PK enforced (source_id,
    // source_chapter_key) globally and the lookup did not filter by
    // manga_id, so the second manga's sync would adopt the first manga's
    // chapter_id (silent attribution corruption). After 0017 each manga
    // owns its own row.
    const COUNT_CHAPTERS_SQL: &str = "SELECT COUNT(*) FROM chapters WHERE manga_id = $1";
    const CHAPTER_ONE_TITLE_SQL: &str =
        "SELECT title FROM chapters WHERE manga_id = $1 AND number = 1";

    // Every listing in this test is a single chapter with the colliding key.
    let chapter_one = |title: &str, url: &str| {
        vec![SourceChapterRef {
            source_chapter_key: "chapter-1".into(),
            number: 1,
            title: Some(title.into()),
            url: url.into(),
        }]
    };

    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let foo_manga = sample_manga("foo", "Manga Foo", "hash-foo");
    let bar_manga = sample_manga("bar", "Manga Bar", "hash-bar");
    let foo =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &foo_manga)
            .await
            .unwrap();
    let bar =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/bar", &bar_manga)
            .await
            .unwrap();
    assert_ne!(foo.manga_id, bar.manga_id);

    let foo_listing = chapter_one("Ch.1", "https://x.example/foo/chapter-1/");
    let foo_diff = crawler::sync_manga_chapters(&pool, "target", foo.manga_id, &foo_listing)
        .await
        .unwrap();
    assert_eq!(foo_diff.new, 1, "manga foo: chapter inserted fresh");

    // Bar now syncs the very same key. The old schema would either hit a
    // PK conflict or attribute the chapter to foo; the new schema gives
    // bar a chapter row of its own.
    let bar_listing = chapter_one("Ch.1 (bar)", "https://x.example/bar/chapter-1/");
    let bar_diff = crawler::sync_manga_chapters(&pool, "target", bar.manga_id, &bar_listing)
        .await
        .unwrap();
    assert_eq!(
        bar_diff.new, 1,
        "manga bar: same key resolved per-manga to a fresh row"
    );

    let (foo_rows,): (i64,) = sqlx::query_as(COUNT_CHAPTERS_SQL)
        .bind(foo.manga_id)
        .fetch_one(&pool)
        .await
        .unwrap();
    let (bar_rows,): (i64,) = sqlx::query_as(COUNT_CHAPTERS_SQL)
        .bind(bar.manga_id)
        .fetch_one(&pool)
        .await
        .unwrap();
    assert_eq!(foo_rows, 1);
    assert_eq!(bar_rows, 1);

    let (bar_title,): (Option<String>,) = sqlx::query_as(CHAPTER_ONE_TITLE_SQL)
        .bind(bar.manga_id)
        .fetch_one(&pool)
        .await
        .unwrap();
    assert_eq!(
        bar_title.as_deref(),
        Some("Ch.1 (bar)"),
        "bar's chapter has bar's title, not foo's"
    );

    // Re-syncing foo with the same key must refresh foo's row and leave
    // bar's alone.
    let foo_relisting = chapter_one("Ch.1 (foo updated)", "https://x.example/foo/chapter-1/");
    let refresh_diff = crawler::sync_manga_chapters(&pool, "target", foo.manga_id, &foo_relisting)
        .await
        .unwrap();
    assert_eq!(refresh_diff.refreshed, 1);
    assert_eq!(refresh_diff.new, 0);

    let (foo_title,): (Option<String>,) = sqlx::query_as(CHAPTER_ONE_TITLE_SQL)
        .bind(foo.manga_id)
        .fetch_one(&pool)
        .await
        .unwrap();
    assert_eq!(foo_title.as_deref(), Some("Ch.1 (foo updated)"));

    let (bar_title_after,): (Option<String>,) = sqlx::query_as(CHAPTER_ONE_TITLE_SQL)
        .bind(bar.manga_id)
        .fetch_one(&pool)
        .await
        .unwrap();
    assert_eq!(
        bar_title_after.as_deref(),
        Some("Ch.1 (bar)"),
        "bar's row is untouched by foo's refresh"
    );
}

/// Runs two `sync_manga_chapters` calls for the same manga concurrently
/// (each in its own spawned tokio task) and checks that chapters A, B
/// and C all end up with `dropped_at IS NULL`.
#[sqlx::test(migrations = "./migrations")]
async fn sync_chapters_serializes_concurrent_calls_for_same_manga(pool: PgPool) {
    // Without the per-manga advisory lock, two concurrent calls would
    // both read `seen_keys`, both run the drop UPDATE filtered on `NOT
    // (key = ANY $3)`, and the later commit could soft-drop a chapter
    // the earlier had just inserted. The lock makes the calls strictly
    // sequential per-manga: whichever runs second sees the first one's
    // committed chapters and treats their absence as a "dropped" signal
    // only if the second list legitimately omits them.
    //
    // Concretely: pre-state [A]. Call X syncs [A, B]; call Y syncs
    // [A, B, C]. The final state is expected to include *all three*
    // chapters.
    //
    // NOTE(review): list Y is a superset of list X, but not the other
    // way round — X's list omits C. If Y were to commit before X, a
    // strictly sequential X would see C as absent from its listing and
    // presumably soft-drop it, failing the final assertion. The test
    // therefore appears to rely on X (spawned first) running first —
    // confirm against `sync_manga_chapters` and the test runtime's
    // scheduling, or make both lists identical.
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let m = sample_manga("foo", "Foo Manga", "hash-1");
    let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
        .await
        .unwrap();
    // Copied out so both spawned tasks can capture the id by value.
    let manga_id = up.manga_id;

    // Pre-state: [A].
    let pre = vec![SourceChapterRef {
        source_chapter_key: "A".into(),
        number: 1,
        title: Some("Ch.A".into()),
        url: "https://x.example/foo/A".into(),
    }];
    crawler::sync_manga_chapters(&pool, "target", manga_id, &pre)
        .await
        .unwrap();

    // Two concurrent calls. Call X adds B; call Y adds B + C. Both keep
    // A. Their drop branches would otherwise race against each other.
    let list_x = vec![
        SourceChapterRef {
            source_chapter_key: "A".into(),
            number: 1,
            title: Some("Ch.A".into()),
            url: "https://x.example/foo/A".into(),
        },
        SourceChapterRef {
            source_chapter_key: "B".into(),
            number: 2,
            title: Some("Ch.B".into()),
            url: "https://x.example/foo/B".into(),
        },
    ];
    let list_y = vec![
        SourceChapterRef {
            source_chapter_key: "A".into(),
            number: 1,
            title: Some("Ch.A".into()),
            url: "https://x.example/foo/A".into(),
        },
        SourceChapterRef {
            source_chapter_key: "B".into(),
            number: 2,
            title: Some("Ch.B".into()),
            url: "https://x.example/foo/B".into(),
        },
        SourceChapterRef {
            source_chapter_key: "C".into(),
            number: 3,
            title: Some("Ch.C".into()),
            url: "https://x.example/foo/C".into(),
        },
    ];
    // Each task gets its own pool handle and takes ownership of its list.
    let pool_x = pool.clone();
    let pool_y = pool.clone();
    let (rx, ry) = tokio::join!(
        tokio::spawn(async move {
            crawler::sync_manga_chapters(&pool_x, "target", manga_id, &list_x).await
        }),
        tokio::spawn(async move {
            crawler::sync_manga_chapters(&pool_y, "target", manga_id, &list_y).await
        }),
    );
    // Outer `unwrap` surfaces a task panic/cancellation (JoinError);
    // `expect` surfaces an error returned by the sync itself.
    rx.unwrap().expect("call X");
    ry.unwrap().expect("call Y");

    // All three keys must survive with dropped_at NULL — the lock
    // ensures the later call sees the earlier one's INSERTs and the
    // drop UPDATE finds nothing to drop.
    let alive: Vec<String> = sqlx::query_scalar(
        "SELECT cs.source_chapter_key \
           FROM chapter_sources cs \
           JOIN chapters ch ON ch.id = cs.chapter_id \
          WHERE ch.manga_id = $1 AND cs.dropped_at IS NULL \
          ORDER BY cs.source_chapter_key",
    )
    .bind(manga_id)
    .fetch_all(&pool)
    .await
    .unwrap();
    assert_eq!(
        alive,
        vec!["A".to_string(), "B".to_string(), "C".to_string()],
        "all chapters survive concurrent syncs that both contain them"
    );
}

/// The upsert result exposes the manga's current `cover_image_path`:
/// `None` right after creation, and the stored path once one has been
/// written — even when the upsert itself reports `Unchanged`.
#[sqlx::test(migrations = "./migrations")]
async fn upsert_surfaces_cover_image_path_for_backfill_decisions(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let manga = sample_manga("foo", "Foo", "h1");
    let source_url = "https://x.example/foo";

    // Freshly created row: nothing has stored a cover yet.
    let created = crawler::upsert_manga_from_source(&pool, "target", source_url, &manga)
        .await
        .unwrap();
    assert!(created.cover_image_path.is_none(), "new manga has no cover yet");

    // Stand in for the cover being written to storage after the upsert.
    sqlx::query("UPDATE mangas SET cover_image_path = $1 WHERE id = $2")
        .bind("mangas/foo/cover.jpg")
        .bind(created.manga_id)
        .execute(&pool)
        .await
        .unwrap();

    // Same hash again → `Unchanged`, yet the result now carries the
    // cover path so the caller can tell the backfill is done.
    let repeated = crawler::upsert_manga_from_source(&pool, "target", source_url, &manga)
        .await
        .unwrap();
    assert_eq!(repeated.status, UpsertStatus::Unchanged);
    assert_eq!(
        repeated.cover_image_path.as_deref(),
        Some("mangas/foo/cover.jpg")
    );
}

#[sqlx::test(migrations = "./migrations")]
async fn arbitrary_genres_from_source_get_inserted(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let mut m = sample_manga("foo", "Foo", "h");
    // "Action" is seeded by migration 0009. "Webtoons" is not.
    m.genres = vec!["Action".into(), "Webtoons".into()];

    let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
        .await
        .unwrap();

    let n_genre_links: (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM manga_genres WHERE manga_id = $1")
            .bind(up.manga_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(n_genre_links.0, 2, "both seeded and source-added genres attach");

    let webtoons: (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM genres WHERE name = 'Webtoons'")
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(webtoons.0, 1, "non-seeded genre was inserted");

    // Case-insensitive de-dup: a second sync with the genre re-cased
    // attaches the existing row, not a new one.
    let mut m2 = sample_manga("bar", "Bar", "h2");
    m2.genres = vec!["webtoons".into()];
    let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/bar", &m2)
        .await
        .unwrap();
    let webtoons_count: (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM genres WHERE lower(name) = 'webtoons'")
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(webtoons_count.0, 1, "case-insensitive lookup reuses the existing row");
}

/// User-attached tags (rows with non-NULL `added_by` in `manga_tags`)
/// must survive a crawler upsert. The crawler owns source-attached tags
/// (added_by IS NULL); user attachments are owned by the user who made
/// them and the recurring metadata pass must not delete them.
#[sqlx::test(migrations = "./migrations")]
async fn sync_tags_preserves_user_attached_tags(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let m = sample_manga("foo", "Foo Manga", "hash-1");
    let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
        .await
        .unwrap();

    // A real user attaches a personal tag.
    let user = mangalord::repo::user::create(&pool, "alice", "phc-stub")
        .await
        .unwrap();
    let outcome = mangalord::repo::tag::attach_to_manga(&pool, up.manga_id, "personal", user.id)
        .await
        .unwrap();
    assert!(outcome.created_attachment);

    // Second crawler pass. Use a different metadata_hash so the upsert
    // takes the Updated branch, but the bug also fires on Unchanged
    // ticks since sync_tags runs unconditionally.
    let mut m2 = m.clone();
    m2.metadata_hash = "hash-2".into();
    m2.tags = vec!["popular".into(), "weekly".into()];
    let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m2)
        .await
        .unwrap();

    // The user tag must still be attached.
    let user_tag_rows: (i64,) = sqlx::query_as(
        "SELECT COUNT(*) FROM manga_tags mt \
           JOIN tags t ON t.id = mt.tag_id \
          WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \
            AND mt.added_by = $2",
    )
    .bind(up.manga_id)
    .bind(user.id)
    .fetch_one(&pool)
    .await
    .unwrap();
    assert_eq!(
        user_tag_rows.0, 1,
        "user-attached tag must survive a crawler upsert"
    );

    // The source's tags should still attach as well, as crawler-owned.
    let source_tag_rows: (i64,) = sqlx::query_as(
        "SELECT COUNT(*) FROM manga_tags mt \
           JOIN tags t ON t.id = mt.tag_id \
          WHERE mt.manga_id = $1 \
            AND mt.added_by IS NULL \
            AND lower(t.name) IN ('popular', 'weekly')",
    )
    .bind(up.manga_id)
    .fetch_one(&pool)
    .await
    .unwrap();
    assert_eq!(source_tag_rows.0, 2, "source tags re-attach on each pass");

    // A subsequent pass where the source drops a previously-seen tag
    // must clear that crawler-owned attachment (otherwise crawler-tags
    // would only ever accumulate).
    let mut m3 = m2.clone();
    m3.metadata_hash = "hash-3".into();
    m3.tags = vec!["popular".into()];
    let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m3)
        .await
        .unwrap();
    let weekly_rows: (i64,) = sqlx::query_as(
        "SELECT COUNT(*) FROM manga_tags mt \
           JOIN tags t ON t.id = mt.tag_id \
          WHERE mt.manga_id = $1 AND lower(t.name) = 'weekly'",
    )
    .bind(up.manga_id)
    .fetch_one(&pool)
    .await
    .unwrap();
    assert_eq!(weekly_rows.0, 0, "source-owned tag dropped by source goes away");

    // And the user tag still survives that third pass.
    let user_tag_rows: (i64,) = sqlx::query_as(
        "SELECT COUNT(*) FROM manga_tags mt \
           JOIN tags t ON t.id = mt.tag_id \
          WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \
            AND mt.added_by = $2",
    )
    .bind(up.manga_id)
    .bind(user.id)
    .fetch_one(&pool)
    .await
    .unwrap();
    assert_eq!(user_tag_rows.0, 1);
}

/// `manga_tags.added_by` is `ON DELETE SET NULL` on the user FK. When
/// the attaching user is deleted, their attachments become orphans
/// indistinguishable from crawler-owned rows — and the crawler should
/// reap them on the next pass. Pins the semantic so a future change
/// can't quietly leave orphan rows lying around.
#[sqlx::test(migrations = "./migrations")]
async fn sync_tags_garbage_collects_orphan_user_attachments(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let m = sample_manga("foo", "Foo", "hash-1");
    let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
        .await
        .unwrap();

    // A user attaches "personal", then the user gets deleted. The
    // attachment row stays (manga_tags.manga_id FK is CASCADE on
    // mangas only; we never CASCADE-delete user attachments). The FK
    // on added_by is `ON DELETE SET NULL`, so the row's owner column
    // goes NULL — same shape as a crawler-owned row.
    let user = mangalord::repo::user::create(&pool, "bob", "phc-stub")
        .await
        .unwrap();
    let _ = mangalord::repo::tag::attach_to_manga(&pool, up.manga_id, "personal", user.id)
        .await
        .unwrap();
    sqlx::query("DELETE FROM users WHERE id = $1")
        .bind(user.id)
        .execute(&pool)
        .await
        .unwrap();

    // Sanity: the orphan still exists post-user-delete with added_by NULL.
    let (orphan_rows,): (i64,) = sqlx::query_as(
        "SELECT COUNT(*) FROM manga_tags mt \
           JOIN tags t ON t.id = mt.tag_id \
          WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \
            AND mt.added_by IS NULL",
    )
    .bind(up.manga_id)
    .fetch_one(&pool)
    .await
    .unwrap();
    assert_eq!(orphan_rows, 1);

    // Next crawler pass — orphan should be reaped along with any
    // other source-owned rows that aren't in the new tag list.
    let mut m2 = m.clone();
    m2.metadata_hash = "hash-2".into();
    m2.tags = vec!["popular".into()];
    let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m2)
        .await
        .unwrap();
    let (orphan_rows,): (i64,) = sqlx::query_as(
        "SELECT COUNT(*) FROM manga_tags mt \
           JOIN tags t ON t.id = mt.tag_id \
          WHERE mt.manga_id = $1 AND lower(t.name) = 'personal'",
    )
    .bind(up.manga_id)
    .fetch_one(&pool)
    .await
    .unwrap();
    assert_eq!(orphan_rows, 0, "orphan user-attached tag should be reaped");
}

// ---- list_missing_covers ---------------------------------------------------

/// `list_missing_covers` returns only mangas whose `cover_image_path`
/// is unset, and each entry carries the manga id, source key, and
/// source URL of the cover-less manga.
#[sqlx::test(migrations = "./migrations")]
async fn list_missing_covers_only_returns_rows_without_cover(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let with_cover = sample_manga("with", "With Cover", "h1");
    let without_cover = sample_manga("without", "No Cover", "h2");
    // Both results are read below, so neither binding is underscore-prefixed.
    let covered =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/with", &with_cover)
            .await
            .unwrap();
    let uncovered = crawler::upsert_manga_from_source(
        &pool,
        "target",
        "https://x.example/without",
        &without_cover,
    )
    .await
    .unwrap();

    // Manually set a cover for `with` only.
    sqlx::query("UPDATE mangas SET cover_image_path = 'mangas/x/cover.jpg' WHERE id = $1")
        .bind(covered.manga_id)
        .execute(&pool)
        .await
        .unwrap();

    let entries = crawler::list_missing_covers(&pool, 50).await.unwrap();
    // The length assertion guards the `entries[0]` accesses that follow.
    assert_eq!(entries.len(), 1, "exactly the manga without a cover");
    assert_eq!(entries[0].manga_id, uncovered.manga_id);
    assert_eq!(entries[0].source_manga_key, "without");
    assert_eq!(entries[0].source_url, "https://x.example/without");
}

/// A cover-less manga whose only `manga_sources` row has `dropped_at`
/// set is not reported by `list_missing_covers`.
#[sqlx::test(migrations = "./migrations")]
async fn list_missing_covers_skips_dropped_source_rows(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let manga = sample_manga("foo", "Foo", "h1");
    let upserted =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &manga)
            .await
            .unwrap();

    // Mark the manga's source link as dropped directly in the table.
    sqlx::query("UPDATE manga_sources SET dropped_at = NOW() WHERE manga_id = $1")
        .bind(upserted.manga_id)
        .execute(&pool)
        .await
        .unwrap();

    let missing = crawler::list_missing_covers(&pool, 50).await.unwrap();
    assert!(
        missing.is_empty(),
        "dropped-source mangas must not be backfilled — no live source to fetch from"
    );
}

/// With five upserted mangas and a limit of 3, `list_missing_covers`
/// returns exactly three entries.
#[sqlx::test(migrations = "./migrations")]
async fn list_missing_covers_respects_limit(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();

    // Seed five mangas (keys m0..m4); none of them is given a cover.
    for idx in 0..5 {
        let source_key = format!("m{idx}");
        let title = format!("M{idx}");
        let hash = format!("h{idx}");
        let source_url = format!("https://x.example/{source_key}");
        let manga = sample_manga(&source_key, &title, &hash);
        let _ = crawler::upsert_manga_from_source(&pool, "target", &source_url, &manga)
            .await
            .unwrap();
    }

    let missing = crawler::list_missing_covers(&pool, 3).await.unwrap();
    assert_eq!(missing.len(), 3, "limit caps the result set");
}

/// One manga linked from two sources must produce a single
/// `list_missing_covers` entry.
///
/// Duplicates would otherwise eat into the per-tick cap and starve other
/// mangas of their backfill slot.
#[sqlx::test(migrations = "./migrations")]
async fn list_missing_covers_deduplicates_per_manga(pool: PgPool) {
    let sources = [
        ("src-a", "A", "https://a.example"),
        ("src-b", "B", "https://b.example"),
    ];
    for (id, name, base_url) in sources {
        crawler::ensure_source(&pool, id, name, base_url)
            .await
            .unwrap();
    }

    let fixture = sample_manga("foo", "Foo", "h1");
    let manga_id = crawler::upsert_manga_from_source(&pool, "src-a", "https://a.example/foo", &fixture)
        .await
        .unwrap()
        .manga_id;

    // Link the second source to the SAME manga row with a raw insert.
    let link_second_source =
        "INSERT INTO manga_sources (source_id, source_manga_key, manga_id, source_url) \
         VALUES ($1, $2, $3, $4)";
    sqlx::query(link_second_source)
        .bind("src-b")
        .bind("foo-on-b")
        .bind(manga_id)
        .bind("https://b.example/foo")
        .execute(&pool)
        .await
        .unwrap();

    let missing = crawler::list_missing_covers(&pool, 50).await.unwrap();
    assert_eq!(missing.len(), 1, "DISTINCT ON (m.id) collapses duplicate source rows");
}

/// Re-upserting a manga whose source link was marked dropped must clear
/// `dropped_at` and keep the link pointing at the original manga row.
#[sqlx::test(migrations = "./migrations")]
async fn re_appearing_manga_clears_dropped_at(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();

    let fixture = sample_manga("foo", "Foo", "h1");
    let original_id = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
        .await
        .unwrap()
        .manga_id;

    // Simulate the manga vanishing from the source.
    let drop_link = "UPDATE manga_sources SET dropped_at = NOW() WHERE source_manga_key = 'foo'";
    sqlx::query(drop_link).execute(&pool).await.unwrap();

    // The same manga shows up again: the second upsert should un-drop the link.
    crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
        .await
        .unwrap();

    let read_link = "SELECT dropped_at, manga_id FROM manga_sources WHERE source_manga_key = 'foo'";
    let dropped: (Option<chrono::DateTime<chrono::Utc>>, Uuid) = sqlx::query_as(read_link)
        .fetch_one(&pool)
        .await
        .unwrap();
    assert!(dropped.0.is_none());
    assert_eq!(dropped.1, original_id);
}

// ---- source_index: site-order preservation ----
//
// The user-facing chapter list reverses the source-site order so that
// the oldest chapter appears first. The crawler records each row's DOM
// position in `chapters.source_index` (0 = first in source DOM = newest
// on this site) on every sync; the list query orders by source_index
// DESC NULLS LAST, falling through to number/created_at for rows with
// no source row (e.g. user uploads).

/// On the first sync, each chapter's `source_index` must equal its
/// position in the slice handed to `sync_manga_chapters`.
#[sqlx::test(migrations = "./migrations")]
async fn source_index_set_on_insert_matches_dom_order(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let fixture = sample_manga("foo", "Foo Manga", "hash-1");
    let manga_id = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
        .await
        .unwrap()
        .manga_id;

    // Small fixture builder so the list below reads as a table.
    let chapter_ref = |key: &str, number, label: &str, url: &str| SourceChapterRef {
        source_chapter_key: key.into(),
        number,
        title: Some(label.into()),
        url: url.into(),
    };
    let dom_order = vec![
        chapter_ref("a", 30, "Ch.30", "https://x.example/foo/a"),
        chapter_ref("b", 29, "Ch.29", "https://x.example/foo/b"),
        chapter_ref("c", 28, "Ch.28", "https://x.example/foo/c"),
    ];
    crawler::sync_manga_chapters(&pool, "target", manga_id, &dom_order)
        .await
        .unwrap();

    let stored: Vec<(String, Option<i32>)> = sqlx::query_as(
        "SELECT cs.source_chapter_key, c.source_index \
           FROM chapters c \
           JOIN chapter_sources cs ON cs.chapter_id = c.id \
          WHERE c.manga_id = $1 \
          ORDER BY cs.source_chapter_key",
    )
    .bind(manga_id)
    .fetch_all(&pool)
    .await
    .unwrap();

    let expected: Vec<(String, Option<i32>)> = [("a", 0), ("b", 1), ("c", 2)]
        .into_iter()
        .map(|(key, index)| (key.to_string(), Some(index)))
        .collect();
    assert_eq!(
        stored,
        expected,
        "source_index reflects enumerate() position in the input slice",
    );
}

/// A re-sync that prepends a new chapter must rewrite `source_index` on
/// the rows that already exist, not only set it on the inserted row.
#[sqlx::test(migrations = "./migrations")]
async fn source_index_rewritten_on_resync_when_new_chapter_prepended(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let fixture = sample_manga("foo", "Foo Manga", "hash-1");
    let manga_id = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
        .await
        .unwrap()
        .manga_id;

    // Small fixture builder shared by both sync rounds.
    let chapter_ref = |key: &str, number, label: &str, url: &str| SourceChapterRef {
        source_chapter_key: key.into(),
        number,
        title: Some(label.into()),
        url: url.into(),
    };

    // Round one: two chapters, "a" then "b".
    let initial = vec![
        chapter_ref("a", 1, "Ch.1", "https://x.example/foo/a"),
        chapter_ref("b", 2, "Ch.2", "https://x.example/foo/b"),
    ];
    crawler::sync_manga_chapters(&pool, "target", manga_id, &initial)
        .await
        .unwrap();

    // Round two: a brand-new chapter sits at the top of the source list
    // (the site lists newest first). Every pre-existing row has to move
    // one position down so the display order stays correct.
    let with_new_on_top = vec![
        chapter_ref("new", 3, "Ch.3", "https://x.example/foo/new"),
        chapter_ref("a", 1, "Ch.1", "https://x.example/foo/a"),
        chapter_ref("b", 2, "Ch.2", "https://x.example/foo/b"),
    ];
    crawler::sync_manga_chapters(&pool, "target", manga_id, &with_new_on_top)
        .await
        .unwrap();

    let stored: Vec<(String, Option<i32>)> = sqlx::query_as(
        "SELECT cs.source_chapter_key, c.source_index \
           FROM chapters c \
           JOIN chapter_sources cs ON cs.chapter_id = c.id \
          WHERE c.manga_id = $1 \
          ORDER BY cs.source_chapter_key",
    )
    .bind(manga_id)
    .fetch_all(&pool)
    .await
    .unwrap();

    let expected: Vec<(String, Option<i32>)> = [("a", 1), ("b", 2), ("new", 0)]
        .into_iter()
        .map(|(key, index)| (key.to_string(), Some(index)))
        .collect();
    assert_eq!(
        stored,
        expected,
        "new chapter takes index 0, existing rows shift down on UPDATE",
    );
}

/// `chapter_repo::list_for_manga` must return crawled chapters in the
/// reverse of the order they were synced in, regardless of `number`.
#[sqlx::test(migrations = "./migrations")]
async fn list_for_manga_returns_source_order_reversed(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let fixture = sample_manga("foo", "Foo Manga", "hash-1");
    let manga_id = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
        .await
        .unwrap()
        .manga_id;

    let chapter_ref = |key: &str, number, label: &str, url: &str| SourceChapterRef {
        source_chapter_key: key.into(),
        number,
        title: Some(label.into()),
        url: url.into(),
    };

    // Top-down order on the site (newest first):
    //   ch11   -> number 11
    //   notice -> number 0 (the site label is not numeric)
    //   ch10   -> number 10
    // The numbers are chosen to contradict the DOM order on purpose:
    // sorting by number would move the notice to the front, whereas the
    // site shows it between ch10 and ch11. Reversing the DOM order should
    // therefore give [ch10, notice, ch11].
    let dom_order = vec![
        chapter_ref("ch11", 11, "Ch.11 : Official", "https://x.example/foo/11"),
        chapter_ref("notice", 0, "notice. : Officials", "https://x.example/foo/notice"),
        chapter_ref("ch10", 10, "Ch.10 : Official", "https://x.example/foo/10"),
    ];
    crawler::sync_manga_chapters(&pool, "target", manga_id, &dom_order)
        .await
        .unwrap();

    let listed = chapter_repo::list_for_manga(&pool, manga_id, 50, 0)
        .await
        .unwrap();
    let titles: Vec<String> = listed
        .iter()
        .map(|chapter| chapter.title.clone().unwrap_or_default())
        .collect();

    let expected: Vec<String> = ["Ch.10 : Official", "notice. : Officials", "Ch.11 : Official"]
        .iter()
        .map(|title| title.to_string())
        .collect();
    assert_eq!(
        titles,
        expected,
        "list returns chapters in reversed source-DOM order, so the \
         oldest appears first and non-numeric entries land where the \
         site placed them",
    );
}

/// A chapter created through `chapter_repo::create` must be listed after
/// all crawled chapters, even when its `number` falls between theirs.
#[sqlx::test(migrations = "./migrations")]
async fn list_for_manga_places_null_source_index_last(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let fixture = sample_manga("foo", "Foo Manga", "hash-1");
    let manga_id = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
        .await
        .unwrap()
        .manga_id;

    let chapter_ref = |key: &str, number, label: &str, url: &str| SourceChapterRef {
        source_chapter_key: key.into(),
        number,
        title: Some(label.into()),
        url: url.into(),
    };

    // The two crawled chapters receive source_index 0 and 1, while the
    // upload path leaves the column NULL. With NULLS LAST and the
    // (number, created_at) tie-break tail, the upload is expected behind
    // both crawled rows although its number lies between theirs.
    let crawled = vec![
        chapter_ref("a", 1, "Ch.1", "https://x.example/foo/a"),
        chapter_ref("b", 3, "Ch.3", "https://x.example/foo/b"),
    ];
    crawler::sync_manga_chapters(&pool, "target", manga_id, &crawled)
        .await
        .unwrap();

    chapter_repo::create(&pool, manga_id, 2, Some("User upload Ch.2"), None)
        .await
        .unwrap();

    let listed = chapter_repo::list_for_manga(&pool, manga_id, 50, 0)
        .await
        .unwrap();
    let listed_titles: Vec<String> = listed
        .iter()
        .map(|chapter| chapter.title.clone().unwrap_or_default())
        .collect();

    let expected: Vec<String> = ["Ch.3", "Ch.1", "User upload Ch.2"]
        .iter()
        .map(|title| title.to_string())
        .collect();
    assert_eq!(
        listed_titles,
        expected,
        "crawled rows ordered by reversed source_index; user upload \
         (NULL source_index) falls through to the end",
    );
}