The wait_for_selector wait in 0.36.2 narrows the partial-render race window but doesn't close it: a render that takes longer than SELECTOR_TIMEOUT (10s) still hands an empty Vec to sync_manga_chapters, and the soft-drop branch flips every existing chapter to dropped_at. The next tick recovers but a manga's reader briefly stops working in between. Close it at the pipeline level. Between fetch_manga and the upsert/sync, if the parsed chapter list is empty and the prior live count for (source_id, source_manga_key) is > 0, treat the fetch as a transient failure: log, bump mangas_failed, skip upsert + sync + the seen.insert so a later batch / tick retries. Brand-new mangas with genuinely zero chapters (prior == 0) pass through unchanged. New repo helper repo::crawler::live_chapter_count_for_source_manga joins chapters → chapter_sources → manga_sources with dropped_at IS NULL — same lockstep as dispatch_target and the enqueue queries. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
863 lines
30 KiB
Rust
863 lines
30 KiB
Rust
//! Integration tests for `repo::crawler`.
|
|
//!
|
|
//! Each test runs against a fresh, migrated DB via `#[sqlx::test]`.
|
|
//! `DATABASE_URL` must point to a Postgres where the test user can
|
|
//! `CREATEDB`.
|
|
|
|
use mangalord::crawler::source::{SourceChapterRef, SourceManga};
|
|
use mangalord::repo::crawler::{self, ChapterDiff, UpsertStatus};
|
|
use sqlx::PgPool;
|
|
use uuid::Uuid;
|
|
|
|
/// Helper to spin up a `SourceManga` fixture with a stable shape so
|
|
/// each test can tweak just the fields it cares about.
|
|
fn sample_manga(key: &str, title: &str, hash: &str) -> SourceManga {
|
|
SourceManga {
|
|
source_manga_key: key.to_string(),
|
|
title: title.to_string(),
|
|
alternative_titles: vec!["Alt 1".into()],
|
|
authors: vec!["Author One".into()],
|
|
// Action is in the seeded `genres` table; Fantasy is too.
|
|
genres: vec!["Action".into(), "Fantasy".into()],
|
|
tags: vec!["popular".into()],
|
|
status: Some("ongoing".into()),
|
|
summary: Some("Sample summary.".into()),
|
|
cover_url: Some("/cover.jpg".into()),
|
|
chapters: vec![],
|
|
metadata_hash: hash.to_string(),
|
|
}
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn ensure_source_is_idempotent(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "Target Site", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
crawler::ensure_source(&pool, "target", "Target Site v2", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM sources WHERE id = 'target'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(count.0, 1);
|
|
let name: (String,) = sqlx::query_as("SELECT name FROM sources WHERE id = 'target'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(name.0, "Target Site v2", "name updates on re-call");
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn first_upsert_inserts_manga_and_links_metadata(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo Manga", "hash-1");
|
|
|
|
let res = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(res.status, UpsertStatus::New);
|
|
|
|
// mangas row created
|
|
let row: (String, String, Vec<String>) =
|
|
sqlx::query_as("SELECT title, status, alt_titles FROM mangas WHERE id = $1")
|
|
.bind(res.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(row.0, "Foo Manga");
|
|
assert_eq!(row.1, "ongoing");
|
|
assert_eq!(row.2, vec!["Alt 1"]);
|
|
|
|
// manga_sources row links the two
|
|
let link: (String, Uuid, Option<String>) = sqlx::query_as(
|
|
"SELECT source_id, manga_id, metadata_hash FROM manga_sources WHERE source_manga_key = $1",
|
|
)
|
|
.bind("foo")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(link.0, "target");
|
|
assert_eq!(link.1, res.manga_id);
|
|
assert_eq!(link.2.as_deref(), Some("hash-1"));
|
|
|
|
// Authors, genres, tags M2M populated
|
|
let n_authors: (i64,) =
|
|
sqlx::query_as("SELECT COUNT(*) FROM manga_authors WHERE manga_id = $1")
|
|
.bind(res.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(n_authors.0, 1);
|
|
let n_genres: (i64,) =
|
|
sqlx::query_as("SELECT COUNT(*) FROM manga_genres WHERE manga_id = $1")
|
|
.bind(res.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(n_genres.0, 2, "Action + Fantasy");
|
|
let n_tags: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM manga_tags WHERE manga_id = $1")
|
|
.bind(res.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(n_tags.0, 1);
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn second_upsert_with_same_hash_reports_unchanged(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo Manga", "hash-1");
|
|
let first = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
let second = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(second.status, UpsertStatus::Unchanged);
|
|
assert_eq!(second.manga_id, first.manga_id);
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn upsert_with_changed_hash_updates_fields(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let mut m = sample_manga("foo", "Foo Manga", "hash-1");
|
|
let first = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
|
|
m.title = "Foo Manga (Revised)".into();
|
|
m.status = Some("completed".into());
|
|
m.metadata_hash = "hash-2".into();
|
|
let second = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
|
|
assert_eq!(second.status, UpsertStatus::Updated);
|
|
assert_eq!(second.manga_id, first.manga_id);
|
|
|
|
let row: (String, String) =
|
|
sqlx::query_as("SELECT title, status FROM mangas WHERE id = $1")
|
|
.bind(first.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(row.0, "Foo Manga (Revised)");
|
|
assert_eq!(row.1, "completed");
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn sync_chapters_adds_new_refreshes_existing_and_drops_vanished(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo Manga", "hash-1");
|
|
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
|
|
let initial = vec![
|
|
SourceChapterRef {
|
|
source_chapter_key: "1".into(),
|
|
number: 1,
|
|
title: Some("Ch.1".into()),
|
|
url: "https://x.example/foo/1".into(),
|
|
},
|
|
SourceChapterRef {
|
|
source_chapter_key: "2".into(),
|
|
number: 2,
|
|
title: Some("Ch.2".into()),
|
|
url: "https://x.example/foo/2".into(),
|
|
},
|
|
];
|
|
let diff = crawler::sync_manga_chapters(&pool, "target", up.manga_id, &initial)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(
|
|
diff,
|
|
ChapterDiff {
|
|
new: 2,
|
|
refreshed: 0,
|
|
dropped: 0
|
|
}
|
|
);
|
|
|
|
// Second run: keep ch1, replace ch2 with ch3 — ch2 should be dropped.
|
|
let second = vec![
|
|
SourceChapterRef {
|
|
source_chapter_key: "1".into(),
|
|
number: 1,
|
|
title: Some("Ch.1 (renamed)".into()),
|
|
url: "https://x.example/foo/1".into(),
|
|
},
|
|
SourceChapterRef {
|
|
source_chapter_key: "3".into(),
|
|
number: 3,
|
|
title: Some("Ch.3".into()),
|
|
url: "https://x.example/foo/3".into(),
|
|
},
|
|
];
|
|
let diff = crawler::sync_manga_chapters(&pool, "target", up.manga_id, &second)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(
|
|
diff,
|
|
ChapterDiff {
|
|
new: 1,
|
|
refreshed: 1,
|
|
dropped: 1
|
|
}
|
|
);
|
|
|
|
// Renamed title propagated to chapters.title
|
|
let title: (Option<String>,) =
|
|
sqlx::query_as("SELECT c.title FROM chapters c JOIN chapter_sources cs ON cs.chapter_id = c.id WHERE cs.source_chapter_key = '1'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(title.0.as_deref(), Some("Ch.1 (renamed)"));
|
|
|
|
// Vanished chapter is soft-dropped (row still exists, dropped_at set).
|
|
let dropped: (Option<chrono::DateTime<chrono::Utc>>,) =
|
|
sqlx::query_as("SELECT dropped_at FROM chapter_sources WHERE source_chapter_key = '2'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert!(dropped.0.is_some(), "ch2 should be soft-dropped");
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn live_chapter_count_returns_zero_for_unknown_source_key(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
// No manga_sources row yet → unknown key path. Must not error and
|
|
// must report zero so the partial-render guard accepts the
|
|
// "brand-new manga with no chapters" case as legitimate.
|
|
let n = crawler::live_chapter_count_for_source_manga(&pool, "target", "nobody")
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(n, 0);
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn live_chapter_count_only_counts_live_sources(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo Manga", "hash-1");
|
|
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
let chapters = vec![
|
|
SourceChapterRef {
|
|
source_chapter_key: "1".into(),
|
|
number: 1,
|
|
title: Some("Ch.1".into()),
|
|
url: "https://x.example/foo/1".into(),
|
|
},
|
|
SourceChapterRef {
|
|
source_chapter_key: "2".into(),
|
|
number: 2,
|
|
title: Some("Ch.2".into()),
|
|
url: "https://x.example/foo/2".into(),
|
|
},
|
|
];
|
|
crawler::sync_manga_chapters(&pool, "target", up.manga_id, &chapters)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(
|
|
crawler::live_chapter_count_for_source_manga(&pool, "target", "foo")
|
|
.await
|
|
.unwrap(),
|
|
2
|
|
);
|
|
// Soft-drop one source row — count drops by one, the row stays.
|
|
sqlx::query(
|
|
"UPDATE chapter_sources SET dropped_at = NOW() WHERE source_chapter_key = '2'",
|
|
)
|
|
.execute(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(
|
|
crawler::live_chapter_count_for_source_manga(&pool, "target", "foo")
|
|
.await
|
|
.unwrap(),
|
|
1
|
|
);
|
|
}
|
|
|
|
/// Real-world sources publish multiple chapters at the same number
|
|
/// (different uploaders, translator notes, re-releases). After the
|
|
/// (manga_id, number) UNIQUE drop in 0013, each `SourceChapterRef`
|
|
/// becomes its own `chapters` row even when the parsed number matches
|
|
/// — chapter identity is now the chapter id, not the number.
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn sync_chapters_keeps_duplicate_numbered_chapters_as_separate_rows(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo Manga", "hash-1");
|
|
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
|
|
// Two distinct uploads of Ch.52 (different uploaders → different
|
|
// URLs/keys, same parsed number) plus a notice/hiatus row that
|
|
// parses to number=0 alongside a real chapter at number 1.
|
|
let chapters = vec![
|
|
SourceChapterRef {
|
|
source_chapter_key: "br_chapter-A".into(),
|
|
number: 52,
|
|
title: Some("Ch.52 : Official".into()),
|
|
url: "https://x.example/foo/A/pg-1/".into(),
|
|
},
|
|
SourceChapterRef {
|
|
source_chapter_key: "br_chapter-B".into(),
|
|
number: 52,
|
|
title: Some("Ch.52 : Official (alt)".into()),
|
|
url: "https://x.example/foo/B/pg-1/".into(),
|
|
},
|
|
SourceChapterRef {
|
|
source_chapter_key: "br_chapter-NOTICE".into(),
|
|
number: 0,
|
|
title: Some("hitaus.".into()),
|
|
url: "https://x.example/foo/notice/pg-1/".into(),
|
|
},
|
|
SourceChapterRef {
|
|
source_chapter_key: "br_chapter-1".into(),
|
|
number: 1,
|
|
title: Some("Ch.1 : Official".into()),
|
|
url: "https://x.example/foo/1/pg-1/".into(),
|
|
},
|
|
];
|
|
|
|
let diff = crawler::sync_manga_chapters(&pool, "target", up.manga_id, &chapters)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(
|
|
diff,
|
|
ChapterDiff {
|
|
new: 4,
|
|
refreshed: 0,
|
|
dropped: 0
|
|
},
|
|
"every source ref yields a new chapter row"
|
|
);
|
|
|
|
let rows: (i64,) =
|
|
sqlx::query_as("SELECT COUNT(*) FROM chapters WHERE manga_id = $1")
|
|
.bind(up.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(rows.0, 4, "4 distinct chapter rows even with duplicate numbers");
|
|
|
|
let ch52_count: (i64,) = sqlx::query_as(
|
|
"SELECT COUNT(*) FROM chapters WHERE manga_id = $1 AND number = 52",
|
|
)
|
|
.bind(up.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(ch52_count.0, 2, "both Ch.52 uploads survive as separate rows");
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn sync_chapters_isolates_colliding_keys_across_mangas(pool: PgPool) {
|
|
// Two mangas, both with a chapter whose source_chapter_key is
|
|
// "chapter-1". Pre-migration-0017 the PK enforced (source_id,
|
|
// source_chapter_key) globally and the lookup didn't filter by
|
|
// manga_id, so the second manga's sync would adopt the first manga's
|
|
// chapter_id (silent attribution corruption). After 0017 each manga
|
|
// owns its own row.
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m1 = sample_manga("foo", "Manga Foo", "hash-foo");
|
|
let m2 = sample_manga("bar", "Manga Bar", "hash-bar");
|
|
let up1 = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m1)
|
|
.await
|
|
.unwrap();
|
|
let up2 = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/bar", &m2)
|
|
.await
|
|
.unwrap();
|
|
assert_ne!(up1.manga_id, up2.manga_id);
|
|
|
|
let shared = vec![SourceChapterRef {
|
|
source_chapter_key: "chapter-1".into(),
|
|
number: 1,
|
|
title: Some("Ch.1".into()),
|
|
url: "https://x.example/foo/chapter-1/".into(),
|
|
}];
|
|
let diff1 = crawler::sync_manga_chapters(&pool, "target", up1.manga_id, &shared)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(diff1.new, 1, "manga foo: chapter inserted fresh");
|
|
|
|
// Manga bar now syncs *the same key*. Under the old schema this would
|
|
// either fail on PK conflict or attribute the chapter to foo. Under
|
|
// the new schema bar gets its own chapter row.
|
|
let bar_chapters = vec![SourceChapterRef {
|
|
source_chapter_key: "chapter-1".into(),
|
|
number: 1,
|
|
title: Some("Ch.1 (bar)".into()),
|
|
url: "https://x.example/bar/chapter-1/".into(),
|
|
}];
|
|
let diff2 = crawler::sync_manga_chapters(&pool, "target", up2.manga_id, &bar_chapters)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(
|
|
diff2.new, 1,
|
|
"manga bar: same key resolved per-manga to a fresh row"
|
|
);
|
|
|
|
let foo_count: (i64,) = sqlx::query_as(
|
|
"SELECT COUNT(*) FROM chapters WHERE manga_id = $1",
|
|
)
|
|
.bind(up1.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
let bar_count: (i64,) = sqlx::query_as(
|
|
"SELECT COUNT(*) FROM chapters WHERE manga_id = $1",
|
|
)
|
|
.bind(up2.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(foo_count.0, 1);
|
|
assert_eq!(bar_count.0, 1);
|
|
|
|
let bar_title: (Option<String>,) = sqlx::query_as(
|
|
"SELECT title FROM chapters WHERE manga_id = $1 AND number = 1",
|
|
)
|
|
.bind(up2.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(
|
|
bar_title.0.as_deref(),
|
|
Some("Ch.1 (bar)"),
|
|
"bar's chapter has bar's title, not foo's"
|
|
);
|
|
|
|
// A subsequent re-sync of foo with the same key correctly refreshes
|
|
// foo's row, not bar's.
|
|
let foo_resync = vec![SourceChapterRef {
|
|
source_chapter_key: "chapter-1".into(),
|
|
number: 1,
|
|
title: Some("Ch.1 (foo updated)".into()),
|
|
url: "https://x.example/foo/chapter-1/".into(),
|
|
}];
|
|
let diff_refresh = crawler::sync_manga_chapters(&pool, "target", up1.manga_id, &foo_resync)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(diff_refresh.refreshed, 1);
|
|
assert_eq!(diff_refresh.new, 0);
|
|
|
|
let foo_title: (Option<String>,) = sqlx::query_as(
|
|
"SELECT title FROM chapters WHERE manga_id = $1 AND number = 1",
|
|
)
|
|
.bind(up1.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(foo_title.0.as_deref(), Some("Ch.1 (foo updated)"));
|
|
let bar_title_after: (Option<String>,) = sqlx::query_as(
|
|
"SELECT title FROM chapters WHERE manga_id = $1 AND number = 1",
|
|
)
|
|
.bind(up2.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(
|
|
bar_title_after.0.as_deref(),
|
|
Some("Ch.1 (bar)"),
|
|
"bar's row is untouched by foo's refresh"
|
|
);
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn sync_chapters_serializes_concurrent_calls_for_same_manga(pool: PgPool) {
|
|
// Without the per-manga advisory lock, two concurrent calls would
|
|
// both read `seen_keys`, both run the drop UPDATE filtered on `NOT
|
|
// (key = ANY $3)`, and the later commit could soft-drop a chapter
|
|
// the earlier had just inserted. The lock makes the calls strictly
|
|
// sequential per-manga: whichever runs second sees the first one's
|
|
// committed chapters and treats their absence as a "dropped" signal
|
|
// only if the second list legitimately omits them.
|
|
//
|
|
// Concretely: pre-state [A]. Call X syncs [A, B]; call Y syncs
|
|
// [A, B, C]. Whatever the schedule, the final state must include
|
|
// *all three* chapters because neither call legitimately omits the
|
|
// other's contribution — both lists are supersets of each other's
|
|
// pre-existing rows.
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo Manga", "hash-1");
|
|
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
let manga_id = up.manga_id;
|
|
|
|
// Pre-state: [A].
|
|
let pre = vec![SourceChapterRef {
|
|
source_chapter_key: "A".into(),
|
|
number: 1,
|
|
title: Some("Ch.A".into()),
|
|
url: "https://x.example/foo/A".into(),
|
|
}];
|
|
crawler::sync_manga_chapters(&pool, "target", manga_id, &pre)
|
|
.await
|
|
.unwrap();
|
|
|
|
// Two concurrent calls. Call X adds B; call Y adds B + C. Both keep
|
|
// A. Their drop branches would otherwise race against each other.
|
|
let list_x = vec![
|
|
SourceChapterRef {
|
|
source_chapter_key: "A".into(),
|
|
number: 1,
|
|
title: Some("Ch.A".into()),
|
|
url: "https://x.example/foo/A".into(),
|
|
},
|
|
SourceChapterRef {
|
|
source_chapter_key: "B".into(),
|
|
number: 2,
|
|
title: Some("Ch.B".into()),
|
|
url: "https://x.example/foo/B".into(),
|
|
},
|
|
];
|
|
let list_y = vec![
|
|
SourceChapterRef {
|
|
source_chapter_key: "A".into(),
|
|
number: 1,
|
|
title: Some("Ch.A".into()),
|
|
url: "https://x.example/foo/A".into(),
|
|
},
|
|
SourceChapterRef {
|
|
source_chapter_key: "B".into(),
|
|
number: 2,
|
|
title: Some("Ch.B".into()),
|
|
url: "https://x.example/foo/B".into(),
|
|
},
|
|
SourceChapterRef {
|
|
source_chapter_key: "C".into(),
|
|
number: 3,
|
|
title: Some("Ch.C".into()),
|
|
url: "https://x.example/foo/C".into(),
|
|
},
|
|
];
|
|
let pool_x = pool.clone();
|
|
let pool_y = pool.clone();
|
|
let (rx, ry) = tokio::join!(
|
|
tokio::spawn(async move {
|
|
crawler::sync_manga_chapters(&pool_x, "target", manga_id, &list_x).await
|
|
}),
|
|
tokio::spawn(async move {
|
|
crawler::sync_manga_chapters(&pool_y, "target", manga_id, &list_y).await
|
|
}),
|
|
);
|
|
rx.unwrap().expect("call X");
|
|
ry.unwrap().expect("call Y");
|
|
|
|
// All three keys must survive with dropped_at NULL — the lock
|
|
// ensures the later call sees the earlier one's INSERTs and the
|
|
// drop UPDATE finds nothing to drop.
|
|
let alive: Vec<String> = sqlx::query_scalar(
|
|
"SELECT cs.source_chapter_key \
|
|
FROM chapter_sources cs \
|
|
JOIN chapters ch ON ch.id = cs.chapter_id \
|
|
WHERE ch.manga_id = $1 AND cs.dropped_at IS NULL \
|
|
ORDER BY cs.source_chapter_key",
|
|
)
|
|
.bind(manga_id)
|
|
.fetch_all(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(
|
|
alive,
|
|
vec!["A".to_string(), "B".to_string(), "C".to_string()],
|
|
"all chapters survive concurrent syncs that both contain them"
|
|
);
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn upsert_surfaces_cover_image_path_for_backfill_decisions(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo", "h1");
|
|
|
|
// First upsert: row is brand new, no cover stored yet.
|
|
let first = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
assert!(first.cover_image_path.is_none(), "new manga has no cover yet");
|
|
|
|
// Simulate cover landing in storage post-upsert.
|
|
sqlx::query("UPDATE mangas SET cover_image_path = $1 WHERE id = $2")
|
|
.bind("mangas/foo/cover.jpg")
|
|
.bind(first.manga_id)
|
|
.execute(&pool)
|
|
.await
|
|
.unwrap();
|
|
|
|
// Second upsert with same hash → Unchanged, but cover path is now
|
|
// surfaced so the caller knows the backfill is done.
|
|
let second = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(second.status, UpsertStatus::Unchanged);
|
|
assert_eq!(
|
|
second.cover_image_path.as_deref(),
|
|
Some("mangas/foo/cover.jpg")
|
|
);
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn arbitrary_genres_from_source_get_inserted(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let mut m = sample_manga("foo", "Foo", "h");
|
|
// "Action" is seeded by migration 0009. "Webtoons" is not.
|
|
m.genres = vec!["Action".into(), "Webtoons".into()];
|
|
|
|
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
|
|
let n_genre_links: (i64,) =
|
|
sqlx::query_as("SELECT COUNT(*) FROM manga_genres WHERE manga_id = $1")
|
|
.bind(up.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(n_genre_links.0, 2, "both seeded and source-added genres attach");
|
|
|
|
let webtoons: (i64,) =
|
|
sqlx::query_as("SELECT COUNT(*) FROM genres WHERE name = 'Webtoons'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(webtoons.0, 1, "non-seeded genre was inserted");
|
|
|
|
// Case-insensitive de-dup: a second sync with the genre re-cased
|
|
// attaches the existing row, not a new one.
|
|
let mut m2 = sample_manga("bar", "Bar", "h2");
|
|
m2.genres = vec!["webtoons".into()];
|
|
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/bar", &m2)
|
|
.await
|
|
.unwrap();
|
|
let webtoons_count: (i64,) =
|
|
sqlx::query_as("SELECT COUNT(*) FROM genres WHERE lower(name) = 'webtoons'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(webtoons_count.0, 1, "case-insensitive lookup reuses the existing row");
|
|
}
|
|
|
|
/// User-attached tags (rows with non-NULL `added_by` in `manga_tags`)
|
|
/// must survive a crawler upsert. The crawler owns source-attached tags
|
|
/// (added_by IS NULL); user attachments are owned by the user who made
|
|
/// them and the recurring metadata pass must not delete them.
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn sync_tags_preserves_user_attached_tags(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo Manga", "hash-1");
|
|
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
|
|
// A real user attaches a personal tag.
|
|
let user = mangalord::repo::user::create(&pool, "alice", "phc-stub")
|
|
.await
|
|
.unwrap();
|
|
let outcome = mangalord::repo::tag::attach_to_manga(&pool, up.manga_id, "personal", user.id)
|
|
.await
|
|
.unwrap();
|
|
assert!(outcome.created_attachment);
|
|
|
|
// Second crawler pass. Use a different metadata_hash so the upsert
|
|
// takes the Updated branch, but the bug also fires on Unchanged
|
|
// ticks since sync_tags runs unconditionally.
|
|
let mut m2 = m.clone();
|
|
m2.metadata_hash = "hash-2".into();
|
|
m2.tags = vec!["popular".into(), "weekly".into()];
|
|
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m2)
|
|
.await
|
|
.unwrap();
|
|
|
|
// The user tag must still be attached.
|
|
let user_tag_rows: (i64,) = sqlx::query_as(
|
|
"SELECT COUNT(*) FROM manga_tags mt \
|
|
JOIN tags t ON t.id = mt.tag_id \
|
|
WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \
|
|
AND mt.added_by = $2",
|
|
)
|
|
.bind(up.manga_id)
|
|
.bind(user.id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(
|
|
user_tag_rows.0, 1,
|
|
"user-attached tag must survive a crawler upsert"
|
|
);
|
|
|
|
// The source's tags should still attach as well, as crawler-owned.
|
|
let source_tag_rows: (i64,) = sqlx::query_as(
|
|
"SELECT COUNT(*) FROM manga_tags mt \
|
|
JOIN tags t ON t.id = mt.tag_id \
|
|
WHERE mt.manga_id = $1 \
|
|
AND mt.added_by IS NULL \
|
|
AND lower(t.name) IN ('popular', 'weekly')",
|
|
)
|
|
.bind(up.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(source_tag_rows.0, 2, "source tags re-attach on each pass");
|
|
|
|
// A subsequent pass where the source drops a previously-seen tag
|
|
// must clear that crawler-owned attachment (otherwise crawler-tags
|
|
// would only ever accumulate).
|
|
let mut m3 = m2.clone();
|
|
m3.metadata_hash = "hash-3".into();
|
|
m3.tags = vec!["popular".into()];
|
|
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m3)
|
|
.await
|
|
.unwrap();
|
|
let weekly_rows: (i64,) = sqlx::query_as(
|
|
"SELECT COUNT(*) FROM manga_tags mt \
|
|
JOIN tags t ON t.id = mt.tag_id \
|
|
WHERE mt.manga_id = $1 AND lower(t.name) = 'weekly'",
|
|
)
|
|
.bind(up.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(weekly_rows.0, 0, "source-owned tag dropped by source goes away");
|
|
|
|
// And the user tag still survives that third pass.
|
|
let user_tag_rows: (i64,) = sqlx::query_as(
|
|
"SELECT COUNT(*) FROM manga_tags mt \
|
|
JOIN tags t ON t.id = mt.tag_id \
|
|
WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \
|
|
AND mt.added_by = $2",
|
|
)
|
|
.bind(up.manga_id)
|
|
.bind(user.id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(user_tag_rows.0, 1);
|
|
}
|
|
|
|
/// `manga_tags.added_by` is `ON DELETE SET NULL` on the user FK. When
|
|
/// the attaching user is deleted, their attachments become orphans
|
|
/// indistinguishable from crawler-owned rows — and the crawler should
|
|
/// reap them on the next pass. Pins the semantic so a future change
|
|
/// can't quietly leave orphan rows lying around.
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn sync_tags_garbage_collects_orphan_user_attachments(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo", "hash-1");
|
|
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
|
|
// A user attaches "personal", then the user gets deleted. The
|
|
// attachment row stays (manga_tags.manga_id FK is CASCADE on
|
|
// mangas only; we never CASCADE-delete user attachments). The FK
|
|
// on added_by is `ON DELETE SET NULL`, so the row's owner column
|
|
// goes NULL — same shape as a crawler-owned row.
|
|
let user = mangalord::repo::user::create(&pool, "bob", "phc-stub")
|
|
.await
|
|
.unwrap();
|
|
let _ = mangalord::repo::tag::attach_to_manga(&pool, up.manga_id, "personal", user.id)
|
|
.await
|
|
.unwrap();
|
|
sqlx::query("DELETE FROM users WHERE id = $1")
|
|
.bind(user.id)
|
|
.execute(&pool)
|
|
.await
|
|
.unwrap();
|
|
|
|
// Sanity: the orphan still exists post-user-delete with added_by NULL.
|
|
let (orphan_rows,): (i64,) = sqlx::query_as(
|
|
"SELECT COUNT(*) FROM manga_tags mt \
|
|
JOIN tags t ON t.id = mt.tag_id \
|
|
WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \
|
|
AND mt.added_by IS NULL",
|
|
)
|
|
.bind(up.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(orphan_rows, 1);
|
|
|
|
// Next crawler pass — orphan should be reaped along with any
|
|
// other source-owned rows that aren't in the new tag list.
|
|
let mut m2 = m.clone();
|
|
m2.metadata_hash = "hash-2".into();
|
|
m2.tags = vec!["popular".into()];
|
|
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m2)
|
|
.await
|
|
.unwrap();
|
|
let (orphan_rows,): (i64,) = sqlx::query_as(
|
|
"SELECT COUNT(*) FROM manga_tags mt \
|
|
JOIN tags t ON t.id = mt.tag_id \
|
|
WHERE mt.manga_id = $1 AND lower(t.name) = 'personal'",
|
|
)
|
|
.bind(up.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(orphan_rows, 0, "orphan user-attached tag should be reaped");
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn re_appearing_manga_clears_dropped_at(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo", "h1");
|
|
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
|
|
// Drop it manually.
|
|
sqlx::query(
|
|
"UPDATE manga_sources SET dropped_at = NOW() WHERE source_manga_key = 'foo'",
|
|
)
|
|
.execute(&pool)
|
|
.await
|
|
.unwrap();
|
|
|
|
// Re-upsert: the link should un-drop.
|
|
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
let dropped: (Option<chrono::DateTime<chrono::Utc>>, Uuid) = sqlx::query_as(
|
|
"SELECT dropped_at, manga_id FROM manga_sources WHERE source_manga_key = 'foo'",
|
|
)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert!(dropped.0.is_none());
|
|
assert_eq!(dropped.1, up.manga_id);
|
|
}
|