Files
Mangalord/backend/tests/crawler_sync.rs
MechaCat02 8d34132883 bugfix: security & correctness bundle (0.34.1)
Five fixes bundled into one release:

- preserve user-attached tags across crawler upserts
  (repo::crawler::sync_tags now scopes to added_by IS NULL; orphaned
  attachments from deleted users are reaped as crawler-owned)
- gate manga PATCH and cover endpoints on uploaded_by (require_can_edit
  in api::mangas; non-NULL uploaded_by must match the caller)
- equalise login response time across user-existence branches
  (run argon2 against a OnceLock-cached dummy hash on the no-user
  branch so timing doesn't leak username existence)
- crawler download defences (SSRF allowlist of host literals
  including IPv4-mapped IPv6 ranges, 32 MiB streamed size cap,
  reject non-whitelisted image types, three-way chapter-probe
  classifier replaces the binary #avatar_menu check)
- tighten validation and clean up dead unload path
  (attach_tag + create_token enforce 64-char caps; LocalStorage
  rejects NUL bytes explicitly; reader flushFinalProgress drops
  the always-405 sendBeacon path)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 20:24:51 +02:00

638 lines
22 KiB
Rust

//! Integration tests for `repo::crawler`.
//!
//! Each test runs against a fresh, migrated DB via `#[sqlx::test]`.
//! `DATABASE_URL` must point to a Postgres where the test user can
//! `CREATEDB`.
use mangalord::crawler::source::{SourceChapterRef, SourceManga};
use mangalord::repo::crawler::{self, ChapterDiff, UpsertStatus};
use sqlx::PgPool;
use uuid::Uuid;
/// Builds a [`SourceManga`] fixture whose fields are all fixed except
/// the source key, the title, and the metadata hash, so a test only
/// overrides what it actually exercises.
fn sample_manga(key: &str, title: &str, hash: &str) -> SourceManga {
    SourceManga {
        // Caller-controlled identity fields.
        source_manga_key: String::from(key),
        title: String::from(title),
        metadata_hash: String::from(hash),
        // Fixed defaults shared by every test.
        alternative_titles: vec!["Alt 1".into()],
        authors: vec!["Author One".into()],
        // Both of these genres are present in the seeded `genres` table.
        genres: vec!["Action".into(), "Fantasy".into()],
        tags: vec!["popular".into()],
        status: Some("ongoing".into()),
        summary: Some("Sample summary.".into()),
        cover_url: Some("/cover.jpg".into()),
        chapters: vec![],
    }
}
/// Registering the same source id twice must leave exactly one
/// `sources` row, carrying the name from the most recent call.
#[sqlx::test(migrations = "./migrations")]
async fn ensure_source_is_idempotent(pool: PgPool) {
    // Same id and URL both times; only the display name changes.
    for display_name in ["Target Site", "Target Site v2"] {
        crawler::ensure_source(&pool, "target", display_name, "https://x.example")
            .await
            .unwrap();
    }

    let (row_count,): (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM sources WHERE id = 'target'")
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(row_count, 1);

    let (stored_name,): (String,) =
        sqlx::query_as("SELECT name FROM sources WHERE id = 'target'")
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(stored_name, "Target Site v2", "name updates on re-call");
}
/// A first upsert for an unseen source key reports `New`, writes the
/// `mangas` row, records the `manga_sources` link, and fills the
/// author / genre / tag join tables from the fixture.
#[sqlx::test(migrations = "./migrations")]
async fn first_upsert_inserts_manga_and_links_metadata(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();

    let fixture = sample_manga("foo", "Foo Manga", "hash-1");
    let outcome =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
            .await
            .unwrap();
    assert_eq!(outcome.status, UpsertStatus::New);
    let manga_id = outcome.manga_id;

    // The manga row carries the fixture's scalar fields.
    let (title, status, alt_titles): (String, String, Vec<String>) =
        sqlx::query_as("SELECT title, status, alt_titles FROM mangas WHERE id = $1")
            .bind(manga_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(title, "Foo Manga");
    assert_eq!(status, "ongoing");
    assert_eq!(alt_titles, vec!["Alt 1"]);

    // The source link ties the source key to that manga and keeps the hash.
    let (source_id, linked_manga, stored_hash): (String, Uuid, Option<String>) = sqlx::query_as(
        "SELECT source_id, manga_id, metadata_hash FROM manga_sources WHERE source_manga_key = $1",
    )
    .bind("foo")
    .fetch_one(&pool)
    .await
    .unwrap();
    assert_eq!(source_id, "target");
    assert_eq!(linked_manga, manga_id);
    assert_eq!(stored_hash.as_deref(), Some("hash-1"));

    // Many-to-many tables: one author, two genres, one tag.
    let (author_links,): (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM manga_authors WHERE manga_id = $1")
            .bind(manga_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(author_links, 1);

    let (genre_links,): (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM manga_genres WHERE manga_id = $1")
            .bind(manga_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(genre_links, 2, "Action + Fantasy");

    let (tag_links,): (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM manga_tags WHERE manga_id = $1")
            .bind(manga_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(tag_links, 1);
}
/// Upserting the identical fixture twice reports `Unchanged` on the
/// second call and resolves to the same manga id.
#[sqlx::test(migrations = "./migrations")]
async fn second_upsert_with_same_hash_reports_unchanged(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();

    let fixture = sample_manga("foo", "Foo Manga", "hash-1");

    let initial =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
            .await
            .unwrap();
    let repeat =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
            .await
            .unwrap();

    assert_eq!(repeat.status, UpsertStatus::Unchanged);
    assert_eq!(repeat.manga_id, initial.manga_id);
}
#[sqlx::test(migrations = "./migrations")]
async fn upsert_with_changed_hash_updates_fields(pool: PgPool) {
crawler::ensure_source(&pool, "target", "T", "https://x.example")
.await
.unwrap();
let mut m = sample_manga("foo", "Foo Manga", "hash-1");
let first = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
.await
.unwrap();
m.title = "Foo Manga (Revised)".into();
m.status = Some("completed".into());
m.metadata_hash = "hash-2".into();
let second = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
.await
.unwrap();
assert_eq!(second.status, UpsertStatus::Updated);
assert_eq!(second.manga_id, first.manga_id);
let row: (String, String) =
sqlx::query_as("SELECT title, status FROM mangas WHERE id = $1")
.bind(first.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(row.0, "Foo Manga (Revised)");
assert_eq!(row.1, "completed");
}
/// Two consecutive chapter syncs: the first only adds chapters; the
/// second keeps one (with a new title), adds one, and loses one. The
/// returned `ChapterDiff` must count each case, the renamed title must
/// reach `chapters.title`, and the vanished chapter must be soft-dropped.
#[sqlx::test(migrations = "./migrations")]
async fn sync_chapters_adds_new_refreshes_existing_and_drops_vanished(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let fixture = sample_manga("foo", "Foo Manga", "hash-1");
    let manga_id =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
            .await
            .unwrap()
            .manga_id;

    // Pass 1: chapters 1 and 2 are both new.
    let first_listing = vec![
        SourceChapterRef {
            source_chapter_key: "1".into(),
            number: 1,
            title: Some("Ch.1".into()),
            url: "https://x.example/foo/1".into(),
        },
        SourceChapterRef {
            source_chapter_key: "2".into(),
            number: 2,
            title: Some("Ch.2".into()),
            url: "https://x.example/foo/2".into(),
        },
    ];
    let first_diff = crawler::sync_manga_chapters(&pool, "target", manga_id, &first_listing)
        .await
        .unwrap();
    assert_eq!(
        first_diff,
        ChapterDiff {
            new: 2,
            refreshed: 0,
            dropped: 0
        }
    );

    // Pass 2: chapter 1 stays (renamed), chapter 3 appears, and chapter 2
    // is no longer listed, so it should be dropped.
    let second_listing = vec![
        SourceChapterRef {
            source_chapter_key: "1".into(),
            number: 1,
            title: Some("Ch.1 (renamed)".into()),
            url: "https://x.example/foo/1".into(),
        },
        SourceChapterRef {
            source_chapter_key: "3".into(),
            number: 3,
            title: Some("Ch.3".into()),
            url: "https://x.example/foo/3".into(),
        },
    ];
    let second_diff = crawler::sync_manga_chapters(&pool, "target", manga_id, &second_listing)
        .await
        .unwrap();
    assert_eq!(
        second_diff,
        ChapterDiff {
            new: 1,
            refreshed: 1,
            dropped: 1
        }
    );

    // The new title of chapter 1 was written through to `chapters.title`.
    let (stored_title,): (Option<String>,) =
        sqlx::query_as("SELECT c.title FROM chapters c JOIN chapter_sources cs ON cs.chapter_id = c.id WHERE cs.source_chapter_key = '1'")
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(stored_title.as_deref(), Some("Ch.1 (renamed)"));

    // Chapter 2 is soft-dropped: its row remains but `dropped_at` is set.
    let (ch2_dropped_at,): (Option<chrono::DateTime<chrono::Utc>>,) =
        sqlx::query_as("SELECT dropped_at FROM chapter_sources WHERE source_chapter_key = '2'")
            .fetch_one(&pool)
            .await
            .unwrap();
    assert!(ch2_dropped_at.is_some(), "ch2 should be soft-dropped");
}
/// Sources in the wild list several chapters under one number
/// (different uploaders, translator notes, re-releases). Since the
/// (manga_id, number) UNIQUE constraint was dropped in 0013, every
/// `SourceChapterRef` gets its own `chapters` row even when the parsed
/// number collides: a chapter is identified by its id, not its number.
#[sqlx::test(migrations = "./migrations")]
async fn sync_chapters_keeps_duplicate_numbered_chapters_as_separate_rows(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let fixture = sample_manga("foo", "Foo Manga", "hash-1");
    let manga_id =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
            .await
            .unwrap()
            .manga_id;

    // Small local constructor so the listing below reads as a table.
    let chapter_ref = |key: &str, number, title: &str, url: &str| SourceChapterRef {
        source_chapter_key: key.into(),
        number,
        title: Some(title.into()),
        url: url.into(),
    };

    // Two separate uploads of Ch.52 (distinct keys and URLs, same parsed
    // number), a notice/hiatus entry that parses to number 0, and a
    // regular chapter at number 1.
    let listing = vec![
        chapter_ref(
            "br_chapter-A",
            52,
            "Ch.52 : Official",
            "https://x.example/foo/A/pg-1/",
        ),
        chapter_ref(
            "br_chapter-B",
            52,
            "Ch.52 : Official (alt)",
            "https://x.example/foo/B/pg-1/",
        ),
        chapter_ref(
            "br_chapter-NOTICE",
            0,
            "hitaus.",
            "https://x.example/foo/notice/pg-1/",
        ),
        chapter_ref(
            "br_chapter-1",
            1,
            "Ch.1 : Official",
            "https://x.example/foo/1/pg-1/",
        ),
    ];

    let diff = crawler::sync_manga_chapters(&pool, "target", manga_id, &listing)
        .await
        .unwrap();
    assert_eq!(
        diff,
        ChapterDiff {
            new: 4,
            refreshed: 0,
            dropped: 0
        },
        "every source ref yields a new chapter row"
    );

    let (total_rows,): (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM chapters WHERE manga_id = $1")
            .bind(manga_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(total_rows, 4, "4 distinct chapter rows even with duplicate numbers");

    let (ch52_rows,): (i64,) = sqlx::query_as(
        "SELECT COUNT(*) FROM chapters WHERE manga_id = $1 AND number = 52",
    )
    .bind(manga_id)
    .fetch_one(&pool)
    .await
    .unwrap();
    assert_eq!(ch52_rows, 2, "both Ch.52 uploads survive as separate rows");
}
/// `mark_dropped_mangas` must flag only the mangas that were not
/// re-upserted after the run-start cutoff.
///
/// The cutoff is read from the database clock (`SELECT NOW()`) instead
/// of the test process clock. The timing note this test has always
/// carried compares the cutoff against the upsert's SQL `NOW()`, so
/// taking both sides from the same clock keeps the test stable when
/// the test host and the Postgres server disagree on wall-clock time
/// (remote DB, container/VM clock drift).
#[sqlx::test(migrations = "./migrations")]
async fn mark_dropped_mangas_only_drops_unseen(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();

    // Seed two mangas before the cutoff so a later run sees them as stale.
    let _ = crawler::upsert_manga_from_source(
        &pool,
        "target",
        "https://x.example/foo",
        &sample_manga("foo", "Foo", "hf"),
    )
    .await
    .unwrap();
    let _ = crawler::upsert_manga_from_source(
        &pool,
        "target",
        "https://x.example/bar",
        &sample_manga("bar", "Bar", "hb"),
    )
    .await
    .unwrap();

    // Mark the beginning of a new "run" using the server's own clock.
    // Only `foo` is re-upserted below, so `bar` should be the one
    // flagged as dropped.
    let (run_started,): (chrono::DateTime<chrono::Utc>,) = sqlx::query_as("SELECT NOW()")
        .fetch_one(&pool)
        .await
        .unwrap();

    // Sleep briefly so the second upsert's NOW() > run_started_at.
    tokio::time::sleep(std::time::Duration::from_millis(20)).await;
    let _ = crawler::upsert_manga_from_source(
        &pool,
        "target",
        "https://x.example/foo",
        &sample_manga("foo", "Foo", "hf"),
    )
    .await
    .unwrap();

    let n = crawler::mark_dropped_mangas(&pool, "target", run_started)
        .await
        .unwrap();
    assert_eq!(n, 1, "only bar should have been dropped");

    let (foo_dropped_at,): (Option<chrono::DateTime<chrono::Utc>>,) =
        sqlx::query_as("SELECT dropped_at FROM manga_sources WHERE source_manga_key = 'foo'")
            .fetch_one(&pool)
            .await
            .unwrap();
    assert!(
        foo_dropped_at.is_none(),
        "foo seen this run, must not be dropped"
    );

    let (bar_dropped_at,): (Option<chrono::DateTime<chrono::Utc>>,) =
        sqlx::query_as("SELECT dropped_at FROM manga_sources WHERE source_manga_key = 'bar'")
            .fetch_one(&pool)
            .await
            .unwrap();
    assert!(bar_dropped_at.is_some());
}
/// The upsert result exposes the manga's stored `cover_image_path`:
/// absent for a brand-new row, and present on a later `Unchanged`
/// upsert once a cover path has been written to the row.
#[sqlx::test(migrations = "./migrations")]
async fn upsert_surfaces_cover_image_path_for_backfill_decisions(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let fixture = sample_manga("foo", "Foo", "h1");

    // Fresh row: nothing has been stored as a cover yet.
    let before =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
            .await
            .unwrap();
    assert!(before.cover_image_path.is_none(), "new manga has no cover yet");

    // Pretend the cover was saved to storage after that first upsert.
    sqlx::query("UPDATE mangas SET cover_image_path = $1 WHERE id = $2")
        .bind("mangas/foo/cover.jpg")
        .bind(before.manga_id)
        .execute(&pool)
        .await
        .unwrap();

    // Same hash again: status is Unchanged, yet the stored cover path is
    // reported back so the caller can tell the backfill already happened.
    let after =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
            .await
            .unwrap();
    assert_eq!(after.status, UpsertStatus::Unchanged);
    assert_eq!(
        after.cover_image_path.as_deref(),
        Some("mangas/foo/cover.jpg")
    );
}
#[sqlx::test(migrations = "./migrations")]
async fn arbitrary_genres_from_source_get_inserted(pool: PgPool) {
crawler::ensure_source(&pool, "target", "T", "https://x.example")
.await
.unwrap();
let mut m = sample_manga("foo", "Foo", "h");
// "Action" is seeded by migration 0009. "Webtoons" is not.
m.genres = vec!["Action".into(), "Webtoons".into()];
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
.await
.unwrap();
let n_genre_links: (i64,) =
sqlx::query_as("SELECT COUNT(*) FROM manga_genres WHERE manga_id = $1")
.bind(up.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(n_genre_links.0, 2, "both seeded and source-added genres attach");
let webtoons: (i64,) =
sqlx::query_as("SELECT COUNT(*) FROM genres WHERE name = 'Webtoons'")
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(webtoons.0, 1, "non-seeded genre was inserted");
// Case-insensitive de-dup: a second sync with the genre re-cased
// attaches the existing row, not a new one.
let mut m2 = sample_manga("bar", "Bar", "h2");
m2.genres = vec!["webtoons".into()];
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/bar", &m2)
.await
.unwrap();
let webtoons_count: (i64,) =
sqlx::query_as("SELECT COUNT(*) FROM genres WHERE lower(name) = 'webtoons'")
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(webtoons_count.0, 1, "case-insensitive lookup reuses the existing row");
}
/// User-attached tags (rows with non-NULL `added_by` in `manga_tags`)
/// must survive a crawler upsert. The crawler owns source-attached tags
/// (added_by IS NULL); user attachments are owned by the user who made
/// them and the recurring metadata pass must not delete them.
#[sqlx::test(migrations = "./migrations")]
async fn sync_tags_preserves_user_attached_tags(pool: PgPool) {
crawler::ensure_source(&pool, "target", "T", "https://x.example")
.await
.unwrap();
let m = sample_manga("foo", "Foo Manga", "hash-1");
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
.await
.unwrap();
// A real user attaches a personal tag.
let user = mangalord::repo::user::create(&pool, "alice", "phc-stub")
.await
.unwrap();
let outcome = mangalord::repo::tag::attach_to_manga(&pool, up.manga_id, "personal", user.id)
.await
.unwrap();
assert!(outcome.created_attachment);
// Second crawler pass. Use a different metadata_hash so the upsert
// takes the Updated branch, but the bug also fires on Unchanged
// ticks since sync_tags runs unconditionally.
let mut m2 = m.clone();
m2.metadata_hash = "hash-2".into();
m2.tags = vec!["popular".into(), "weekly".into()];
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m2)
.await
.unwrap();
// The user tag must still be attached.
let user_tag_rows: (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM manga_tags mt \
JOIN tags t ON t.id = mt.tag_id \
WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \
AND mt.added_by = $2",
)
.bind(up.manga_id)
.bind(user.id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(
user_tag_rows.0, 1,
"user-attached tag must survive a crawler upsert"
);
// The source's tags should still attach as well, as crawler-owned.
let source_tag_rows: (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM manga_tags mt \
JOIN tags t ON t.id = mt.tag_id \
WHERE mt.manga_id = $1 \
AND mt.added_by IS NULL \
AND lower(t.name) IN ('popular', 'weekly')",
)
.bind(up.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(source_tag_rows.0, 2, "source tags re-attach on each pass");
// A subsequent pass where the source drops a previously-seen tag
// must clear that crawler-owned attachment (otherwise crawler-tags
// would only ever accumulate).
let mut m3 = m2.clone();
m3.metadata_hash = "hash-3".into();
m3.tags = vec!["popular".into()];
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m3)
.await
.unwrap();
let weekly_rows: (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM manga_tags mt \
JOIN tags t ON t.id = mt.tag_id \
WHERE mt.manga_id = $1 AND lower(t.name) = 'weekly'",
)
.bind(up.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(weekly_rows.0, 0, "source-owned tag dropped by source goes away");
// And the user tag still survives that third pass.
let user_tag_rows: (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM manga_tags mt \
JOIN tags t ON t.id = mt.tag_id \
WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \
AND mt.added_by = $2",
)
.bind(up.manga_id)
.bind(user.id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(user_tag_rows.0, 1);
}
/// `manga_tags.added_by` is `ON DELETE SET NULL` on the user FK. When
/// the attaching user is deleted, their attachments become orphans
/// indistinguishable from crawler-owned rows — and the crawler should
/// reap them on the next pass. Pins the semantic so a future change
/// can't quietly leave orphan rows lying around.
#[sqlx::test(migrations = "./migrations")]
async fn sync_tags_garbage_collects_orphan_user_attachments(pool: PgPool) {
crawler::ensure_source(&pool, "target", "T", "https://x.example")
.await
.unwrap();
let m = sample_manga("foo", "Foo", "hash-1");
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
.await
.unwrap();
// A user attaches "personal", then the user gets deleted. The
// attachment row stays (manga_tags.manga_id FK is CASCADE on
// mangas only; we never CASCADE-delete user attachments). The FK
// on added_by is `ON DELETE SET NULL`, so the row's owner column
// goes NULL — same shape as a crawler-owned row.
let user = mangalord::repo::user::create(&pool, "bob", "phc-stub")
.await
.unwrap();
let _ = mangalord::repo::tag::attach_to_manga(&pool, up.manga_id, "personal", user.id)
.await
.unwrap();
sqlx::query("DELETE FROM users WHERE id = $1")
.bind(user.id)
.execute(&pool)
.await
.unwrap();
// Sanity: the orphan still exists post-user-delete with added_by NULL.
let (orphan_rows,): (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM manga_tags mt \
JOIN tags t ON t.id = mt.tag_id \
WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \
AND mt.added_by IS NULL",
)
.bind(up.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(orphan_rows, 1);
// Next crawler pass — orphan should be reaped along with any
// other source-owned rows that aren't in the new tag list.
let mut m2 = m.clone();
m2.metadata_hash = "hash-2".into();
m2.tags = vec!["popular".into()];
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m2)
.await
.unwrap();
let (orphan_rows,): (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM manga_tags mt \
JOIN tags t ON t.id = mt.tag_id \
WHERE mt.manga_id = $1 AND lower(t.name) = 'personal'",
)
.bind(up.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(orphan_rows, 0, "orphan user-attached tag should be reaped");
}
/// A source link that was marked dropped becomes active again
/// (`dropped_at` back to NULL, same manga id) once the manga is
/// upserted from the source another time.
#[sqlx::test(migrations = "./migrations")]
async fn re_appearing_manga_clears_dropped_at(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    let fixture = sample_manga("foo", "Foo", "h1");
    let created =
        crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
            .await
            .unwrap();

    // Flag the link as dropped by hand.
    sqlx::query(
        "UPDATE manga_sources SET dropped_at = NOW() WHERE source_manga_key = 'foo'",
    )
    .execute(&pool)
    .await
    .unwrap();

    // Seeing the manga again must clear the dropped marker.
    let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &fixture)
        .await
        .unwrap();

    let (dropped_at, linked_manga): (Option<chrono::DateTime<chrono::Utc>>, Uuid) =
        sqlx::query_as(
            "SELECT dropped_at, manga_id FROM manga_sources WHERE source_manga_key = 'foo'",
        )
        .fetch_one(&pool)
        .await
        .unwrap();
    assert!(dropped_at.is_none());
    assert_eq!(linked_manga, created.manga_id);
}