- TargetSource: first concrete impl of the Source trait, modeled on
the old Puppeteer crawler's selectors (+ status normalization,
tag-count stripping, chapter list)
- DiscoverMode::Backfill walks pagination last->1, reverse within each
page (oldest-first); Incremental walks forward
- RateLimiter (tokio-time aware) plumbed through FetchContext so the
pagination walk honors the same per-host budget as the outer loop
- repo::crawler: ensure_source, upsert_manga_from_source (returns
New/Updated/Unchanged + current cover_image_path for backfill
decisions), sync_manga_chapters, mark_dropped_mangas — all
transactional, with case-insensitive lookups and source-insertable
genres
- Cover image download via reqwest+infer; stored under
mangas/{id}/cover.{ext} via the Storage trait
- Single CRAWLER_PROXY env wires both Chromium (--proxy-server) and
reqwest::Proxy::all (HTTP/HTTPS/SOCKS5)
- Crawler binary: positional start URL or $CRAWLER_START_URL,
$CRAWLER_LIMIT (cap fetches + skip drop pass on partial runs),
$CRAWLER_SKIP_CHAPTERS (disable selector AND sync), $CRAWLER_RATE_MS
- Silences chromiumoxide 0.7's known CDP deserialize log spam via
default tracing filter + CdpError::Serde downgrade
- 9 sqlx integration tests + 11 selector/rate-limit unit tests
398 lines
14 KiB
Rust
398 lines
14 KiB
Rust
//! Integration tests for `repo::crawler`.
//!
//! Each test runs against a fresh, migrated DB via `#[sqlx::test]`.
//! `DATABASE_URL` must point to a Postgres instance where the test user
//! has the `CREATEDB` privilege.
|
|
|
|
use mangalord::crawler::source::{SourceChapterRef, SourceManga};
|
|
use mangalord::repo::crawler::{self, ChapterDiff, UpsertStatus};
|
|
use sqlx::PgPool;
|
|
use uuid::Uuid;
|
|
|
|
/// Helper to spin up a `SourceManga` fixture with a stable shape so
|
|
/// each test can tweak just the fields it cares about.
|
|
fn sample_manga(key: &str, title: &str, hash: &str) -> SourceManga {
|
|
SourceManga {
|
|
source_manga_key: key.to_string(),
|
|
title: title.to_string(),
|
|
alternative_titles: vec!["Alt 1".into()],
|
|
authors: vec!["Author One".into()],
|
|
// Action is in the seeded `genres` table; Fantasy is too.
|
|
genres: vec!["Action".into(), "Fantasy".into()],
|
|
tags: vec!["popular".into()],
|
|
status: Some("ongoing".into()),
|
|
summary: Some("Sample summary.".into()),
|
|
cover_url: Some("/cover.jpg".into()),
|
|
chapters: vec![],
|
|
metadata_hash: hash.to_string(),
|
|
}
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn ensure_source_is_idempotent(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "Target Site", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
crawler::ensure_source(&pool, "target", "Target Site v2", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM sources WHERE id = 'target'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(count.0, 1);
|
|
let name: (String,) = sqlx::query_as("SELECT name FROM sources WHERE id = 'target'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(name.0, "Target Site v2", "name updates on re-call");
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn first_upsert_inserts_manga_and_links_metadata(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo Manga", "hash-1");
|
|
|
|
let res = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(res.status, UpsertStatus::New);
|
|
|
|
// mangas row created
|
|
let row: (String, String, Vec<String>) =
|
|
sqlx::query_as("SELECT title, status, alt_titles FROM mangas WHERE id = $1")
|
|
.bind(res.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(row.0, "Foo Manga");
|
|
assert_eq!(row.1, "ongoing");
|
|
assert_eq!(row.2, vec!["Alt 1"]);
|
|
|
|
// manga_sources row links the two
|
|
let link: (String, Uuid, Option<String>) = sqlx::query_as(
|
|
"SELECT source_id, manga_id, metadata_hash FROM manga_sources WHERE source_manga_key = $1",
|
|
)
|
|
.bind("foo")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(link.0, "target");
|
|
assert_eq!(link.1, res.manga_id);
|
|
assert_eq!(link.2.as_deref(), Some("hash-1"));
|
|
|
|
// Authors, genres, tags M2M populated
|
|
let n_authors: (i64,) =
|
|
sqlx::query_as("SELECT COUNT(*) FROM manga_authors WHERE manga_id = $1")
|
|
.bind(res.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(n_authors.0, 1);
|
|
let n_genres: (i64,) =
|
|
sqlx::query_as("SELECT COUNT(*) FROM manga_genres WHERE manga_id = $1")
|
|
.bind(res.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(n_genres.0, 2, "Action + Fantasy");
|
|
let n_tags: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM manga_tags WHERE manga_id = $1")
|
|
.bind(res.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(n_tags.0, 1);
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn second_upsert_with_same_hash_reports_unchanged(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo Manga", "hash-1");
|
|
let first = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
let second = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(second.status, UpsertStatus::Unchanged);
|
|
assert_eq!(second.manga_id, first.manga_id);
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn upsert_with_changed_hash_updates_fields(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let mut m = sample_manga("foo", "Foo Manga", "hash-1");
|
|
let first = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
|
|
m.title = "Foo Manga (Revised)".into();
|
|
m.status = Some("completed".into());
|
|
m.metadata_hash = "hash-2".into();
|
|
let second = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
|
|
assert_eq!(second.status, UpsertStatus::Updated);
|
|
assert_eq!(second.manga_id, first.manga_id);
|
|
|
|
let row: (String, String) =
|
|
sqlx::query_as("SELECT title, status FROM mangas WHERE id = $1")
|
|
.bind(first.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(row.0, "Foo Manga (Revised)");
|
|
assert_eq!(row.1, "completed");
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn sync_chapters_adds_new_refreshes_existing_and_drops_vanished(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo Manga", "hash-1");
|
|
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
|
|
let initial = vec![
|
|
SourceChapterRef {
|
|
source_chapter_key: "1".into(),
|
|
number: 1,
|
|
title: Some("Ch.1".into()),
|
|
url: "https://x.example/foo/1".into(),
|
|
},
|
|
SourceChapterRef {
|
|
source_chapter_key: "2".into(),
|
|
number: 2,
|
|
title: Some("Ch.2".into()),
|
|
url: "https://x.example/foo/2".into(),
|
|
},
|
|
];
|
|
let diff = crawler::sync_manga_chapters(&pool, "target", up.manga_id, &initial)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(
|
|
diff,
|
|
ChapterDiff {
|
|
new: 2,
|
|
refreshed: 0,
|
|
dropped: 0
|
|
}
|
|
);
|
|
|
|
// Second run: keep ch1, replace ch2 with ch3 — ch2 should be dropped.
|
|
let second = vec![
|
|
SourceChapterRef {
|
|
source_chapter_key: "1".into(),
|
|
number: 1,
|
|
title: Some("Ch.1 (renamed)".into()),
|
|
url: "https://x.example/foo/1".into(),
|
|
},
|
|
SourceChapterRef {
|
|
source_chapter_key: "3".into(),
|
|
number: 3,
|
|
title: Some("Ch.3".into()),
|
|
url: "https://x.example/foo/3".into(),
|
|
},
|
|
];
|
|
let diff = crawler::sync_manga_chapters(&pool, "target", up.manga_id, &second)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(
|
|
diff,
|
|
ChapterDiff {
|
|
new: 1,
|
|
refreshed: 1,
|
|
dropped: 1
|
|
}
|
|
);
|
|
|
|
// Renamed title propagated to chapters.title
|
|
let title: (Option<String>,) =
|
|
sqlx::query_as("SELECT c.title FROM chapters c JOIN chapter_sources cs ON cs.chapter_id = c.id WHERE cs.source_chapter_key = '1'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(title.0.as_deref(), Some("Ch.1 (renamed)"));
|
|
|
|
// Vanished chapter is soft-dropped (row still exists, dropped_at set).
|
|
let dropped: (Option<chrono::DateTime<chrono::Utc>>,) =
|
|
sqlx::query_as("SELECT dropped_at FROM chapter_sources WHERE source_chapter_key = '2'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert!(dropped.0.is_some(), "ch2 should be soft-dropped");
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn mark_dropped_mangas_only_drops_unseen(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
// Seed two mangas before "now" so a later run_started_at sees them as stale.
|
|
let _ = crawler::upsert_manga_from_source(
|
|
&pool,
|
|
"target",
|
|
"https://x.example/foo",
|
|
&sample_manga("foo", "Foo", "hf"),
|
|
)
|
|
.await
|
|
.unwrap();
|
|
let _ = crawler::upsert_manga_from_source(
|
|
&pool,
|
|
"target",
|
|
"https://x.example/bar",
|
|
&sample_manga("bar", "Bar", "hb"),
|
|
)
|
|
.await
|
|
.unwrap();
|
|
|
|
// Now mark a new "run" beginning. Re-upsert only `foo` — `bar`
|
|
// should be the one flagged dropped.
|
|
let run_started = chrono::Utc::now();
|
|
// Sleep briefly so the second upsert's NOW() > run_started_at.
|
|
tokio::time::sleep(std::time::Duration::from_millis(20)).await;
|
|
let _ = crawler::upsert_manga_from_source(
|
|
&pool,
|
|
"target",
|
|
"https://x.example/foo",
|
|
&sample_manga("foo", "Foo", "hf"),
|
|
)
|
|
.await
|
|
.unwrap();
|
|
|
|
let n = crawler::mark_dropped_mangas(&pool, "target", run_started)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(n, 1, "only bar should have been dropped");
|
|
|
|
let foo_dropped: (Option<chrono::DateTime<chrono::Utc>>,) =
|
|
sqlx::query_as("SELECT dropped_at FROM manga_sources WHERE source_manga_key = 'foo'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert!(foo_dropped.0.is_none(), "foo seen this run, must not be dropped");
|
|
let bar_dropped: (Option<chrono::DateTime<chrono::Utc>>,) =
|
|
sqlx::query_as("SELECT dropped_at FROM manga_sources WHERE source_manga_key = 'bar'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert!(bar_dropped.0.is_some());
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn upsert_surfaces_cover_image_path_for_backfill_decisions(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo", "h1");
|
|
|
|
// First upsert: row is brand new, no cover stored yet.
|
|
let first = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
assert!(first.cover_image_path.is_none(), "new manga has no cover yet");
|
|
|
|
// Simulate cover landing in storage post-upsert.
|
|
sqlx::query("UPDATE mangas SET cover_image_path = $1 WHERE id = $2")
|
|
.bind("mangas/foo/cover.jpg")
|
|
.bind(first.manga_id)
|
|
.execute(&pool)
|
|
.await
|
|
.unwrap();
|
|
|
|
// Second upsert with same hash → Unchanged, but cover path is now
|
|
// surfaced so the caller knows the backfill is done.
|
|
let second = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(second.status, UpsertStatus::Unchanged);
|
|
assert_eq!(
|
|
second.cover_image_path.as_deref(),
|
|
Some("mangas/foo/cover.jpg")
|
|
);
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn arbitrary_genres_from_source_get_inserted(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let mut m = sample_manga("foo", "Foo", "h");
|
|
// "Action" is seeded by migration 0009. "Webtoons" is not.
|
|
m.genres = vec!["Action".into(), "Webtoons".into()];
|
|
|
|
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
|
|
let n_genre_links: (i64,) =
|
|
sqlx::query_as("SELECT COUNT(*) FROM manga_genres WHERE manga_id = $1")
|
|
.bind(up.manga_id)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(n_genre_links.0, 2, "both seeded and source-added genres attach");
|
|
|
|
let webtoons: (i64,) =
|
|
sqlx::query_as("SELECT COUNT(*) FROM genres WHERE name = 'Webtoons'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(webtoons.0, 1, "non-seeded genre was inserted");
|
|
|
|
// Case-insensitive de-dup: a second sync with the genre re-cased
|
|
// attaches the existing row, not a new one.
|
|
let mut m2 = sample_manga("bar", "Bar", "h2");
|
|
m2.genres = vec!["webtoons".into()];
|
|
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/bar", &m2)
|
|
.await
|
|
.unwrap();
|
|
let webtoons_count: (i64,) =
|
|
sqlx::query_as("SELECT COUNT(*) FROM genres WHERE lower(name) = 'webtoons'")
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert_eq!(webtoons_count.0, 1, "case-insensitive lookup reuses the existing row");
|
|
}
|
|
|
|
#[sqlx::test(migrations = "./migrations")]
|
|
async fn re_appearing_manga_clears_dropped_at(pool: PgPool) {
|
|
crawler::ensure_source(&pool, "target", "T", "https://x.example")
|
|
.await
|
|
.unwrap();
|
|
let m = sample_manga("foo", "Foo", "h1");
|
|
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
|
|
// Drop it manually.
|
|
sqlx::query(
|
|
"UPDATE manga_sources SET dropped_at = NOW() WHERE source_manga_key = 'foo'",
|
|
)
|
|
.execute(&pool)
|
|
.await
|
|
.unwrap();
|
|
|
|
// Re-upsert: the link should un-drop.
|
|
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
|
|
.await
|
|
.unwrap();
|
|
let dropped: (Option<chrono::DateTime<chrono::Utc>>, Uuid) = sqlx::query_as(
|
|
"SELECT dropped_at, manga_id FROM manga_sources WHERE source_manga_key = 'foo'",
|
|
)
|
|
.fetch_one(&pool)
|
|
.await
|
|
.unwrap();
|
|
assert!(dropped.0.is_none());
|
|
assert_eq!(dropped.1, up.manga_id);
|
|
}
|