From 655ea42731f183549f820dc788603a8e23723e1c Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sat, 30 May 2026 20:03:45 +0200 Subject: [PATCH] fix(crawler): scope dispatch_target to live sources, newest first (0.36.4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chapter dispatcher's URL resolver had no dropped_at filter and no ORDER BY — a chapter whose only chapter_sources row had been soft- dropped was still dispatched against the stale URL, eating retry budget on guaranteed transients. With multiple live sources the LIMIT 1 winner was nondeterministic. Add `AND cs.dropped_at IS NULL` and `ORDER BY cs.last_seen_at DESC` to dispatch_target, bringing it in lockstep with the enqueue queries in pipeline.rs that already filter on dropped_at. Returns None when all sources are dropped — callers in daemon.rs already treat None as "ack the job, skip the work." Tests in tests/repo_chapter.rs cover the three branches (freshest live wins, dropped sources skipped, all-dropped returns None). Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/Cargo.lock | 2 +- backend/Cargo.toml | 2 +- backend/src/repo/chapter.rs | 21 ++++- backend/tests/repo_chapter.rs | 162 ++++++++++++++++++++++++++++++++++ frontend/package.json | 2 +- 5 files changed, 182 insertions(+), 7 deletions(-) create mode 100644 backend/tests/repo_chapter.rs diff --git a/backend/Cargo.lock b/backend/Cargo.lock index c14f3de..acf4894 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.36.3" +version = "0.36.4" dependencies = [ "anyhow", "argon2", diff --git a/backend/Cargo.toml b/backend/Cargo.toml index 189c027..6b62031 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.36.3" +version = "0.36.4" edition = "2021" default-run = "mangalord" diff --git a/backend/src/repo/chapter.rs b/backend/src/repo/chapter.rs index 97d1732..e5fddd0 100644 --- a/backend/src/repo/chapter.rs +++ b/backend/src/repo/chapter.rs @@ -118,10 +118,21 @@ pub async fn page_count(pool: &PgPool, id: Uuid) -> sqlx::Result> { .await } -/// Look up the manga_id + most recent source_url for a chapter. Used -/// by the daemon's chapter dispatcher to resolve the URL it needs to -/// hand to `content::sync_chapter_content`. Returns `None` if the -/// chapter (or its source row) is gone. +/// Look up the manga_id + most recent live source_url for a chapter. +/// Used by the daemon's chapter dispatcher to resolve the URL it needs +/// to hand to `content::sync_chapter_content`. +/// +/// Skips soft-dropped sources (`cs.dropped_at IS NOT NULL`) and breaks +/// ties between multiple live sources by `last_seen_at DESC`, so the +/// freshest still-attached URL wins. Returns `None` when the chapter +/// is gone or all its source rows are dropped — callers in the +/// dispatcher treat `None` as "ack the job, skip the work." +/// +/// The enqueue queries (`pipeline::enqueue_bookmarked_pending` and +/// `enqueue_pending_for_manga`) apply the same `dropped_at IS NULL` +/// filter — this resolver stays in lockstep so a chapter that was +/// dropped between enqueue and lease isn't dispatched against a stale +/// URL. pub async fn dispatch_target( pool: &PgPool, chapter_id: Uuid, @@ -131,6 +142,8 @@ pub async fn dispatch_target( FROM chapters c \ JOIN chapter_sources cs ON cs.chapter_id = c.id \ WHERE c.id = $1 \ + AND cs.dropped_at IS NULL \ + ORDER BY cs.last_seen_at DESC \ LIMIT 1", ) .bind(chapter_id) diff --git a/backend/tests/repo_chapter.rs b/backend/tests/repo_chapter.rs new file mode 100644 index 0000000..1ed9f05 --- /dev/null +++ b/backend/tests/repo_chapter.rs @@ -0,0 +1,162 @@ +//! Integration tests for `repo::chapter` — focused on +//! `dispatch_target`, the resolver the daemon's chapter dispatcher +//! uses to look up the URL it needs to hand to +//! `content::sync_chapter_content`. +//! +//! The query must: +//! 1. Skip `chapter_sources` rows where `dropped_at IS NOT NULL` — +//! otherwise a soft-dropped source URL is dispatched as if live and +//! burns the chapter's retry budget against guaranteed transients. +//! 2. Order the remaining rows by `last_seen_at DESC` so the freshest +//! surviving source is the one we'll fetch from. +//! +//! The fix lives in `backend/src/repo/chapter.rs:dispatch_target`. The +//! enqueue queries at `pipeline.rs:381` and `:435` already filter on +//! `cs.dropped_at IS NULL`; this brings the resolver into line. + +use mangalord::crawler::source::{SourceChapterRef, SourceManga}; +use mangalord::repo::{ + chapter::dispatch_target, + crawler::{ensure_source, sync_manga_chapters, upsert_manga_from_source}, +}; +use sqlx::PgPool; +use uuid::Uuid; + +fn sample_manga(key: &str, title: &str, hash: &str) -> SourceManga { + SourceManga { + source_manga_key: key.to_string(), + title: title.to_string(), + alternative_titles: vec![], + authors: vec![], + genres: vec![], + tags: vec![], + status: None, + summary: None, + cover_url: None, + chapters: vec![], + metadata_hash: hash.to_string(), + } +} + +/// Seed a manga with one chapter, plus a second `chapter_sources` row +/// pointing at the same chapter with a *newer* `last_seen_at` so the +/// `ORDER BY cs.last_seen_at DESC` branch of the fixed query can +/// distinguish "freshest live source" from "any live source." +async fn seed_chapter_with_two_live_sources(pool: &PgPool) -> (Uuid, String, String) { + // Two distinct sources both pointing at the same chapter is the + // realistic shape of the multi-source state — each source row is + // keyed (source_id, chapter_id) after migration 0017. + ensure_source(pool, "target", "T", "https://x.example") + .await + .unwrap(); + ensure_source(pool, "mirror", "Mirror", "https://m.example") + .await + .unwrap(); + let m = sample_manga("foo", "Foo Manga", "hash-1"); + let up = upsert_manga_from_source(pool, "target", "https://x.example/foo", &m) + .await + .unwrap(); + let initial = vec![SourceChapterRef { + source_chapter_key: "1".into(), + number: 1, + title: Some("Ch.1".into()), + url: "https://x.example/foo/1/old".into(), + }]; + sync_manga_chapters(pool, "target", up.manga_id, &initial) + .await + .unwrap(); + + let (chapter_id,): (Uuid,) = sqlx::query_as( + "SELECT c.id FROM chapters c \ + JOIN chapter_sources cs ON cs.chapter_id = c.id \ + WHERE cs.source_chapter_key = '1' AND cs.source_id = 'target'", + ) + .fetch_one(pool) + .await + .unwrap(); + + let old_url = "https://x.example/foo/1/old".to_string(); + let new_url = "https://m.example/foo/1/mirror".to_string(); + // Backdate the existing (old/target) source row and add a fresher + // row from the mirror source. The fix uses `last_seen_at DESC` to + // break the tie deterministically. + sqlx::query( + "UPDATE chapter_sources \ + SET last_seen_at = NOW() - INTERVAL '2 days' \ + WHERE chapter_id = $1 AND source_id = 'target'", + ) + .bind(chapter_id) + .execute(pool) + .await + .unwrap(); + sqlx::query( + "INSERT INTO chapter_sources \ + (source_id, chapter_id, source_chapter_key, source_url, last_seen_at) \ + VALUES ('mirror', $1, '1', $2, NOW())", + ) + .bind(chapter_id) + .bind(&new_url) + .execute(pool) + .await + .unwrap(); + + (chapter_id, old_url, new_url) +} + +#[sqlx::test(migrations = "./migrations")] +async fn dispatch_target_prefers_most_recent_live_source(pool: PgPool) { + let (chapter_id, _old_url, new_url) = + seed_chapter_with_two_live_sources(&pool).await; + + let row = dispatch_target(&pool, chapter_id).await.unwrap(); + let (_manga_id, source_url) = + row.expect("two live sources should yield a dispatch target"); + assert_eq!( + source_url, new_url, + "ORDER BY last_seen_at DESC LIMIT 1 must return the freshest source" + ); +} + +#[sqlx::test(migrations = "./migrations")] +async fn dispatch_target_skips_dropped_sources(pool: PgPool) { + let (chapter_id, _old_url, new_url) = + seed_chapter_with_two_live_sources(&pool).await; + + // Soft-drop the fresher row. The dispatcher must now return the + // *older* still-live row instead of the dropped one. + sqlx::query( + "UPDATE chapter_sources SET dropped_at = NOW() WHERE source_url = $1", + ) + .bind(&new_url) + .execute(&pool) + .await + .unwrap(); + + let row = dispatch_target(&pool, chapter_id).await.unwrap(); + let (_manga_id, source_url) = + row.expect("a single live source should still yield a dispatch target"); + assert!( + source_url != new_url, + "dispatch_target must not return a dropped source" + ); +} + +#[sqlx::test(migrations = "./migrations")] +async fn dispatch_target_returns_none_when_only_dropped_sources_remain( + pool: PgPool, +) { + let (chapter_id, _old_url, _new_url) = + seed_chapter_with_two_live_sources(&pool).await; + + sqlx::query("UPDATE chapter_sources SET dropped_at = NOW() WHERE chapter_id = $1") + .bind(chapter_id) + .execute(&pool) + .await + .unwrap(); + + let row = dispatch_target(&pool, chapter_id).await.unwrap(); + assert!( + row.is_none(), + "every source is dropped — dispatch_target must return None" + ); +} diff --git a/frontend/package.json b/frontend/package.json index b210564..7430943 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "mangalord-frontend", - "version": "0.36.3", + "version": "0.36.4", "private": true, "type": "module", "scripts": {