fix(crawler): scope dispatch_target to live sources, newest first (0.36.4)

The chapter dispatcher's URL resolver had no dropped_at filter and no
ORDER BY — a chapter whose only chapter_sources row had been soft-
dropped was still dispatched against the stale URL, eating retry
budget on guaranteed transients. With multiple live sources the LIMIT
1 winner was nondeterministic.

Add `AND cs.dropped_at IS NULL` and `ORDER BY cs.last_seen_at DESC`
to dispatch_target, bringing it in lockstep with the enqueue queries
in pipeline.rs that already filter on dropped_at. Returns None when
all sources are dropped — callers in daemon.rs already treat None
as "ack the job, skip the work."

Tests in tests/repo_chapter.rs cover the three branches (freshest
live wins, dropped sources skipped, all-dropped returns None).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-30 20:03:45 +02:00
parent 70e8a7895c
commit 655ea42731
5 changed files with 182 additions and 7 deletions

2
backend/Cargo.lock generated
View File

@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "mangalord"
version = "0.36.3"
version = "0.36.4"
dependencies = [
"anyhow",
"argon2",

View File

@@ -1,6 +1,6 @@
[package]
name = "mangalord"
version = "0.36.3"
version = "0.36.4"
edition = "2021"
default-run = "mangalord"

View File

@@ -118,10 +118,21 @@ pub async fn page_count(pool: &PgPool, id: Uuid) -> sqlx::Result<Option<i32>> {
.await
}
/// Look up the manga_id + most recent source_url for a chapter. Used
/// by the daemon's chapter dispatcher to resolve the URL it needs to
/// hand to `content::sync_chapter_content`. Returns `None` if the
/// chapter (or its source row) is gone.
/// Look up the manga_id + most recent live source_url for a chapter.
/// Used by the daemon's chapter dispatcher to resolve the URL it needs
/// to hand to `content::sync_chapter_content`.
///
/// Skips soft-dropped sources (`cs.dropped_at IS NOT NULL`) and breaks
/// ties between multiple live sources by `last_seen_at DESC`, so the
/// freshest still-attached URL wins. Returns `None` when the chapter
/// is gone or all its source rows are dropped — callers in the
/// dispatcher treat `None` as "ack the job, skip the work."
///
/// The enqueue queries (`pipeline::enqueue_bookmarked_pending` and
/// `enqueue_pending_for_manga`) apply the same `dropped_at IS NULL`
/// filter — this resolver stays in lockstep so a chapter that was
/// dropped between enqueue and lease isn't dispatched against a stale
/// URL.
pub async fn dispatch_target(
pool: &PgPool,
chapter_id: Uuid,
@@ -131,6 +142,8 @@ pub async fn dispatch_target(
FROM chapters c \
JOIN chapter_sources cs ON cs.chapter_id = c.id \
WHERE c.id = $1 \
AND cs.dropped_at IS NULL \
ORDER BY cs.last_seen_at DESC \
LIMIT 1",
)
.bind(chapter_id)

View File

@@ -0,0 +1,162 @@
//! Integration tests for `repo::chapter` — focused on
//! `dispatch_target`, the resolver the daemon's chapter dispatcher
//! uses to look up the URL it needs to hand to
//! `content::sync_chapter_content`.
//!
//! The query must:
//! 1. Skip `chapter_sources` rows where `dropped_at IS NOT NULL` —
//! otherwise a soft-dropped source URL is dispatched as if live and
//! burns the chapter's retry budget against guaranteed transients.
//! 2. Order the remaining rows by `last_seen_at DESC` so the freshest
//! surviving source is the one we'll fetch from.
//!
//! The fix lives in `backend/src/repo/chapter.rs:dispatch_target`. The
//! enqueue queries at `pipeline.rs:381` and `:435` already filter on
//! `cs.dropped_at IS NULL`; this brings the resolver into line.
use mangalord::crawler::source::{SourceChapterRef, SourceManga};
use mangalord::repo::{
chapter::dispatch_target,
crawler::{ensure_source, sync_manga_chapters, upsert_manga_from_source},
};
use sqlx::PgPool;
use uuid::Uuid;
fn sample_manga(key: &str, title: &str, hash: &str) -> SourceManga {
SourceManga {
source_manga_key: key.to_string(),
title: title.to_string(),
alternative_titles: vec![],
authors: vec![],
genres: vec![],
tags: vec![],
status: None,
summary: None,
cover_url: None,
chapters: vec![],
metadata_hash: hash.to_string(),
}
}
/// Seed a manga with one chapter, plus a second `chapter_sources` row
/// pointing at the same chapter with a *newer* `last_seen_at` so the
/// `ORDER BY cs.last_seen_at DESC` branch of the fixed query can
/// distinguish "freshest live source" from "any live source."
async fn seed_chapter_with_two_live_sources(pool: &PgPool) -> (Uuid, String, String) {
// Two distinct sources both pointing at the same chapter is the
// realistic shape of the multi-source state — each source row is
// keyed (source_id, chapter_id) after migration 0017.
ensure_source(pool, "target", "T", "https://x.example")
.await
.unwrap();
ensure_source(pool, "mirror", "Mirror", "https://m.example")
.await
.unwrap();
let m = sample_manga("foo", "Foo Manga", "hash-1");
let up = upsert_manga_from_source(pool, "target", "https://x.example/foo", &m)
.await
.unwrap();
let initial = vec![SourceChapterRef {
source_chapter_key: "1".into(),
number: 1,
title: Some("Ch.1".into()),
url: "https://x.example/foo/1/old".into(),
}];
sync_manga_chapters(pool, "target", up.manga_id, &initial)
.await
.unwrap();
let (chapter_id,): (Uuid,) = sqlx::query_as(
"SELECT c.id FROM chapters c \
JOIN chapter_sources cs ON cs.chapter_id = c.id \
WHERE cs.source_chapter_key = '1' AND cs.source_id = 'target'",
)
.fetch_one(pool)
.await
.unwrap();
let old_url = "https://x.example/foo/1/old".to_string();
let new_url = "https://m.example/foo/1/mirror".to_string();
// Backdate the existing (old/target) source row and add a fresher
// row from the mirror source. The fix uses `last_seen_at DESC` to
// break the tie deterministically.
sqlx::query(
"UPDATE chapter_sources \
SET last_seen_at = NOW() - INTERVAL '2 days' \
WHERE chapter_id = $1 AND source_id = 'target'",
)
.bind(chapter_id)
.execute(pool)
.await
.unwrap();
sqlx::query(
"INSERT INTO chapter_sources \
(source_id, chapter_id, source_chapter_key, source_url, last_seen_at) \
VALUES ('mirror', $1, '1', $2, NOW())",
)
.bind(chapter_id)
.bind(&new_url)
.execute(pool)
.await
.unwrap();
(chapter_id, old_url, new_url)
}
#[sqlx::test(migrations = "./migrations")]
async fn dispatch_target_prefers_most_recent_live_source(pool: PgPool) {
let (chapter_id, _old_url, new_url) =
seed_chapter_with_two_live_sources(&pool).await;
let row = dispatch_target(&pool, chapter_id).await.unwrap();
let (_manga_id, source_url) =
row.expect("two live sources should yield a dispatch target");
assert_eq!(
source_url, new_url,
"ORDER BY last_seen_at DESC LIMIT 1 must return the freshest source"
);
}
#[sqlx::test(migrations = "./migrations")]
async fn dispatch_target_skips_dropped_sources(pool: PgPool) {
let (chapter_id, _old_url, new_url) =
seed_chapter_with_two_live_sources(&pool).await;
// Soft-drop the fresher row. The dispatcher must now return the
// *older* still-live row instead of the dropped one.
sqlx::query(
"UPDATE chapter_sources SET dropped_at = NOW() WHERE source_url = $1",
)
.bind(&new_url)
.execute(&pool)
.await
.unwrap();
let row = dispatch_target(&pool, chapter_id).await.unwrap();
let (_manga_id, source_url) =
row.expect("a single live source should still yield a dispatch target");
assert!(
source_url != new_url,
"dispatch_target must not return a dropped source"
);
}
#[sqlx::test(migrations = "./migrations")]
async fn dispatch_target_returns_none_when_only_dropped_sources_remain(
pool: PgPool,
) {
let (chapter_id, _old_url, _new_url) =
seed_chapter_with_two_live_sources(&pool).await;
sqlx::query("UPDATE chapter_sources SET dropped_at = NOW() WHERE chapter_id = $1")
.bind(chapter_id)
.execute(&pool)
.await
.unwrap();
let row = dispatch_target(&pool, chapter_id).await.unwrap();
assert!(
row.is_none(),
"every source is dropped — dispatch_target must return None"
);
}

View File

@@ -1,6 +1,6 @@
{
"name": "mangalord-frontend",
"version": "0.36.3",
"version": "0.36.4",
"private": true,
"type": "module",
"scripts": {