fix(crawler): sentinel-gate parse_chapter_list to stop false drops (0.35.2)

parse_chapter_list previously returned Vec::new() on any selector
miss. The empty list flowed into sync_manga_chapters, whose soft-drop
branch then flipped every existing chapter's dropped_at to NOW().
Bookmarks subsequently pointed at dropped sources, and
enqueue_bookmarked_pending (filters on cs.dropped_at IS NULL) silently
stopped re-fetching pages.

Same shape as the walker race fixed in 0.35.1: a transient parse miss
masquerading as "source removed everything" → false soft-drop.

Fix: require #chapter_table in the DOM. Present-but-empty is preserved
as Ok(vec![]) so a freshly added manga with no published chapters
still parses cleanly. Absent table is now Transient — the job system
reschedules with backoff instead of treating the partial render as
data.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-29 20:41:47 +02:00
parent dea9b1aaa8
commit c686d6eb51
3 changed files with 98 additions and 8 deletions

View File

@@ -1,6 +1,6 @@
[package]
name = "mangalord"
version = "0.35.1"
version = "0.35.2"
edition = "2021"
default-run = "mangalord"

View File

@@ -444,7 +444,7 @@ fn parse_manga_detail(
.collect();
let chapters = if include_chapters {
parse_chapter_list(&doc)
parse_chapter_list(&doc)?
} else {
Vec::new()
};
@@ -502,9 +502,22 @@ fn strip_tag_count(s: &str) -> String {
trimmed.to_string()
}
fn parse_chapter_list(doc: &scraper::Html) -> Vec<SourceChapterRef> {
/// Parse the chapter table on a manga detail page. Returns `Transient` if
/// `#chapter_table` isn't in the DOM at all — the table is required even
/// for mangas with no published chapters yet (the source renders an empty
/// `<table>`), so an absent table signals a partial render (post-load JS
/// not done, layout drift) rather than a legitimately empty list. Without
/// this sentinel, an empty `Vec` reaches `sync_manga_chapters` and the
/// soft-drop branch flips every existing chapter to `dropped_at`.
fn parse_chapter_list(doc: &scraper::Html) -> Result<Vec<SourceChapterRef>, PageError> {
if !has_chapter_table_sentinel(doc) {
return Err(PageError::transient(
"manga detail: #chapter_table sentinel missing",
));
}
let sel = scraper::Selector::parse("#chapter_table td h4 a.chico").unwrap();
doc.select(&sel)
Ok(doc
.select(&sel)
.filter_map(|a| {
let url = a.value().attr("href")?.trim().to_string();
if url.is_empty() {
@@ -519,7 +532,16 @@ fn parse_chapter_list(doc: &scraper::Html) -> Vec<SourceChapterRef> {
url,
})
})
.collect()
.collect())
}
/// Reports whether the document contains an element matching `#chapter_table`.
///
/// Source-specific sentinel: the target site is expected to wrap its chapter
/// rows in that element. A table that is present but has no rows (a legit
/// edge case for new mangas) still yields `true`; only a document with no
/// such element at all (partial render) yields `false`.
fn has_chapter_table_sentinel(doc: &scraper::Html) -> bool {
    // The selector text is a fixed literal, so a parse failure here would be
    // a programming error rather than a runtime condition.
    let table_selector = scraper::Selector::parse("#chapter_table").expect("valid selector");
    // Only existence matters: pull at most one match from the iterator.
    let mut matches = doc.select(&table_selector);
    matches.next().is_some()
}
fn parse_chapter_number(text: &str) -> Option<i32> {
@@ -880,7 +902,7 @@ mod tests {
"../../../tests/fixtures/target/chapter_list_uu.html"
);
let doc = scraper::Html::parse_document(html);
let chapters = parse_chapter_list(&doc);
let chapters = parse_chapter_list(&doc).expect("fixture has the table");
assert_eq!(chapters.len(), 15, "every row kept (notices/hiatus included)");
@@ -1027,9 +1049,17 @@ mod tests {
#[test]
fn missing_optional_fields_parse_to_none() {
// Minimal but well-formed detail page: title is required, every
// other field is optional, but the chapter table is structural —
// its absence is treated as Transient (a freshly added manga
// renders the table empty, not absent). See
// `parse_chapter_list_returns_transient_when_table_missing` for
// the negative case.
let html = r#"<html><body>\
<header><div id="logo">Target</div></header>\
<div class="w-title"><h1>Minimal</h1></div></body></html>"#;
<div class="w-title"><h1>Minimal</h1></div>\
<table id="chapter_table"></table>\
</body></html>"#;
let m = parse_manga_detail(html, "min", true).unwrap();
assert_eq!(m.title, "Minimal");
assert!(m.summary.is_none());
@@ -1222,4 +1252,64 @@ mod tests {
assert_eq!(displaced[0].source_manga_key, "X");
assert_eq!(outcome, DisplacementOutcome::Shifted(1));
}
#[test]
fn parse_chapter_list_returns_transient_when_table_missing() {
    // Models a partial render: the page has its header and title but no
    // `#chapter_table` element (e.g. post-load JS hadn't injected it yet, or
    // the layout drifted). An empty Vec here would let sync_manga_chapters
    // soft-drop every existing chapter, so the parser must instead report a
    // Transient error, which is what the job system retries on.
    let page = r#"<html><body>
<header><div id="logo">Target</div></header>
<div class="w-title"><h1>Test</h1></div>
</body></html>"#;
    let parsed = scraper::Html::parse_document(page);
    let outcome = parse_chapter_list(&parsed);
    let err = outcome.expect_err("expected Transient");
    assert!(err.is_transient(), "got non-transient: {err}");
}
#[test]
fn parse_chapter_list_ok_empty_when_table_present_but_no_rows() {
    // Models a freshly-added manga with no chapters yet: the
    // `<table id="chapter_table">` wrapper is rendered but contains no rows.
    // This must remain distinguishable from a render where the table is
    // missing altogether, so the expected result is Ok with an empty list.
    let page = r#"<html><body>
<header><div id="logo">Target</div></header>
<table id="chapter_table"></table>
</body></html>"#;
    let parsed = scraper::Html::parse_document(page);
    let outcome = parse_chapter_list(&parsed);
    let listed = outcome.expect("present table is not transient");
    assert!(listed.is_empty());
}
#[test]
fn parse_manga_detail_propagates_chapter_table_transient() {
    // End-to-end check through parse_manga_detail: the page carries the
    // #logo element and a title, but the chapter table has been stripped.
    // With chapters requested, the parse must fail as Transient at the
    // parser boundary instead of yielding a SourceManga with no chapters.
    let page = r#"<html><body>
<header><div id="logo">Target</div></header>
<div class="w-title"><h1>Test Title</h1></div>
<div class="cover"><img src="/cover.jpg"></div>
<!-- intentionally no #chapter_table -->
</body></html>"#;
    let outcome = parse_manga_detail(page, "key", true);
    let err = outcome.expect_err("expected Transient");
    assert!(err.is_transient(), "got non-transient: {err}");
}
#[test]
fn parse_manga_detail_skips_chapter_sentinel_when_include_chapters_false() {
    // Metadata-only mode (`skip_chapters` upstream): the chapter table must
    // not be required here. pipeline.rs avoids calling sync_manga_chapters
    // for these mangas, so a page without the table is not a correctness
    // problem and must parse successfully rather than surface as Transient.
    let page = r#"<html><body>
<header><div id="logo">Target</div></header>
<div class="w-title"><h1>Test Title</h1></div>
<div class="cover"><img src="/cover.jpg"></div>
</body></html>"#;
    let outcome = parse_manga_detail(page, "key", false);
    let manga = outcome.expect("metadata-only parse must not require chapter table");
    assert!(manga.chapters.is_empty());
}
}