fix(crawler): sentinel-gate parse_chapter_list to stop false drops (0.35.2)
parse_chapter_list previously returned Vec::new() on any selector miss. The empty list flowed into sync_manga_chapters, whose soft-drop branch then flipped every existing chapter's dropped_at to NOW(). Bookmarks subsequently pointed at dropped sources, and enqueue_bookmarked_pending (which filters on cs.dropped_at IS NULL) silently stopped re-fetching pages.

Same shape as the walker race fixed in 0.35.1: a transient parse miss masquerading as "source removed everything" → false soft-drop.

Fix: require #chapter_table in the DOM. Present-but-empty is preserved as Ok(vec![]) so a freshly added manga with no published chapters still parses cleanly. An absent table now yields a Transient error — the job system reschedules with backoff instead of treating the partial render as data.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -444,7 +444,7 @@ fn parse_manga_detail(
|
||||
.collect();
|
||||
|
||||
let chapters = if include_chapters {
|
||||
parse_chapter_list(&doc)
|
||||
parse_chapter_list(&doc)?
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
@@ -502,9 +502,22 @@ fn strip_tag_count(s: &str) -> String {
|
||||
trimmed.to_string()
|
||||
}
|
||||
|
||||
fn parse_chapter_list(doc: &scraper::Html) -> Vec<SourceChapterRef> {
|
||||
/// Parse the chapter table on a manga detail page. Returns `Transient` if
|
||||
/// `#chapter_table` isn't in the DOM at all — the table is required even
|
||||
/// for mangas with no published chapters yet (the source renders an empty
|
||||
/// `<table>`), so an absent table signals a partial render (post-load JS
|
||||
/// not done, layout drift) rather than a legitimately empty list. Without
|
||||
/// this sentinel, an empty `Vec` reaches `sync_manga_chapters` and the
|
||||
/// soft-drop branch flips every existing chapter to `dropped_at`.
|
||||
fn parse_chapter_list(doc: &scraper::Html) -> Result<Vec<SourceChapterRef>, PageError> {
|
||||
if !has_chapter_table_sentinel(doc) {
|
||||
return Err(PageError::transient(
|
||||
"manga detail: #chapter_table sentinel missing",
|
||||
));
|
||||
}
|
||||
let sel = scraper::Selector::parse("#chapter_table td h4 a.chico").unwrap();
|
||||
doc.select(&sel)
|
||||
Ok(doc
|
||||
.select(&sel)
|
||||
.filter_map(|a| {
|
||||
let url = a.value().attr("href")?.trim().to_string();
|
||||
if url.is_empty() {
|
||||
@@ -519,7 +532,16 @@ fn parse_chapter_list(doc: &scraper::Html) -> Vec<SourceChapterRef> {
|
||||
url,
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Returns true when the chapter-table container is present in the DOM.
|
||||
/// Source-specific: the target site uses `#chapter_table` as the wrapper
|
||||
/// element. Distinguishes "table is present but empty" (legit edge case
|
||||
/// for new mangas) from "table is missing entirely" (partial render).
|
||||
fn has_chapter_table_sentinel(doc: &scraper::Html) -> bool {
|
||||
let sel = scraper::Selector::parse("#chapter_table").expect("valid selector");
|
||||
doc.select(&sel).next().is_some()
|
||||
}
|
||||
|
||||
fn parse_chapter_number(text: &str) -> Option<i32> {
|
||||
@@ -880,7 +902,7 @@ mod tests {
|
||||
"../../../tests/fixtures/target/chapter_list_uu.html"
|
||||
);
|
||||
let doc = scraper::Html::parse_document(html);
|
||||
let chapters = parse_chapter_list(&doc);
|
||||
let chapters = parse_chapter_list(&doc).expect("fixture has the table");
|
||||
|
||||
assert_eq!(chapters.len(), 15, "every row kept (notices/hiatus included)");
|
||||
|
||||
@@ -1027,9 +1049,17 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn missing_optional_fields_parse_to_none() {
|
||||
// Minimal but well-formed detail page: title is required, every
|
||||
// other field is optional, but the chapter table is structural —
|
||||
// its absence is treated as Transient (a freshly added manga
|
||||
// renders the table empty, not absent). See
|
||||
// `parse_chapter_list_returns_transient_when_table_missing` for
|
||||
// the negative case.
|
||||
let html = r#"<html><body>\
|
||||
<header><div id="logo">Target</div></header>\
|
||||
<div class="w-title"><h1>Minimal</h1></div></body></html>"#;
|
||||
<div class="w-title"><h1>Minimal</h1></div>\
|
||||
<table id="chapter_table"></table>\
|
||||
</body></html>"#;
|
||||
let m = parse_manga_detail(html, "min", true).unwrap();
|
||||
assert_eq!(m.title, "Minimal");
|
||||
assert!(m.summary.is_none());
|
||||
@@ -1222,4 +1252,64 @@ mod tests {
|
||||
assert_eq!(displaced[0].source_manga_key, "X");
|
||||
assert_eq!(outcome, DisplacementOutcome::Shifted(1));
|
||||
}
|
||||
|
||||
#[test]
fn parse_chapter_list_returns_transient_when_table_missing() {
    // The fixture models a partial render: the page shell is there but the
    // chapter table never made it into the DOM (late JS injection, layout
    // drift, ...). An empty Vec here would make sync_manga_chapters
    // soft-drop every known chapter, so the parser has to answer with a
    // Transient error, which is what the job system retries on.
    let doc = scraper::Html::parse_document(
        r#"<html><body>
<header><div id="logo">Target</div></header>
<div class="w-title"><h1>Test</h1></div>
</body></html>"#,
    );

    let err = parse_chapter_list(&doc).expect_err("expected Transient");

    assert!(err.is_transient(), "got non-transient: {err}");
}
|
||||
|
||||
#[test]
fn parse_chapter_list_ok_empty_when_table_present_but_no_rows() {
    // Models a manga that was just added and has nothing published yet:
    // the `<table id="chapter_table">` wrapper is rendered, but it holds no
    // `<tr>` rows. This has to parse as an empty list so it remains
    // distinguishable from the missing-table (partial render) case.
    let doc = scraper::Html::parse_document(
        r#"<html><body>
<header><div id="logo">Target</div></header>
<table id="chapter_table"></table>
</body></html>"#,
    );

    let parsed = parse_chapter_list(&doc).expect("present table is not transient");

    assert!(parsed.is_empty());
}
|
||||
|
||||
#[test]
fn parse_manga_detail_propagates_chapter_table_transient() {
    // Whole-parser check: the page passes the #logo sentinel and carries
    // a title and cover, but the chapter table has been stripped. With
    // chapters requested, the parser must surface a Transient error rather
    // than hand back a SourceManga whose chapter list is empty.
    let page = r#"<html><body>
<header><div id="logo">Target</div></header>
<div class="w-title"><h1>Test Title</h1></div>
<div class="cover"><img src="/cover.jpg"></div>
<!-- intentionally no #chapter_table -->
</body></html>"#;

    let err = parse_manga_detail(page, "key", true).expect_err("expected Transient");

    assert!(err.is_transient(), "got non-transient: {err}");
}
|
||||
|
||||
#[test]
fn parse_manga_detail_skips_chapter_sentinel_when_include_chapters_false() {
    // In metadata-only mode (`skip_chapters` upstream) the chapter table is
    // not required: pipeline.rs does not call sync_manga_chapters for such
    // mangas, so a missing table cannot cause a false soft-drop and must
    // not be reported as Transient.
    let page = r#"<html><body>
<header><div id="logo">Target</div></header>
<div class="w-title"><h1>Test Title</h1></div>
<div class="cover"><img src="/cover.jpg"></div>
</body></html>"#;

    let parsed = parse_manga_detail(page, "key", false)
        .expect("metadata-only parse must not require chapter table");

    assert!(parsed.chapters.is_empty());
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user