fix(crawler): skip sync when empty chapters + prior > 0 (0.36.6)
The wait_for_selector wait in 0.36.2 narrows the partial-render race window but doesn't close it: a render that takes longer than SELECTOR_TIMEOUT (10s) still hands an empty Vec to sync_manga_chapters, and the soft-drop branch flips every existing chapter to dropped_at. The next tick recovers but a manga's reader briefly stops working in between. Close it at the pipeline level. Between fetch_manga and the upsert/sync, if the parsed chapter list is empty and the prior live count for (source_id, source_manga_key) is > 0, treat the fetch as a transient failure: log, bump mangas_failed, skip upsert + sync + the seen.insert so a later batch / tick retries. Brand-new mangas with genuinely zero chapters (prior == 0) pass through unchanged. New repo helper repo::crawler::live_chapter_count_for_source_manga joins chapters → chapter_sources → manga_sources with dropped_at IS NULL — same lockstep as dispatch_target and the enqueue queries. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -215,6 +215,48 @@ pub async fn run_metadata_pass(
|
||||
}
|
||||
};
|
||||
|
||||
// Partial-render guard: an empty chapter list paired with a
|
||||
// prior count > 0 is overwhelmingly a chromium snapshot
|
||||
// taken between the #chapter_table wrapper render and its
|
||||
// rows render. The wait_for_selector wait in `navigate`
|
||||
// narrows this window but cannot close it for slow renders
|
||||
// beyond the selector budget. Treat as a transient failure
|
||||
// here — skip upsert, skip seen.insert — so the next batch
|
||||
// (or the next tick) retries. Skipped in `skip_chapters`
|
||||
// mode because the parser is configured to return an empty
|
||||
// Vec by design there.
|
||||
if !skip_chapters && manga.chapters.is_empty() {
|
||||
match repo::crawler::live_chapter_count_for_source_manga(
|
||||
db, source_id, &r.source_manga_key,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(prior) if prior > 0 => {
|
||||
tracing::warn!(
|
||||
key = %r.source_manga_key,
|
||||
url = %r.url,
|
||||
prior_chapter_count = prior,
|
||||
"fetch_manga returned empty chapters but prior count > 0; treating as partial-render transient and skipping"
|
||||
);
|
||||
stats.mangas_failed += 1;
|
||||
continue;
|
||||
}
|
||||
Ok(_) => {}
|
||||
Err(e) => {
|
||||
// DB lookup failed — fail safe: skip rather
|
||||
// than risk a soft-drop on a manga whose prior
|
||||
// count we couldn't confirm.
|
||||
tracing::warn!(
|
||||
key = %r.source_manga_key,
|
||||
error = ?e,
|
||||
"live_chapter_count_for_source_manga failed; skipping cautiously"
|
||||
);
|
||||
stats.mangas_failed += 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let upsert = match repo::crawler::upsert_manga_from_source(
|
||||
db, source_id, &r.url, &manga,
|
||||
)
|
||||
|
||||
@@ -458,6 +458,44 @@ pub async fn sync_manga_chapters(
|
||||
Ok(diff)
|
||||
}
|
||||
|
||||
/// Count the chapters that the source `(source_id, source_manga_key)`
|
||||
/// is currently known to attach to — i.e. the number of `chapter_sources`
|
||||
/// rows for the manga identified by the (source_id, source_manga_key)
|
||||
/// pair, restricted to live (`dropped_at IS NULL`) rows.
|
||||
///
|
||||
/// Used by the metadata pass's partial-render guard: if `fetch_manga`
|
||||
/// returns an empty `chapters` Vec but the source previously surfaced
|
||||
/// chapters here, that's most likely a chromium snapshot taken between
|
||||
/// the `#chapter_table` wrapper render and its rows render — the
|
||||
/// safest move is to skip `sync_manga_chapters` so the soft-drop
|
||||
/// branch doesn't flip every existing chapter to `dropped_at`.
|
||||
///
|
||||
/// Returns `Ok(0)` when the manga is brand-new (no `manga_sources`
|
||||
/// row yet), which is the legitimate "this manga has no chapters yet"
|
||||
/// case and must NOT be flagged.
|
||||
pub async fn live_chapter_count_for_source_manga(
|
||||
pool: &PgPool,
|
||||
source_id: &str,
|
||||
source_manga_key: &str,
|
||||
) -> sqlx::Result<i64> {
|
||||
let row: Option<(i64,)> = sqlx::query_as(
|
||||
"SELECT COUNT(*) \
|
||||
FROM chapter_sources cs \
|
||||
JOIN chapters c ON c.id = cs.chapter_id \
|
||||
JOIN manga_sources ms \
|
||||
ON ms.manga_id = c.manga_id \
|
||||
AND ms.source_id = cs.source_id \
|
||||
WHERE ms.source_id = $1 \
|
||||
AND ms.source_manga_key = $2 \
|
||||
AND cs.dropped_at IS NULL",
|
||||
)
|
||||
.bind(source_id)
|
||||
.bind(source_manga_key)
|
||||
.fetch_optional(pool)
|
||||
.await?;
|
||||
Ok(row.map(|(n,)| n).unwrap_or(0))
|
||||
}
|
||||
|
||||
/// Mark a metadata pass as in-flight for `source_id`. Stamps
|
||||
/// `last_run_completed:<source_id>` in `crawler_state` with
|
||||
/// `{"completed": false, "at": now}`. A crash, panic, or SIGKILL after
|
||||
|
||||
Reference in New Issue
Block a user