feat(crawler): reliability fixes — heartbeat, streaming, jitter, timeout, breaker

A1 Lease heartbeat: jobs::renew keeps a long-but-healthy job's lease fresh
so it is never stolen mid-flight nor inflated toward max_attempts.
A2 Stream chapter pages straight to storage (peak memory = one image) and
persist rows + page_count in one short transaction off the network path
(S3-ready); roll back stored blobs on failure via Storage::delete.
A3 ±20% jitter on exponential backoff to avoid a retry thundering herd.
A4 Outer per-dispatch timeout (CRAWLER_JOB_TIMEOUT_SECS, default 600) so a
hung job is acked-failed instead of wedging a worker.
A5 Metadata circuit-breaker (CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES,
default 10): abort a pass on a source outage without marking a clean exit,
so the next tick recovery-sweeps.

Adds CRAWLER_BROWSER_RESTART_THRESHOLD config (used by the upcoming
coordinated browser restart). Bumps version 0.52.0 -> 0.53.0.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-06-03 20:13:17 +02:00
parent 679abae736
commit 7a6815661f
12 changed files with 578 additions and 71 deletions

View File

@@ -65,6 +65,17 @@ pub(crate) fn should_mark_clean_exit(
walked_to_completion || hit_stop_condition
}
/// Circuit-breaker predicate for the metadata walk.
///
/// Returns `true` once the number of back-to-back `fetch_manga` failures
/// (`consecutive`) has reached `threshold`. A `threshold` of 0 switches the
/// breaker off entirely (unbounded — the legacy behaviour), so the function
/// then returns `false` for any failure count.
///
/// When this returns `true` the caller must NOT mark a clean exit; that way
/// the next tick does a recovery sweep over the catalog tail the aborted
/// pass never reached.
///
/// Deliberately a pure function so the rule can be unit-tested without the
/// walker.
pub(crate) fn should_abort_pass(consecutive: u32, threshold: u32) -> bool {
    match threshold {
        // Zero means "breaker disabled": never abort.
        0 => false,
        // Otherwise trip as soon as the failure streak reaches the limit.
        limit => consecutive >= limit,
    }
}
/// Runs the discover → fetch → upsert → cover → chapter-list-diff pipeline
/// for the target source. Pure metadata; chapter content is enqueued as
/// separate `SyncChapterContent` jobs by the caller after this returns.
@@ -103,6 +114,7 @@ pub async fn run_metadata_pass(
skip_chapters: bool,
allowlist: &DownloadAllowlist,
max_image_bytes: usize,
max_consecutive_failures: u32,
tor: Option<&crate::crawler::tor::TorController>,
) -> anyhow::Result<MetadataStats> {
let lease = browser_manager
@@ -165,6 +177,11 @@ pub async fn run_metadata_pass(
let mut walked_to_completion = false;
let mut hit_limit = false;
let mut hit_stop_condition = false;
// Circuit-breaker state: the number of consecutive fetch_manga failures.
// When a sustained run of failures (e.g. a source outage) aborts the pass,
// the pass is left un-clean so the next tick does a recovery sweep.
let mut consecutive_failures = 0u32;
let mut hit_failure_breaker = false;
'outer: loop {
let batch = match walker.next_batch(&ctx).await? {
@@ -204,7 +221,10 @@ pub async fn run_metadata_pass(
"fetching metadata"
);
let manga = match source.fetch_manga(&ctx, &r).await {
Ok(m) => m,
Ok(m) => {
consecutive_failures = 0;
m
}
Err(e) => {
tracing::warn!(
key = %r.source_manga_key,
@@ -213,6 +233,17 @@ pub async fn run_metadata_pass(
"fetch_manga failed"
);
stats.mangas_failed += 1;
consecutive_failures += 1;
if should_abort_pass(consecutive_failures, max_consecutive_failures) {
hit_failure_breaker = true;
tracing::error!(
consecutive_failures,
threshold = max_consecutive_failures,
"metadata pass: too many consecutive fetch_manga failures; \
aborting (recovery sweep on next tick)"
);
break 'outer;
}
continue;
}
};
@@ -390,6 +421,7 @@ pub async fn run_metadata_pass(
walked_to_completion,
hit_limit,
hit_stop_condition,
hit_failure_breaker,
exited_cleanly,
"metadata pass complete"
);
@@ -756,6 +788,18 @@ mod tests {
assert!(!should_stop(false, UpsertStatus::New, None));
}
#[test]
fn abort_pass_fires_at_threshold_and_respects_disable() {
    // Each row is (consecutive failures, threshold, expected verdict).
    let cases: [(u32, u32, bool); 5] = [
        // Threshold 0 disables the breaker, no matter how many failures.
        (0, 0, false),
        (100, 0, false),
        // One short of the threshold: keep going.
        (9, 10, false),
        // Exactly at the threshold, and past it: abort.
        (10, 10, true),
        (11, 10, true),
    ];

    for &(consecutive, threshold, expected) in cases.iter() {
        assert_eq!(
            should_abort_pass(consecutive, threshold),
            expected,
            "should_abort_pass({}, {})",
            consecutive,
            threshold
        );
    }
}
#[test]
fn clean_exit_when_walked_to_completion() {
// End-of-walk reached the catalog tail — the recovery flag may