feat(crawler): reliability fixes — heartbeat, streaming, jitter, timeout, breaker
A1 Lease heartbeat: jobs::renew keeps a long-but-healthy job's lease fresh so it is never stolen mid-flight nor inflated toward max_attempts. A2 Stream chapter pages straight to storage (peak memory = one image) and persist rows + page_count in one short transaction off the network path (S3-ready); roll back stored blobs on failure via Storage::delete. A3 ±20% jitter on exponential backoff to avoid a retry thundering herd. A4 Outer per-dispatch timeout (CRAWLER_JOB_TIMEOUT_SECS, default 600) so a hung job is acked-failed instead of wedging a worker. A5 Metadata circuit-breaker (CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES, default 10): abort a pass on a source outage without marking a clean exit, so the next tick recovery-sweeps. Adds CRAWLER_BROWSER_RESTART_THRESHOLD config (used by the upcoming coordinated browser restart). Bumps version 0.52.0 -> 0.53.0. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -65,6 +65,17 @@ pub(crate) fn should_mark_clean_exit(
|
||||
walked_to_completion || hit_stop_condition
|
||||
}
|
||||
|
||||
/// Circuit-breaker predicate for the metadata walk.
///
/// Returns `true` once `consecutive` — the current run of back-to-back
/// `fetch_manga` failures — has reached `threshold`. A `threshold` of 0
/// disables the breaker, so this never returns `true` (unbounded — the
/// legacy behaviour).
///
/// When it fires the caller must NOT mark a clean exit, so the next tick
/// does a recovery sweep over the catalog tail the aborted pass never
/// reached.
///
/// Pure so the rule is unit-testable without the walker.
#[must_use]
pub(crate) fn should_abort_pass(consecutive: u32, threshold: u32) -> bool {
    // `threshold == 0` is the "disabled" sentinel and must never fire,
    // even though `consecutive >= 0` is trivially true.
    threshold > 0 && consecutive >= threshold
}
|
||||
|
||||
/// Runs the discover → fetch → upsert → cover → chapter-list-diff pipeline
|
||||
/// for the target source. Pure metadata; chapter content is enqueued as
|
||||
/// separate `SyncChapterContent` jobs by the caller after this returns.
|
||||
@@ -103,6 +114,7 @@ pub async fn run_metadata_pass(
|
||||
skip_chapters: bool,
|
||||
allowlist: &DownloadAllowlist,
|
||||
max_image_bytes: usize,
|
||||
max_consecutive_failures: u32,
|
||||
tor: Option<&crate::crawler::tor::TorController>,
|
||||
) -> anyhow::Result<MetadataStats> {
|
||||
let lease = browser_manager
|
||||
@@ -165,6 +177,11 @@ pub async fn run_metadata_pass(
|
||||
let mut walked_to_completion = false;
|
||||
let mut hit_limit = false;
|
||||
let mut hit_stop_condition = false;
|
||||
// Circuit-breaker state: count of consecutive fetch_manga failures. When a
// sustained run of failures (source outage) aborts the pass, the pass is
// left un-clean → recovery sweep next tick.
|
||||
let mut consecutive_failures = 0u32;
|
||||
let mut hit_failure_breaker = false;
|
||||
|
||||
'outer: loop {
|
||||
let batch = match walker.next_batch(&ctx).await? {
|
||||
@@ -204,7 +221,10 @@ pub async fn run_metadata_pass(
|
||||
"fetching metadata"
|
||||
);
|
||||
let manga = match source.fetch_manga(&ctx, &r).await {
|
||||
Ok(m) => m,
|
||||
Ok(m) => {
|
||||
consecutive_failures = 0;
|
||||
m
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
key = %r.source_manga_key,
|
||||
@@ -213,6 +233,17 @@ pub async fn run_metadata_pass(
|
||||
"fetch_manga failed"
|
||||
);
|
||||
stats.mangas_failed += 1;
|
||||
consecutive_failures += 1;
|
||||
if should_abort_pass(consecutive_failures, max_consecutive_failures) {
|
||||
hit_failure_breaker = true;
|
||||
tracing::error!(
|
||||
consecutive_failures,
|
||||
threshold = max_consecutive_failures,
|
||||
"metadata pass: too many consecutive fetch_manga failures; \
|
||||
aborting (recovery sweep on next tick)"
|
||||
);
|
||||
break 'outer;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
};
|
||||
@@ -390,6 +421,7 @@ pub async fn run_metadata_pass(
|
||||
walked_to_completion,
|
||||
hit_limit,
|
||||
hit_stop_condition,
|
||||
hit_failure_breaker,
|
||||
exited_cleanly,
|
||||
"metadata pass complete"
|
||||
);
|
||||
@@ -756,6 +788,18 @@ mod tests {
|
||||
assert!(!should_stop(false, UpsertStatus::New, None));
|
||||
}
|
||||
|
||||
/// Pins the circuit-breaker rule: a zero threshold disables it, and an
/// enabled breaker fires exactly when the failure run reaches the threshold.
#[test]
fn abort_pass_fires_at_threshold_and_respects_disable() {
    // Disabled (0) never fires, no matter how many failures.
    assert!(!should_abort_pass(0, 0));
    assert!(!should_abort_pass(100, 0));
    assert!(!should_abort_pass(u32::MAX, 0));
    // Below threshold: keep going.
    assert!(!should_abort_pass(0, 10));
    assert!(!should_abort_pass(9, 10));
    // At/above threshold: abort.
    assert!(should_abort_pass(10, 10));
    assert!(should_abort_pass(11, 10));
    // Tightest enabled threshold: the very first failure trips the breaker.
    assert!(!should_abort_pass(0, 1));
    assert!(should_abort_pass(1, 1));
}
|
||||
|
||||
#[test]
|
||||
fn clean_exit_when_walked_to_completion() {
|
||||
// End-of-walk reached the catalog tail — the recovery flag may
|
||||
|
||||
Reference in New Issue
Block a user