feat(crawler): single-mode walker gated by recovery flag (0.36.0)
Collapses the crawler to a single newest-first walker and replaces the N-consecutive-unchanged streak with a per-manga rule: stop on the first manga where metadata is Unchanged AND chapter sync reports zero new chapters.

The early stop is gated by a per-source recovery flag stored in `crawler_state` — set to `false` when a run starts, back to `true` only on a clean exit (end-of-walk or intentional stop). A crashed run leaves the flag `false` automatically (no shutdown code runs), so the next tick walks the full catalog instead of bailing at the first caught-up manga.

This means a crashed mid-walk run self-heals on the next tick: the flag stays `false`, the next walk visits every page (recovering anything the crash missed past its crash point), and steady state resumes once the recovery sweep reaches end-of-walk.

Removed:
- DiscoverMode enum, Backfill mode, the boundary re-check + displaced-refs machinery in TargetSourceWalker.
- Drop-pass (mark_dropped_mangas) and seed-completion plumbing (mark_seed_completed / seed_completed_at). The recovery flag subsumes the seed-completion signal; drop detection was explicitly opted out.
- JobPayload::Discover (no production callers).
- CRAWLER_MODE / CRAWLER_INCREMENTAL_STOP_AFTER env vars and the CrawlerModePref config type.

`should_mark_clean_exit(walked_to_completion, hit_stop_condition)` encodes the clean-exit truth table in its signature — `hit_limit` is deliberately absent so a future edit cannot accidentally count a caller-imposed cap as a clean exit.

Net -501 lines, 261 backend tests passing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -15,22 +15,23 @@ use async_trait::async_trait;
|
||||
use sha2::{Digest, Sha256};
|
||||
|
||||
use super::{
|
||||
DiscoverMode, DiscoverWalk, FetchContext, Source, SourceChapter, SourceChapterRef,
|
||||
SourceManga, SourceMangaRef,
|
||||
DiscoverWalk, FetchContext, Source, SourceChapter, SourceChapterRef, SourceManga,
|
||||
SourceMangaRef,
|
||||
};
|
||||
use crate::crawler::detect::{
|
||||
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
|
||||
};
|
||||
|
||||
/// `sources.id` value for this Source impl. Exposed as a const so the
|
||||
/// daemon can look up per-source state (e.g. `seed_completed_at`)
|
||||
/// before constructing the Source itself.
|
||||
/// daemon can look up per-source state (e.g. the recovery flag) before
|
||||
/// constructing the Source itself.
|
||||
pub const SOURCE_ID: &str = "target";
|
||||
|
||||
/// In-loop retry budget for transient pages encountered during a single
|
||||
/// `discover` walk. Bounded small because the job system itself retries
|
||||
/// the whole `Discover` job on failure — these inline retries only need
|
||||
/// to absorb a brief site hiccup mid-walk.
|
||||
/// `discover` walk. Bounded small because the next cron tick will pick up
|
||||
/// anything this run missed via the recovery flag — these inline retries
|
||||
/// only need to absorb a brief site hiccup mid-walk, not a sustained
|
||||
/// outage.
|
||||
const PAGE_TRANSIENT_RETRY_ATTEMPTS: u32 = 3;
|
||||
const PAGE_TRANSIENT_RETRY_DELAY: Duration = Duration::from_secs(2);
|
||||
|
||||
@@ -72,7 +73,6 @@ impl Source for TargetSource {
|
||||
async fn discover(
|
||||
&self,
|
||||
ctx: &FetchContext<'_>,
|
||||
mode: DiscoverMode,
|
||||
) -> anyhow::Result<Box<dyn DiscoverWalk + Send>> {
|
||||
// Always visit page 1 first because that's the only way to
|
||||
// discover `last_page`. Retry it on transient — a broken first
|
||||
@@ -89,10 +89,8 @@ impl Source for TargetSource {
|
||||
parse_last_page(&doc)
|
||||
};
|
||||
|
||||
let backfill = matches!(mode, DiscoverMode::Backfill);
|
||||
let order = build_page_order(last_page, backfill);
|
||||
let order = build_page_order(last_page);
|
||||
tracing::info!(
|
||||
?mode,
|
||||
last_page = ?last_page,
|
||||
page_count = order.len(),
|
||||
"walking pagination"
|
||||
@@ -100,10 +98,8 @@ impl Source for TargetSource {
|
||||
|
||||
Ok(Box::new(TargetSourceWalker {
|
||||
base_url: self.base_url.clone(),
|
||||
backfill,
|
||||
pages_remaining: order,
|
||||
first_page_html: Some(first_html),
|
||||
prev: None,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -139,16 +135,13 @@ impl Source for TargetSource {
|
||||
}
|
||||
|
||||
/// Build the queue of page numbers `TargetSource::discover` will walk.
|
||||
/// Backfill is oldest-first: pages `last..=1` (within each page the
|
||||
/// walker reverses entries, since the source orders by update_date
|
||||
/// DESC). Incremental is newest-first: pages `1..=last` in natural
|
||||
/// order. If `last_page` is unknown (source surfaces no pagination)
|
||||
/// only page 1 is visited.
|
||||
fn build_page_order(last_page: Option<i32>, backfill: bool) -> VecDeque<i32> {
|
||||
match (last_page, backfill) {
|
||||
(None, _) => VecDeque::from([1]),
|
||||
(Some(last), true) => (1..=last).rev().collect(),
|
||||
(Some(last), false) => (1..=last).collect(),
|
||||
/// The site orders by `update_date DESC`, so newest-first is just the
|
||||
/// natural page order: `1..=last`. If `last_page` is unknown (source
|
||||
/// surfaces no pagination) only page 1 is visited.
|
||||
fn build_page_order(last_page: Option<i32>) -> VecDeque<i32> {
|
||||
match last_page {
|
||||
None => VecDeque::from([1]),
|
||||
Some(last) => (1..=last).collect(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -158,16 +151,8 @@ fn build_page_order(last_page: Option<i32>, backfill: bool) -> VecDeque<i32> {
|
||||
/// batch covering page 1 doesn't re-fetch.
|
||||
struct TargetSourceWalker {
|
||||
base_url: String,
|
||||
backfill: bool,
|
||||
pages_remaining: VecDeque<i32>,
|
||||
first_page_html: Option<String>,
|
||||
/// Page number and slot-0 `source_manga_key` of the previously-walked
|
||||
/// page. Updated after every batch (cheap, unconditional) but only
|
||||
/// *read* by the boundary re-check, which itself runs only in backfill
|
||||
/// mode. A single `Option` so the half-set state (`page_num` known but
|
||||
/// `key` not, or vice versa) is unrepresentable; `None` here suppresses
|
||||
/// the next iteration's re-check (no anchor to compare against).
|
||||
prev: Option<(i32, String)>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -179,7 +164,7 @@ impl DiscoverWalk for TargetSourceWalker {
|
||||
let Some(page_num) = self.pages_remaining.pop_front() else {
|
||||
return Ok(None);
|
||||
};
|
||||
let mut page_refs = if page_num == 1 {
|
||||
let page_refs = if page_num == 1 {
|
||||
// Reuse the cached page-1 HTML from the initial probe. Take
|
||||
// it (rather than clone) so a malformed page-order queue
|
||||
// that re-visits page 1 still falls back to a real fetch.
|
||||
@@ -214,77 +199,7 @@ impl DiscoverWalk for TargetSourceWalker {
|
||||
)
|
||||
.await?
|
||||
};
|
||||
// Capture slot-0 of the page as the site presents it (newest first)
|
||||
// *before* the backfill `.reverse()` below — after reversal slot 0
|
||||
// is the oldest entry, which would defeat the next iteration's
|
||||
// boundary re-check.
|
||||
let current_first = page_refs.first().map(|r| r.source_manga_key.clone());
|
||||
|
||||
// Boundary re-check (backfill only). The site orders by update_date
|
||||
// DESC, so shifts during the walk push items from low-numbered pages
|
||||
// to high-numbered ones — into pages backfill has already finished.
|
||||
// After fetching this page, re-fetch the *previous* iteration's page
|
||||
// and look for items that slid past us mid-walk. Must be the last
|
||||
// navigation of the iteration to close the within-iteration race.
|
||||
let mut displaced: Vec<SourceMangaRef> = Vec::new();
|
||||
if self.backfill {
|
||||
if let Some((prev_page_num, prev_first_key)) = self.prev.clone() {
|
||||
match recheck_prev_page(ctx, &self.base_url, prev_page_num).await {
|
||||
Ok(refetched) => {
|
||||
let (d, outcome) = detect_displaced(&prev_first_key, &refetched);
|
||||
match outcome {
|
||||
DisplacementOutcome::NoShift => {}
|
||||
DisplacementOutcome::Shifted(k) => {
|
||||
tracing::info!(
|
||||
page_num,
|
||||
prev_page_num,
|
||||
k,
|
||||
"boundary re-check: shift detected, recovering displaced refs"
|
||||
);
|
||||
}
|
||||
DisplacementOutcome::NotFoundFallback => {
|
||||
tracing::warn!(
|
||||
page_num,
|
||||
prev_page_num,
|
||||
prev_first_key = %prev_first_key,
|
||||
refetched_len = refetched.len(),
|
||||
"boundary re-check: prev_first not found, falling back to full re-process"
|
||||
);
|
||||
}
|
||||
}
|
||||
displaced = d;
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
page_num,
|
||||
prev_page_num,
|
||||
error = ?e,
|
||||
"boundary re-check: re-fetch failed, skipping check for this boundary"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remember the boundary for the next iteration. An empty `page_refs`
|
||||
// yields `None`, which intentionally suppresses the next re-check
|
||||
// (no anchor to compare against).
|
||||
self.prev = current_first.map(|key| (page_num, key));
|
||||
|
||||
if self.backfill {
|
||||
page_refs.reverse();
|
||||
}
|
||||
// Append displaced refs to the end of the batch. Order doesn't
|
||||
// affect backfill semantics (no `consecutive_unchanged` streak in
|
||||
// this mode), and the pipeline-level dedup set handles any overlap.
|
||||
let displaced_count = displaced.len();
|
||||
page_refs.extend(displaced);
|
||||
tracing::info!(
|
||||
page_num,
|
||||
count = page_refs.len(),
|
||||
displaced = displaced_count,
|
||||
"page walked"
|
||||
);
|
||||
tracing::info!(page_num, count = page_refs.len(), "page walked");
|
||||
Ok(Some(page_refs))
|
||||
}
|
||||
}
|
||||
@@ -689,67 +604,6 @@ fn compute_metadata_hash(m: &SourceManga) -> String {
|
||||
format!("{:x}", h.finalize())
|
||||
}
|
||||
|
||||
/// Outcome of a boundary re-check, surfaced for telemetry + tests. The
|
||||
/// caller's recovery action is determined by the returned `Vec`; this enum
|
||||
/// only labels which branch fired so logging and assertions can distinguish
|
||||
/// "site is stable" from "we papered over a shift" from "we fell back".
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
enum DisplacementOutcome {
|
||||
/// `prev_first` is still at slot 0 of the re-fetched page — no shift
|
||||
/// happened between the prior iteration and this one's re-check.
|
||||
NoShift,
|
||||
/// `prev_first` slid down to slot `K`; the first `K` entries are items
|
||||
/// that used to live on the page we just walked.
|
||||
Shifted(usize),
|
||||
/// `prev_first` is gone from the re-fetched page — multiple pages-worth
|
||||
/// of shifts happened, or it was bumped to page 1. Treat all refetched
|
||||
/// entries as potentially displaced; the pipeline-level dedup absorbs
|
||||
/// the noise.
|
||||
NotFoundFallback,
|
||||
}
|
||||
|
||||
/// Compare a previously-walked page's slot-0 key against a fresh fetch of
|
||||
/// that page. Returns the entries that appear *ahead* of `prev_first` in
|
||||
/// the re-fetched page — items that slid in from the page the caller is
|
||||
/// currently processing.
|
||||
fn detect_displaced(
|
||||
prev_first: &str,
|
||||
refetched: &[SourceMangaRef],
|
||||
) -> (Vec<SourceMangaRef>, DisplacementOutcome) {
|
||||
let Some(k) = refetched
|
||||
.iter()
|
||||
.position(|r| r.source_manga_key == prev_first)
|
||||
else {
|
||||
return (refetched.to_vec(), DisplacementOutcome::NotFoundFallback);
|
||||
};
|
||||
if k == 0 {
|
||||
(Vec::new(), DisplacementOutcome::NoShift)
|
||||
} else {
|
||||
(refetched[..k].to_vec(), DisplacementOutcome::Shifted(k))
|
||||
}
|
||||
}
|
||||
|
||||
/// Re-fetch a previously-walked listing page to feed [`detect_displaced`].
|
||||
/// Uses the same retry chain as the primary page fetch in `next_batch` so
|
||||
/// a transient hiccup doesn't tank an entire backfill walk.
|
||||
async fn recheck_prev_page(
|
||||
ctx: &FetchContext<'_>,
|
||||
base_url: &str,
|
||||
page_num: i32,
|
||||
) -> Result<Vec<SourceMangaRef>, PageError> {
|
||||
retry_on_transient(
|
||||
|| async {
|
||||
let url = page_url(base_url, page_num);
|
||||
let html = navigate(ctx, &url).await?;
|
||||
let doc = scraper::Html::parse_document(&html);
|
||||
parse_manga_list_from(&doc)
|
||||
},
|
||||
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
||||
PAGE_TRANSIENT_RETRY_DELAY,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -1125,132 +979,25 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_page_order_backfill_is_last_to_one() {
|
||||
// Backfill walks pages oldest-first: queue is [last, last-1, ..., 1]
|
||||
// so popping from the front yields the last page first.
|
||||
let order = build_page_order(Some(3), true);
|
||||
assert_eq!(Vec::from(order), vec![3, 2, 1]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_page_order_incremental_is_one_to_last() {
|
||||
// Incremental walks newest-first in natural source order.
|
||||
let order = build_page_order(Some(3), false);
|
||||
fn build_page_order_is_natural_one_to_last() {
|
||||
// Newest-first is just the source's natural pagination order:
|
||||
// newest (update_date DESC) lives at page 1, oldest at the last page.
|
||||
let order = build_page_order(Some(3));
|
||||
assert_eq!(Vec::from(order), vec![1, 2, 3]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_page_order_falls_back_to_page_one_only_without_pagination() {
|
||||
let backfill = build_page_order(None, true);
|
||||
assert_eq!(Vec::from(backfill), vec![1]);
|
||||
let incremental = build_page_order(None, false);
|
||||
assert_eq!(Vec::from(incremental), vec![1]);
|
||||
// Source surfaced no pagination control — visit page 1 alone
|
||||
// and let the walk end after one batch.
|
||||
let order = build_page_order(None);
|
||||
assert_eq!(Vec::from(order), vec![1]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_page_order_single_page_index_yields_one_entry() {
|
||||
// Sources with exactly one page should not yield duplicates
|
||||
// regardless of mode.
|
||||
let backfill = build_page_order(Some(1), true);
|
||||
assert_eq!(Vec::from(backfill), vec![1]);
|
||||
let incremental = build_page_order(Some(1), false);
|
||||
assert_eq!(Vec::from(incremental), vec![1]);
|
||||
}
|
||||
|
||||
fn make_ref(key: &str) -> SourceMangaRef {
|
||||
SourceMangaRef {
|
||||
source_manga_key: key.to_string(),
|
||||
title: key.to_string(),
|
||||
url: format!("https://target.example/manga/{key}"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_displaced_no_shift_when_prev_first_still_at_slot_zero() {
|
||||
let refetched = vec![make_ref("A"), make_ref("B"), make_ref("C")];
|
||||
let (displaced, outcome) = detect_displaced("A", &refetched);
|
||||
assert!(displaced.is_empty());
|
||||
assert_eq!(outcome, DisplacementOutcome::NoShift);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_displaced_one_shift_returns_single_intruder() {
|
||||
let refetched = vec![make_ref("X"), make_ref("A"), make_ref("B")];
|
||||
let (displaced, outcome) = detect_displaced("A", &refetched);
|
||||
assert_eq!(displaced.len(), 1);
|
||||
assert_eq!(displaced[0].source_manga_key, "X");
|
||||
assert_eq!(outcome, DisplacementOutcome::Shifted(1));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_displaced_multi_shift_returns_all_intruders() {
|
||||
let refetched = vec![
|
||||
make_ref("X1"),
|
||||
make_ref("X2"),
|
||||
make_ref("X3"),
|
||||
make_ref("A"),
|
||||
make_ref("B"),
|
||||
make_ref("C"),
|
||||
];
|
||||
let (displaced, outcome) = detect_displaced("A", &refetched);
|
||||
let keys: Vec<&str> = displaced
|
||||
.iter()
|
||||
.map(|r| r.source_manga_key.as_str())
|
||||
.collect();
|
||||
assert_eq!(keys, vec!["X1", "X2", "X3"]);
|
||||
assert_eq!(outcome, DisplacementOutcome::Shifted(3));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_displaced_full_page_shift_returns_all_but_last() {
|
||||
// `prev_first` at the last slot — every preceding entry is an
|
||||
// intruder shifted in from the page the caller is processing.
|
||||
let mut refetched: Vec<_> = (0..9).map(|i| make_ref(&format!("X{i}"))).collect();
|
||||
refetched.push(make_ref("A"));
|
||||
let (displaced, outcome) = detect_displaced("A", &refetched);
|
||||
assert_eq!(displaced.len(), 9);
|
||||
assert_eq!(outcome, DisplacementOutcome::Shifted(9));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_displaced_not_found_returns_full_page_for_conservative_recovery() {
|
||||
// > page-worth of shifts (or `prev_first` itself was bumped to
|
||||
// page 1): can't pinpoint K, fall back to "process everything";
|
||||
// pipeline dedup absorbs the noise.
|
||||
let refetched = vec![make_ref("Y"), make_ref("Z")];
|
||||
let (displaced, outcome) = detect_displaced("A", &refetched);
|
||||
let keys: Vec<&str> = displaced
|
||||
.iter()
|
||||
.map(|r| r.source_manga_key.as_str())
|
||||
.collect();
|
||||
assert_eq!(keys, vec!["Y", "Z"]);
|
||||
assert_eq!(outcome, DisplacementOutcome::NotFoundFallback);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_displaced_empty_page_returns_empty_with_fallback_outcome() {
|
||||
// Re-fetch came back empty (transient mimicry or last-page tail).
|
||||
// No anchor means we can't classify; fall back is the safe label.
|
||||
let (displaced, outcome) = detect_displaced("A", &[]);
|
||||
assert!(displaced.is_empty());
|
||||
assert_eq!(outcome, DisplacementOutcome::NotFoundFallback);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn detect_displaced_takes_first_occurrence_when_key_repeats() {
|
||||
// Defensive: if the source ever returns the same key twice on a
|
||||
// page, anchoring on the first match keeps the displaced slice
|
||||
// bounded and deterministic.
|
||||
let refetched = vec![
|
||||
make_ref("X"),
|
||||
make_ref("A"),
|
||||
make_ref("Y"),
|
||||
make_ref("A"),
|
||||
];
|
||||
let (displaced, outcome) = detect_displaced("A", &refetched);
|
||||
assert_eq!(displaced.len(), 1);
|
||||
assert_eq!(displaced[0].source_manga_key, "X");
|
||||
assert_eq!(outcome, DisplacementOutcome::Shifted(1));
|
||||
let order = build_page_order(Some(1));
|
||||
assert_eq!(Vec::from(order), vec![1]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user