feat: incremental crawl mode with seed-completion gate (0.33.0)

Daemon now auto-detects mode per source: Backfill until the first
full walk records `seed_completed:<source>` in `crawler_state`, then
Incremental (newest-first, stops after N consecutive Unchanged
upserts). `CRAWLER_MODE` overrides to a fixed mode; CLI rejects
`auto` since it has no pre-run DB state.

`Source::discover` returns a lazy `DiscoverWalk` so Incremental can
break out mid-walk without prefetching pages. The drop pass and seed
marker are now gated on a true full walk — fixes a latent soft-drop
of the index tail under partial sweeps.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-28 06:41:26 +02:00
parent 51f42b03e9
commit 45ce0d8f12
11 changed files with 761 additions and 162 deletions

View File

@@ -7,6 +7,7 @@
//! (`td:has(label:contains("Author:"))`) are implemented by walking
//! the parsed tree.
use std::collections::VecDeque;
use std::time::Duration;
use anyhow::Context;
@@ -14,13 +15,18 @@ use async_trait::async_trait;
use sha2::{Digest, Sha256};
use super::{
DiscoverMode, FetchContext, Source, SourceChapter, SourceChapterRef, SourceManga,
SourceMangaRef,
DiscoverMode, DiscoverWalk, FetchContext, Source, SourceChapter, SourceChapterRef,
SourceManga, SourceMangaRef,
};
use crate::crawler::detect::{
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
};
/// `sources.id` value for this Source impl. Exposed as a const so the
/// daemon can look up per-source state (e.g. `seed_completed_at`)
/// before constructing the Source itself.
///
/// NOTE(review): the seed marker is described in the commit message as
/// the `seed_completed:<source>` key in `crawler_state`; confirm that
/// `seed_completed_at` is still the right example name here.
pub const SOURCE_ID: &str = "target";
/// In-loop retry budget for transient pages encountered during a single
/// `discover` walk. Bounded small because the job system itself retries
/// the whole `Discover` job on failure — these inline retries only need
@@ -60,15 +66,14 @@ impl TargetSource {
#[async_trait]
impl Source for TargetSource {
fn id(&self) -> &'static str {
"target"
SOURCE_ID
}
async fn discover(
&self,
ctx: &FetchContext<'_>,
mode: DiscoverMode,
max_results: Option<usize>,
) -> anyhow::Result<Vec<SourceMangaRef>> {
) -> anyhow::Result<Box<dyn DiscoverWalk + Send>> {
// Always visit page 1 first because that's the only way to
// discover `last_page`. Retry it on transient — a broken first
// page would otherwise abort the whole walk before we've even
@@ -85,15 +90,7 @@ impl Source for TargetSource {
};
let backfill = matches!(mode, DiscoverMode::Backfill);
let order: Vec<i32> = match (last_page, backfill) {
(None, _) => vec![1],
// Backfill = oldest-first: walk pages last → 1, then
// reverse within each page (the listing is update_date
// DESC, so the bottom of the last page is the oldest
// entry the source still surfaces).
(Some(last), true) => (1..=last).rev().collect(),
(Some(last), false) => (1..=last).collect(),
};
let order = build_page_order(last_page, backfill);
tracing::info!(
?mode,
last_page = ?last_page,
@@ -101,40 +98,12 @@ impl Source for TargetSource {
"walking pagination"
);
let mut all = Vec::new();
for page_num in order {
// Page 1 is already cached from the last_page probe — reuse
// it rather than navigating twice. Every other page goes
// through the retry helper so a single broken page mid-walk
// doesn't silently drop its mangas from the result.
let mut page_refs = if page_num == 1 {
let doc = scraper::Html::parse_document(&first_html);
parse_manga_list_from(&doc)?
} else {
retry_on_transient(
|| async {
let url = page_url(&self.base_url, page_num);
let html = navigate(ctx, &url).await?;
let doc = scraper::Html::parse_document(&html);
parse_manga_list_from(&doc)
},
PAGE_TRANSIENT_RETRY_ATTEMPTS,
PAGE_TRANSIENT_RETRY_DELAY,
)
.await?
};
if backfill {
page_refs.reverse();
}
tracing::info!(page_num, count = page_refs.len(), "page walked");
all.extend(page_refs);
if cap_reached(&all, max_results) {
tracing::info!(cap = ?max_results, "max_results reached; halting pagination");
break;
}
}
Ok(truncate_to_cap(all, max_results))
Ok(Box::new(TargetSourceWalker {
base_url: self.base_url.clone(),
backfill,
pages_remaining: order,
first_page_html: Some(first_html),
}))
}
async fn fetch_manga(
@@ -168,15 +137,81 @@ impl Source for TargetSource {
}
}
fn cap_reached<T>(buf: &[T], max: Option<usize>) -> bool {
matches!(max, Some(m) if buf.len() >= m)
/// Build the queue of page numbers `TargetSource::discover` will walk.
///
/// The queue is consumed from the front, one page per batch:
///
/// * Backfill (`backfill == true`) is oldest-first: pages `last, last-1,
///   …, 1`. Within each page the walker additionally reverses the
///   entries, since the source orders by update_date DESC.
/// * Incremental (`backfill == false`) is newest-first: pages `1..=last`
///   in natural order.
/// * If `last_page` is unknown (source surfaces no pagination) only
///   page 1 is visited.
///
/// A reported `last_page` below 1 is treated like "no pagination": page 1
/// has already been fetched by `discover` to probe for `last_page`, so
/// the walk must never come back empty just because the pagination
/// widget parsed to `0` or a negative number.
fn build_page_order(last_page: Option<i32>, backfill: bool) -> VecDeque<i32> {
    // Clamp so the range below always contains page 1.
    let last = last_page.unwrap_or(1).max(1);
    if backfill {
        (1..=last).rev().collect()
    } else {
        (1..=last).collect()
    }
}
fn truncate_to_cap<T>(mut buf: Vec<T>, max: Option<usize>) -> Vec<T> {
if let Some(m) = max {
buf.truncate(m);
/// Walker returned by [`TargetSource::discover`]. Pops one source-index
/// page per `next_batch` call. Page 1's HTML is cached at construction
/// time (the discover call needed it to read `last_page` anyway) so the
/// batch covering page 1 doesn't re-fetch.
struct TargetSourceWalker {
/// Base URL of the source index; passed to `page_url` to build the URL
/// of every page other than the cached page 1.
base_url: String,
/// When `true`, the entries of each fetched page are reversed before
/// being returned from `next_batch`.
backfill: bool,
/// Page numbers still to visit, consumed from the front. An empty
/// queue makes `next_batch` return `Ok(None)`.
pages_remaining: VecDeque<i32>,
/// HTML of page 1 captured by `discover`. Taken (left as `None`) the
/// first time page 1 is visited; a later visit to page 1 triggers a
/// real fetch instead.
first_page_html: Option<String>,
}
#[async_trait]
impl DiscoverWalk for TargetSourceWalker {
/// Fetch and parse the next page of the walk.
///
/// Returns `Ok(None)` once the page queue is exhausted, otherwise
/// `Ok(Some(refs))` with the manga refs found on the page that was at
/// the front of the queue. In backfill mode the refs of each page are
/// reversed before being returned. Errors from parsing, or from a fetch
/// that still fails after the inline transient retries, are propagated.
async fn next_batch(
    &mut self,
    ctx: &FetchContext<'_>,
) -> anyhow::Result<Option<Vec<SourceMangaRef>>> {
    let page_num = match self.pages_remaining.pop_front() {
        Some(n) => n,
        None => return Ok(None),
    };

    // Only page 1 may consume the cached probe HTML. It is taken rather
    // than cloned, so a queue that visits page 1 a second time falls
    // through to a real fetch below. The cache must NOT be touched for
    // any other page (backfill visits page 1 last).
    let cached_first_page = if page_num == 1 {
        self.first_page_html.take()
    } else {
        None
    };

    let mut refs = if let Some(html) = cached_first_page {
        let doc = scraper::Html::parse_document(&html);
        parse_manga_list_from(&doc)?
    } else if page_num == 1 {
        // Page 1 without a cached copy: fetch it from the base URL.
        // NOTE(review): this path navigates to `base_url` directly while
        // every other page goes through `page_url` — confirm both
        // resolve to the same listing for page 1.
        retry_on_transient(
            || async {
                let html = navigate(ctx, self.base_url.as_str()).await?;
                let doc = scraper::Html::parse_document(&html);
                parse_manga_list_from(&doc)
            },
            PAGE_TRANSIENT_RETRY_ATTEMPTS,
            PAGE_TRANSIENT_RETRY_DELAY,
        )
        .await?
    } else {
        // Any other page: build its URL and fetch through the retry
        // helper so a single transient failure doesn't abort the walk.
        retry_on_transient(
            || async {
                let url = page_url(&self.base_url, page_num);
                let html = navigate(ctx, &url).await?;
                let doc = scraper::Html::parse_document(&html);
                parse_manga_list_from(&doc)
            },
            PAGE_TRANSIENT_RETRY_ATTEMPTS,
            PAGE_TRANSIENT_RETRY_DELAY,
        )
        .await?
    };

    if self.backfill {
        refs.reverse();
    }
    tracing::info!(page_num, count = refs.len(), "page walked");
    Ok(Some(refs))
}
buf
}
/// Single point of rate-limited navigation. Every Source request goes
@@ -922,4 +957,37 @@ mod tests {
let err = parse_manga_detail(html, "x", true).expect_err("expected Transient");
assert!(err.is_transient(), "got non-transient: {err}");
}
#[test]
fn build_page_order_backfill_is_last_to_one() {
    // Oldest-first walk: the front of the queue must be the last page,
    // counting down to page 1.
    let pages: Vec<i32> = build_page_order(Some(3), true).into_iter().collect();
    assert_eq!(pages, vec![3, 2, 1]);
}
#[test]
fn build_page_order_incremental_is_one_to_last() {
    // Newest-first walk: pages come out in the source's natural order.
    let pages: Vec<i32> = build_page_order(Some(3), false).into_iter().collect();
    assert_eq!(pages, vec![1, 2, 3]);
}
#[test]
fn build_page_order_falls_back_to_page_one_only_without_pagination() {
    // With no pagination detected, both modes collapse to just page 1.
    for backfill in [true, false] {
        let pages = Vec::from(build_page_order(None, backfill));
        assert_eq!(pages, vec![1]);
    }
}
#[test]
fn build_page_order_single_page_index_yields_one_entry() {
    // A one-page index must produce exactly one queue entry in either
    // mode — no duplicate visit of page 1.
    let expected = vec![1];
    assert_eq!(Vec::from(build_page_order(Some(1), true)), expected);
    assert_eq!(Vec::from(build_page_order(Some(1), false)), expected);
}
}