feat: incremental crawl mode with seed-completion gate (0.33.0)
Daemon now auto-detects mode per source: Backfill until the first full walk records `seed_completed:<source>` in `crawler_state`, then Incremental (newest-first, stops after N consecutive Unchanged upserts). `CRAWLER_MODE` overrides to a fixed mode; CLI rejects `auto` since it has no pre-run DB state. `Source::discover` returns a lazy `DiscoverWalk` so Incremental can break out mid-walk without prefetching pages. The drop pass and seed marker are now gated on a true full walk — fixes a latent soft-drop of the index tail under partial sweeps. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@
|
||||
//! (`td:has(label:contains("Author:"))`) are implemented by walking
|
||||
//! the parsed tree.
|
||||
|
||||
use std::collections::VecDeque;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -14,13 +15,18 @@ use async_trait::async_trait;
|
||||
use sha2::{Digest, Sha256};
|
||||
|
||||
use super::{
|
||||
DiscoverMode, FetchContext, Source, SourceChapter, SourceChapterRef, SourceManga,
|
||||
SourceMangaRef,
|
||||
DiscoverMode, DiscoverWalk, FetchContext, Source, SourceChapter, SourceChapterRef,
|
||||
SourceManga, SourceMangaRef,
|
||||
};
|
||||
use crate::crawler::detect::{
|
||||
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
|
||||
};
|
||||
|
||||
/// `sources.id` value for this Source impl. Exposed as a const so the
|
||||
/// daemon can look up per-source state (e.g. `seed_completed_at`)
|
||||
/// before constructing the Source itself.
|
||||
pub const SOURCE_ID: &str = "target";
|
||||
|
||||
/// In-loop retry budget for transient pages encountered during a single
|
||||
/// `discover` walk. Bounded small because the job system itself retries
|
||||
/// the whole `Discover` job on failure — these inline retries only need
|
||||
@@ -60,15 +66,14 @@ impl TargetSource {
|
||||
#[async_trait]
|
||||
impl Source for TargetSource {
|
||||
fn id(&self) -> &'static str {
|
||||
"target"
|
||||
SOURCE_ID
|
||||
}
|
||||
|
||||
async fn discover(
|
||||
&self,
|
||||
ctx: &FetchContext<'_>,
|
||||
mode: DiscoverMode,
|
||||
max_results: Option<usize>,
|
||||
) -> anyhow::Result<Vec<SourceMangaRef>> {
|
||||
) -> anyhow::Result<Box<dyn DiscoverWalk + Send>> {
|
||||
// Always visit page 1 first because that's the only way to
|
||||
// discover `last_page`. Retry it on transient — a broken first
|
||||
// page would otherwise abort the whole walk before we've even
|
||||
@@ -85,15 +90,7 @@ impl Source for TargetSource {
|
||||
};
|
||||
|
||||
let backfill = matches!(mode, DiscoverMode::Backfill);
|
||||
let order: Vec<i32> = match (last_page, backfill) {
|
||||
(None, _) => vec![1],
|
||||
// Backfill = oldest-first: walk pages last → 1, then
|
||||
// reverse within each page (the listing is update_date
|
||||
// DESC, so the bottom of the last page is the oldest
|
||||
// entry the source still surfaces).
|
||||
(Some(last), true) => (1..=last).rev().collect(),
|
||||
(Some(last), false) => (1..=last).collect(),
|
||||
};
|
||||
let order = build_page_order(last_page, backfill);
|
||||
tracing::info!(
|
||||
?mode,
|
||||
last_page = ?last_page,
|
||||
@@ -101,40 +98,12 @@ impl Source for TargetSource {
|
||||
"walking pagination"
|
||||
);
|
||||
|
||||
let mut all = Vec::new();
|
||||
for page_num in order {
|
||||
// Page 1 is already cached from the last_page probe — reuse
|
||||
// it rather than navigating twice. Every other page goes
|
||||
// through the retry helper so a single broken page mid-walk
|
||||
// doesn't silently drop its mangas from the result.
|
||||
let mut page_refs = if page_num == 1 {
|
||||
let doc = scraper::Html::parse_document(&first_html);
|
||||
parse_manga_list_from(&doc)?
|
||||
} else {
|
||||
retry_on_transient(
|
||||
|| async {
|
||||
let url = page_url(&self.base_url, page_num);
|
||||
let html = navigate(ctx, &url).await?;
|
||||
let doc = scraper::Html::parse_document(&html);
|
||||
parse_manga_list_from(&doc)
|
||||
},
|
||||
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
||||
PAGE_TRANSIENT_RETRY_DELAY,
|
||||
)
|
||||
.await?
|
||||
};
|
||||
if backfill {
|
||||
page_refs.reverse();
|
||||
}
|
||||
tracing::info!(page_num, count = page_refs.len(), "page walked");
|
||||
all.extend(page_refs);
|
||||
if cap_reached(&all, max_results) {
|
||||
tracing::info!(cap = ?max_results, "max_results reached; halting pagination");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(truncate_to_cap(all, max_results))
|
||||
Ok(Box::new(TargetSourceWalker {
|
||||
base_url: self.base_url.clone(),
|
||||
backfill,
|
||||
pages_remaining: order,
|
||||
first_page_html: Some(first_html),
|
||||
}))
|
||||
}
|
||||
|
||||
async fn fetch_manga(
|
||||
@@ -168,15 +137,81 @@ impl Source for TargetSource {
|
||||
}
|
||||
}
|
||||
|
||||
fn cap_reached<T>(buf: &[T], max: Option<usize>) -> bool {
|
||||
matches!(max, Some(m) if buf.len() >= m)
|
||||
/// Build the queue of page numbers `TargetSource::discover` will walk.
///
/// * Backfill (`backfill == true`) is oldest-first: pages `last, last-1, …, 1`.
///   Within each page the walker reverses the entries, since the source
///   orders by update_date DESC.
/// * Incremental (`backfill == false`) is newest-first: pages `1..=last`
///   in natural order.
/// * If `last_page` is `None` (source surfaces no pagination) only page 1
///   is visited.
/// * A `last_page` below 1 is treated like `None`. Page 1 is always fetched
///   before the order is built, so the queue must never come back empty —
///   an empty queue would end the walk without yielding a single entry.
///
/// The returned queue is meant to be consumed front-to-back.
fn build_page_order(last_page: Option<i32>, backfill: bool) -> VecDeque<i32> {
    // Collapse "no pagination" and "nonsensical page count" into the same
    // single-page walk; any real count (>= 1) passes through untouched.
    let last = last_page.map_or(1, |n| n.max(1));
    if backfill {
        (1..=last).rev().collect()
    } else {
        (1..=last).collect()
    }
}
|
||||
|
||||
fn truncate_to_cap<T>(mut buf: Vec<T>, max: Option<usize>) -> Vec<T> {
|
||||
if let Some(m) = max {
|
||||
buf.truncate(m);
|
||||
/// Lazy page-by-page walker handed back by [`TargetSource::discover`].
///
/// Every `next_batch` call consumes exactly one source-index page from
/// the front of the queue. The HTML of page 1 is stored when the walker
/// is built (discover had to load it anyway to read `last_page`), so the
/// batch that covers page 1 is served without a second fetch.
struct TargetSourceWalker {
    /// Base listing URL; page URLs are derived from it, and it is the
    /// address used when page 1 has to be fetched for real.
    base_url: String,
    /// When set, the entries of each page are reversed before being
    /// returned.
    backfill: bool,
    /// Page numbers still to visit, consumed from the front.
    pages_remaining: VecDeque<i32>,
    /// Cached HTML of page 1; taken (left as `None`) the first time
    /// page 1 is served.
    first_page_html: Option<String>,
}
|
||||
|
||||
#[async_trait]
|
||||
impl DiscoverWalk for TargetSourceWalker {
|
||||
async fn next_batch(
|
||||
&mut self,
|
||||
ctx: &FetchContext<'_>,
|
||||
) -> anyhow::Result<Option<Vec<SourceMangaRef>>> {
|
||||
let Some(page_num) = self.pages_remaining.pop_front() else {
|
||||
return Ok(None);
|
||||
};
|
||||
let mut page_refs = if page_num == 1 {
|
||||
// Reuse the cached page-1 HTML from the initial probe. Take
|
||||
// it (rather than clone) so a malformed page-order queue
|
||||
// that re-visits page 1 still falls back to a real fetch.
|
||||
match self.first_page_html.take() {
|
||||
Some(html) => {
|
||||
let doc = scraper::Html::parse_document(&html);
|
||||
parse_manga_list_from(&doc)?
|
||||
}
|
||||
None => {
|
||||
retry_on_transient(
|
||||
|| async {
|
||||
let html = navigate(ctx, self.base_url.as_str()).await?;
|
||||
let doc = scraper::Html::parse_document(&html);
|
||||
parse_manga_list_from(&doc)
|
||||
},
|
||||
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
||||
PAGE_TRANSIENT_RETRY_DELAY,
|
||||
)
|
||||
.await?
|
||||
}
|
||||
}
|
||||
} else {
|
||||
retry_on_transient(
|
||||
|| async {
|
||||
let url = page_url(&self.base_url, page_num);
|
||||
let html = navigate(ctx, &url).await?;
|
||||
let doc = scraper::Html::parse_document(&html);
|
||||
parse_manga_list_from(&doc)
|
||||
},
|
||||
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
||||
PAGE_TRANSIENT_RETRY_DELAY,
|
||||
)
|
||||
.await?
|
||||
};
|
||||
if self.backfill {
|
||||
page_refs.reverse();
|
||||
}
|
||||
tracing::info!(page_num, count = page_refs.len(), "page walked");
|
||||
Ok(Some(page_refs))
|
||||
}
|
||||
buf
|
||||
}
|
||||
|
||||
/// Single point of rate-limited navigation. Every Source request goes
|
||||
@@ -922,4 +957,37 @@ mod tests {
|
||||
let err = parse_manga_detail(html, "x", true).expect_err("expected Transient");
|
||||
assert!(err.is_transient(), "got non-transient: {err}");
|
||||
}
|
||||
|
||||
#[test]
fn build_page_order_backfill_is_last_to_one() {
    // Backfill is oldest-first, so draining the queue from the front
    // must yield the last page first and page 1 last.
    let pages: Vec<_> = build_page_order(Some(3), true).into_iter().collect();
    assert_eq!(pages, vec![3, 2, 1]);
}
|
||||
|
||||
#[test]
fn build_page_order_incremental_is_one_to_last() {
    // Incremental is newest-first: pages come out in the source's
    // natural order, starting at page 1.
    let pages: Vec<_> = build_page_order(Some(3), false).into_iter().collect();
    assert_eq!(pages, vec![1, 2, 3]);
}
|
||||
|
||||
#[test]
fn build_page_order_falls_back_to_page_one_only_without_pagination() {
    // With no known last page, both modes must visit page 1 and
    // nothing else. Backfill is checked first, then incremental.
    for backfill in [true, false] {
        let pages: Vec<_> = build_page_order(None, backfill).into_iter().collect();
        assert_eq!(pages, vec![1]);
    }
}
|
||||
|
||||
#[test]
fn build_page_order_single_page_index_yields_one_entry() {
    // A source with exactly one page must produce a single queue
    // entry — never a duplicate — in backfill and incremental alike.
    for backfill in [true, false] {
        let pages: Vec<_> = build_page_order(Some(1), backfill).into_iter().collect();
        assert_eq!(pages, vec![1]);
    }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user