feat: incremental crawl mode with seed-completion gate (0.33.0)

Daemon now auto-detects mode per source: Backfill until the first full walk records `seed_completed:<source>` in `crawler_state`, then Incremental (newest-first, stops after N consecutive Unchanged upserts). `CRAWLER_MODE` overrides to a fixed mode; CLI rejects `auto` since it has no pre-run DB state. `Source::discover` returns a lazy `DiscoverWalk` so Incremental can break out mid-walk without prefetching pages. The drop pass and seed marker are now gated on a true full walk — fixes a latent soft-drop of the index tail under partial sweeps.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 06:41:26 +02:00
parent 51f42b03e9
commit 45ce0d8f12
11 changed files with 761 additions and 162 deletions
--- a/backend/src/crawler/source/target.rs
+++ b/backend/src/crawler/source/target.rs
@@ -7,6 +7,7 @@
 //! (`td:has(label:contains("Author:"))`) are implemented by walking
 //! the parsed tree.

+use std::collections::VecDeque;
 use std::time::Duration;

 use anyhow::Context;
@@ -14,13 +15,18 @@ use async_trait::async_trait;
 use sha2::{Digest, Sha256};

 use super::{
-    DiscoverMode, FetchContext, Source, SourceChapter, SourceChapterRef, SourceManga,
-    SourceMangaRef,
+    DiscoverMode, DiscoverWalk, FetchContext, Source, SourceChapter, SourceChapterRef,
+    SourceManga, SourceMangaRef,
 };
 use crate::crawler::detect::{
    has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
 };

+/// `sources.id` value for this Source impl. Exposed as a const so the
+/// daemon can look up per-source state (e.g. the `seed_completed:<source>`
+/// marker in `crawler_state`) before constructing the Source itself.
+pub const SOURCE_ID: &str = "target";
+
 /// In-loop retry budget for transient pages encountered during a single
 /// `discover` walk. Bounded small because the job system itself retries
 /// the whole `Discover` job on failure — these inline retries only need
@@ -60,15 +66,14 @@ impl TargetSource {
 #[async_trait]
 impl Source for TargetSource {
    fn id(&self) -> &'static str {
-        "target"
+        SOURCE_ID
    }

    async fn discover(
        &self,
        ctx: &FetchContext<'_>,
        mode: DiscoverMode,
-        max_results: Option<usize>,
-    ) -> anyhow::Result<Vec<SourceMangaRef>> {
+    ) -> anyhow::Result<Box<dyn DiscoverWalk + Send>> {
        // Always visit page 1 first because that's the only way to
        // discover `last_page`. Retry it on transient — a broken first
        // page would otherwise abort the whole walk before we've even
@@ -85,15 +90,7 @@ impl Source for TargetSource {
        };

        let backfill = matches!(mode, DiscoverMode::Backfill);
-        let order: Vec<i32> = match (last_page, backfill) {
-            (None, _) => vec![1],
-            // Backfill = oldest-first: walk pages last → 1, then
-            // reverse within each page (the listing is update_date
-            // DESC, so the bottom of the last page is the oldest
-            // entry the source still surfaces).
-            (Some(last), true) => (1..=last).rev().collect(),
-            (Some(last), false) => (1..=last).collect(),
-        };
+        let order = build_page_order(last_page, backfill);
        tracing::info!(
            ?mode,
            last_page = ?last_page,
@@ -101,40 +98,12 @@ impl Source for TargetSource {
            "walking pagination"
        );

-        let mut all = Vec::new();
-        for page_num in order {
-            // Page 1 is already cached from the last_page probe — reuse
-            // it rather than navigating twice. Every other page goes
-            // through the retry helper so a single broken page mid-walk
-            // doesn't silently drop its mangas from the result.
-            let mut page_refs = if page_num == 1 {
-                let doc = scraper::Html::parse_document(&first_html);
-                parse_manga_list_from(&doc)?
-            } else {
-                retry_on_transient(
-                    || async {
-                        let url = page_url(&self.base_url, page_num);
-                        let html = navigate(ctx, &url).await?;
-                        let doc = scraper::Html::parse_document(&html);
-                        parse_manga_list_from(&doc)
-                    },
-                    PAGE_TRANSIENT_RETRY_ATTEMPTS,
-                    PAGE_TRANSIENT_RETRY_DELAY,
-                )
-                .await?
-            };
-            if backfill {
-                page_refs.reverse();
-            }
-            tracing::info!(page_num, count = page_refs.len(), "page walked");
-            all.extend(page_refs);
-            if cap_reached(&all, max_results) {
-                tracing::info!(cap = ?max_results, "max_results reached; halting pagination");
-                break;
-            }
-        }
-
-        Ok(truncate_to_cap(all, max_results))
+        Ok(Box::new(TargetSourceWalker {
+            base_url: self.base_url.clone(),
+            backfill,
+            pages_remaining: order,
+            first_page_html: Some(first_html),
+        }))
    }

    async fn fetch_manga(
@@ -168,15 +137,81 @@ impl Source for TargetSource {
    }
 }

-fn cap_reached<T>(buf: &[T], max: Option<usize>) -> bool {
-    matches!(max, Some(m) if buf.len() >= m)
+/// Build the queue of page numbers `TargetSource::discover` will walk.
+/// Backfill is oldest-first: pages `last` down to `1` (within each page the
+/// walker reverses entries, since the source orders by update_date
+/// DESC). Incremental is newest-first: pages `1..=last` in natural
+/// order. If `last_page` is unknown (source surfaces no pagination)
+/// only page 1 is visited.
+fn build_page_order(last_page: Option<i32>, backfill: bool) -> VecDeque<i32> {
+    match (last_page, backfill) {
+        (None, _) => VecDeque::from([1]),
+        (Some(last), true) => (1..=last).rev().collect(),
+        (Some(last), false) => (1..=last).collect(),
+    }
 }

-fn truncate_to_cap<T>(mut buf: Vec<T>, max: Option<usize>) -> Vec<T> {
-    if let Some(m) = max {
-        buf.truncate(m);
+/// Walker returned by [`TargetSource::discover`]. Pops one source-index
+/// page per `next_batch` call. Page 1's HTML is cached at construction
+/// time (the discover call needed it to read `last_page` anyway) so the
+/// batch covering page 1 doesn't re-fetch.
+struct TargetSourceWalker {
+    base_url: String,
+    backfill: bool,
+    pages_remaining: VecDeque<i32>,
+    first_page_html: Option<String>,
+}
+
+#[async_trait]
+impl DiscoverWalk for TargetSourceWalker {
+    async fn next_batch(
+        &mut self,
+        ctx: &FetchContext<'_>,
+    ) -> anyhow::Result<Option<Vec<SourceMangaRef>>> {
+        let Some(page_num) = self.pages_remaining.pop_front() else {
+            return Ok(None);
+        };
+        let mut page_refs = if page_num == 1 {
+            // Reuse the cached page-1 HTML from the initial probe. Take
+            // it (rather than clone) so a malformed page-order queue
+            // that re-visits page 1 still falls back to a real fetch.
+            match self.first_page_html.take() {
+                Some(html) => {
+                    let doc = scraper::Html::parse_document(&html);
+                    parse_manga_list_from(&doc)?
+                }
+                None => {
+                    retry_on_transient(
+                        || async {
+                            let html = navigate(ctx, self.base_url.as_str()).await?;
+                            let doc = scraper::Html::parse_document(&html);
+                            parse_manga_list_from(&doc)
+                        },
+                        PAGE_TRANSIENT_RETRY_ATTEMPTS,
+                        PAGE_TRANSIENT_RETRY_DELAY,
+                    )
+                    .await?
+                }
+            }
+        } else {
+            retry_on_transient(
+                || async {
+                    let url = page_url(&self.base_url, page_num);
+                    let html = navigate(ctx, &url).await?;
+                    let doc = scraper::Html::parse_document(&html);
+                    parse_manga_list_from(&doc)
+                },
+                PAGE_TRANSIENT_RETRY_ATTEMPTS,
+                PAGE_TRANSIENT_RETRY_DELAY,
+            )
+            .await?
+        };
+        if self.backfill {
+            page_refs.reverse();
+        }
+        tracing::info!(page_num, count = page_refs.len(), "page walked");
+        Ok(Some(page_refs))
    }
-    buf
 }

 /// Single point of rate-limited navigation. Every Source request goes
@@ -922,4 +957,37 @@ mod tests {
        let err = parse_manga_detail(html, "x", true).expect_err("expected Transient");
        assert!(err.is_transient(), "got non-transient: {err}");
    }
+
+    #[test]
+    fn build_page_order_backfill_is_last_to_one() {
+        // Backfill walks pages oldest-first: queue is [last, last-1, ..., 1]
+        // so popping from the front yields the last page first.
+        let order = build_page_order(Some(3), true);
+        assert_eq!(Vec::from(order), vec![3, 2, 1]);
+    }
+
+    #[test]
+    fn build_page_order_incremental_is_one_to_last() {
+        // Incremental walks newest-first in natural source order.
+        let order = build_page_order(Some(3), false);
+        assert_eq!(Vec::from(order), vec![1, 2, 3]);
+    }
+
+    #[test]
+    fn build_page_order_falls_back_to_page_one_only_without_pagination() {
+        let backfill = build_page_order(None, true);
+        assert_eq!(Vec::from(backfill), vec![1]);
+        let incremental = build_page_order(None, false);
+        assert_eq!(Vec::from(incremental), vec![1]);
+    }
+
+    #[test]
+    fn build_page_order_single_page_index_yields_one_entry() {
+        // Sources with exactly one page should not yield duplicates
+        // regardless of mode.
+        let backfill = build_page_order(Some(1), true);
+        assert_eq!(Vec::from(backfill), vec![1]);
+        let incremental = build_page_order(Some(1), false);
+        assert_eq!(Vec::from(incremental), vec![1]);
+    }
 }