feat: crawler manga-list & metadata sync with cover download (0.23.0)

- TargetSource: first concrete impl of the Source trait, modeled on the old Puppeteer crawler's selectors (+ status normalization, tag-count stripping, chapter list)
- DiscoverMode::Backfill walks pagination last->1, reverse within each page (oldest-first); Incremental walks forward
- RateLimiter (tokio-time aware) plumbed through FetchContext so the pagination walk honors the same per-host budget as the outer loop
- repo::crawler: ensure_source, upsert_manga_from_source (returns New/Updated/Unchanged + current cover_image_path for backfill decisions), sync_manga_chapters, mark_dropped_mangas — all transactional, with case-insensitive lookups and source-insertable genres
- Cover image download via reqwest+infer; stored under mangas/{id}/cover.{ext} via the Storage trait
- Single CRAWLER_PROXY env wires both Chromium (--proxy-server) and reqwest::Proxy::all (HTTP/HTTPS/SOCKS5)
- Crawler binary: positional start URL or $CRAWLER_START_URL, $CRAWLER_LIMIT (cap fetches + skip drop pass on partial runs), $CRAWLER_SKIP_CHAPTERS (disable selector AND sync), $CRAWLER_RATE_MS
- Silences chromiumoxide 0.7's known CDP deserialize log spam via default tracing filter + CdpError::Serde downgrade
- 9 sqlx integration tests + 11 selector/rate-limit unit tests
2026-05-21 22:04:23 +02:00
parent 26eccd0abe
commit b1a3a4e9d3
13 changed files with 1930 additions and 39 deletions
--- a/backend/src/crawler/source.rs
+++ b/backend/src/crawler/source.rs
@@ -3,9 +3,8 @@
 //! Job handlers depend on this trait, not on a concrete site. Adding a
 //! new site is: implement `Source`, register it in a `sources` table
 //! row, and the existing job pipeline picks it up unchanged.
-//!
-//! Scaffold only — the first concrete impl lands in a follow-up commit
-//! once the target site is locked in.
+
+pub mod target;

 use async_trait::async_trait;
 use chromiumoxide::browser::Browser;
@@ -34,8 +33,10 @@ pub struct SourceMangaRef {
 }

 /// Full metadata returned by `fetch_manga`. The hash is computed by the
-/// source impl (typically over the normalized field set) and is the
-/// signal `diff` uses to detect metadata updates.
+/// source impl over the metadata-only field set (title through
+/// cover_url) — chapter changes are tracked separately via
+/// `chapter_sources`, so they intentionally do not affect
+/// `metadata_hash`.
 #[derive(Clone, Debug)]
 pub struct SourceManga {
    pub source_manga_key: String,
@@ -47,6 +48,10 @@ pub struct SourceManga {
    pub status: Option<String>,
    pub summary: Option<String>,
    pub cover_url: Option<String>,
+    /// Chapters surfaced on the same page as the metadata. Sources
+    /// where the chapter list lives elsewhere can leave this empty
+    /// and supply it via `fetch_chapter_list` instead.
+    pub chapters: Vec<SourceChapterRef>,
    pub metadata_hash: String,
 }

@@ -68,10 +73,13 @@ pub struct SourceChapter {
    pub page_urls: Vec<String>,
 }

-/// Context passed to every `Source` call. Owns the browser handle, so
-/// impls can `browser.new_page(...)` without bringing their own.
+/// Context passed to every `Source` call. Carries the browser handle
+/// plus a shared rate limiter so impls that issue multiple requests in
+/// one call (e.g. pagination walks) honor the same per-host budget as
+/// the outer job loop.
 pub struct FetchContext<'a> {
    pub browser: &'a Browser,
+    pub rate: &'a tokio::sync::Mutex<crate::crawler::rate_limit::RateLimiter>,
 }

 #[async_trait]
@@ -79,10 +87,15 @@ pub trait Source: Send + Sync {
    /// Stable identifier — also the row key in the `sources` table.
    fn id(&self) -> &'static str;

+    /// Returns up to `max_results` manga refs in source order. Pass
+    /// `None` for an uncapped walk (full backfill / incremental sweep).
+    /// Implementations should stop paginating as soon as the cap is
+    /// reached so partial runs don't pay for pages they won't use.
    async fn discover(
        &self,
        ctx: &FetchContext<'_>,
        mode: DiscoverMode,
+        max_results: Option<usize>,
    ) -> anyhow::Result<Vec<SourceMangaRef>>;

    async fn fetch_manga(