feat: crawler scaffold with chromium launcher (0.22.0)

- crawler module (browser, source trait, jobs, diff) + binary
- chromiumoxide launcher with fetcher feature (auto-downloads Chromium on first run, caches under ~/.cache/mangalord/chromium)
- LaunchOptions struct with extra_args, parseable from CRAWLER_BROWSER_MODE and CRAWLER_BROWSER_ARGS
- migration 0012 introduces sources, manga_sources, chapter_sources, crawler_jobs
- integration tests for headed + headless launch, ipify load+parse, and extra-args propagation (all #[ignore], opt-in)
2026-05-20 22:07:56 +02:00
parent 89b8785a40
commit 26eccd0abe
12 changed files with 1951 additions and 27 deletions
--- a/backend/src/crawler/source.rs
+++ b/backend/src/crawler/source.rs
@@ -0,0 +1,105 @@
+//! `Source` trait — the per-site abstraction.
+//!
+//! Job handlers depend on this trait, not on a concrete site. Adding a
+//! new site is: implement `Source`, register it in a `sources` table
+//! row, and the existing job pipeline picks it up unchanged.
+//!
+//! Scaffold only — the first concrete impl lands in a follow-up commit
+//! once the target site is locked in.
+
+use async_trait::async_trait;
+use chromiumoxide::browser::Browser;
+use serde::{Deserialize, Serialize};
+
+/// Strategy a `discover` job uses to walk the source's index (the `mode` of `Source::discover`).
+#[derive(Clone, Copy, Debug, Serialize, Deserialize)] // no serde attrs: default (externally tagged) form
+pub enum DiscoverMode {
+    /// Walk every index page from last back to first. Used for the
+    /// initial seed of a source.
+    Backfill,
+    /// Walk index pages from page 1 forward; stop after `stop_after_unchanged`
+    /// consecutive mangas whose `metadata_hash` matches storage. Used for the
+    /// recurring cron tick. NOTE(review): the meaning of `0` is not defined here.
+    Incremental { stop_after_unchanged: usize },
+}
+
+/// Pointer at a manga in the source's index, before the detail page is
+/// fetched: returned by `Source::discover`, passed to `Source::fetch_manga`.
+/// `source_manga_key` is whatever stable id the source uses (slug, numeric id, etc).
+#[derive(Clone, Debug)]
+pub struct SourceMangaRef {
+    pub source_manga_key: String,
+    pub title: String,
+    pub url: String, // presumably the detail-page URL — TODO confirm once an impl exists
+}
+
+/// Full metadata returned by `fetch_manga`, and the input to
+/// `fetch_chapter_list`. `metadata_hash` is computed by the source impl
+/// (typically over the normalized field set); `diff` uses it to detect metadata updates.
+#[derive(Clone, Debug)]
+pub struct SourceManga {
+    pub source_manga_key: String,
+    pub title: String,
+    pub alternative_titles: Vec<String>,
+    pub authors: Vec<String>,
+    pub genres: Vec<String>,
+    pub tags: Vec<String>,
+    pub status: Option<String>, // free-form text, not an enum; value set is up to the source
+    pub summary: Option<String>,
+    pub cover_url: Option<String>,
+    pub metadata_hash: String, // change-detection signal; algorithm and encoding are left to the impl
+}
+
+#[derive(Clone, Debug)] // chapter-list entry: from `fetch_chapter_list`, into `fetch_chapter`
+pub struct SourceChapterRef {
+    pub source_chapter_key: String, // presumably the source's stable chapter id — TODO confirm
+    pub number: i32, // NOTE(review): integer only; fractional chapters (e.g. 10.5) don't fit — confirm
+    pub title: Option<String>,
+    pub url: String,
+}
+
+#[derive(Clone, Debug)] // fully fetched chapter: the result of `Source::fetch_chapter`
+pub struct SourceChapter {
+    pub source_chapter_key: String,
+    pub number: i32, // NOTE(review): integer only; fractional chapters (e.g. 10.5) don't fit — confirm
+    pub title: Option<String>,
+    /// Ordered list of page image URLs, ready to be fetched and put
+    /// into `Storage`.
+    pub page_urls: Vec<String>,
+}
+
+/// Context passed to every fetching `Source` method. Borrows (does not own) the
+/// browser handle, so impls can `ctx.browser.new_page(...)` without bringing their own.
+pub struct FetchContext<'a> {
+    pub browser: &'a Browser,
+}
+
+#[async_trait] // provides the `async fn` methods on this trait
+pub trait Source: Send + Sync { // every impl must be `Send + Sync`
+    /// Stable identifier — also the row key in the `sources` table.
+    fn id(&self) -> &'static str;
+
+    async fn discover(
+        &self,
+        ctx: &FetchContext<'_>,
+        mode: DiscoverMode,
+    ) -> anyhow::Result<Vec<SourceMangaRef>>; // index entries found, walked according to `mode`
+
+    async fn fetch_manga(
+        &self,
+        ctx: &FetchContext<'_>,
+        r: &SourceMangaRef,
+    ) -> anyhow::Result<SourceManga>; // full metadata (incl. `metadata_hash`) for one index entry
+
+    async fn fetch_chapter_list(
+        &self,
+        ctx: &FetchContext<'_>,
+        manga: &SourceManga,
+    ) -> anyhow::Result<Vec<SourceChapterRef>>; // chapter refs for `manga`; ordering is not specified here
+
+    async fn fetch_chapter(
+        &self,
+        ctx: &FetchContext<'_>,
+        r: &SourceChapterRef,
+    ) -> anyhow::Result<SourceChapter>; // one chapter, including its ordered page image URLs
+}