feat: crawler manga-list & metadata sync with cover download (0.23.0)

- TargetSource: first concrete impl of the Source trait, modeled on the old Puppeteer crawler's selectors (+ status normalization, tag-count stripping, chapter list)
- DiscoverMode::Backfill walks pagination last->1, reverse within each page (oldest-first); Incremental walks forward
- RateLimiter (tokio-time aware) plumbed through FetchContext so the pagination walk honors the same per-host budget as the outer loop
- repo::crawler: ensure_source, upsert_manga_from_source (returns New/Updated/Unchanged + current cover_image_path for backfill decisions), sync_manga_chapters, mark_dropped_mangas — all transactional, with case-insensitive lookups and source-insertable genres
- Cover image download via reqwest+infer; stored under mangas/{id}/cover.{ext} via the Storage trait
- Single CRAWLER_PROXY env wires both Chromium (--proxy-server) and reqwest::Proxy::all (HTTP/HTTPS/SOCKS5)
- Crawler binary: positional start URL or $CRAWLER_START_URL, $CRAWLER_LIMIT (cap fetches + skip drop pass on partial runs), $CRAWLER_SKIP_CHAPTERS (disable selector AND sync), $CRAWLER_RATE_MS
- Silences chromiumoxide 0.7's known CDP deserialize log spam via default tracing filter + CdpError::Serde downgrade
- 9 sqlx integration tests + 11 selector/rate-limit unit tests
2026-05-21 22:04:23 +02:00
parent 26eccd0abe
commit b1a3a4e9d3
13 changed files with 1930 additions and 39 deletions
--- a/backend/src/bin/crawler.rs
+++ b/backend/src/bin/crawler.rs
@@ -1,29 +1,329 @@
 //! Crawler binary.
 //!
-//! Today: a thin shell that launches Chromium via the shared
-//! `crawler::browser` module and exits. Useful as an ad-hoc smoke test
-//! for the launcher in addition to the integration test in
-//! `tests/crawler_browser_smoke.rs`.
+//! Walks the source's manga listing (all pages), fetches each manga's
+//! metadata + chapter list, downloads the cover into `Storage`, and
+//! reconciles everything into the DB. Chapter *content* (page images)
+//! is out of scope for now — only chapter rows + their source links
+//! are written.
 //!
-//! Future: reads config, picks `Source` impls, runs the job loop.
+//! Configuration:
+//! - **Start URL** (required): first CLI positional arg, else
+//!   `$CRAWLER_START_URL`. This is the manga *list* page (page 1).
+//! - **Database** (required): `$DATABASE_URL`.
+//! - **Storage dir**: `$STORAGE_DIR`, default `./data/storage` —
+//!   matches the API binary so both write to the same local tree.
+//! - **Browser**: see `LaunchOptions::from_env` —
+//!   `CRAWLER_BROWSER_MODE` (`headed`|`headless`) and
+//!   `CRAWLER_BROWSER_ARGS`.
+//! - **Rate limit**: `CRAWLER_RATE_MS` (ms between requests, default
+//!   `1000`).
+//! - **Cap**: `CRAWLER_LIMIT` (max manga detail fetches per run,
+//!   default `0` = no cap).
+//! - **Skip chapters**: `CRAWLER_SKIP_CHAPTERS=1` — turn off the
+//!   chapter selector in the parser AND skip the per-manga
+//!   `sync_manga_chapters` write. Use this for "metadata only" runs.
+//! - **Proxy**: `$CRAWLER_PROXY` — single URL applied to both
+//!   Chromium (`--proxy-server`) and `reqwest::Proxy::all`. Supports
+//!   `http://`, `https://`, and `socks5://` (with optional user:pass).
+//!   Example: `socks5://user:pass@host:1080`. Unset → direct.

-use mangalord::crawler::browser::{self, LaunchOptions};
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::Duration;
+
+use anyhow::{anyhow, Context};
+use mangalord::crawler::{
+    browser::{self, LaunchOptions},
+    rate_limit::RateLimiter,
+    source::{target::TargetSource, DiscoverMode, FetchContext, Source},
+};
+use mangalord::repo;
+use mangalord::storage::{LocalStorage, Storage};
+use sqlx::postgres::PgPoolOptions;
+use sqlx::PgPool;
+use tokio::sync::Mutex;
 use tracing_subscriber::EnvFilter;
+use uuid::Uuid;

 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    dotenvy::dotenv().ok();
    tracing_subscriber::fmt()
        .with_env_filter(
-            EnvFilter::try_from_default_env()
-                .unwrap_or_else(|_| "info,mangalord=debug".into()),
+            EnvFilter::try_from_default_env().unwrap_or_else(|_| {
+                "info,mangalord=debug,chromiumoxide::conn=off,chromiumoxide::handler=off"
+                    .into()
+            }),
        )
        .init();

-    let options = LaunchOptions::from_env();
-    tracing::info!(?options, "launching browser");
-    let handle = browser::launch(options).await?;
-    tracing::info!("browser launched; closing");
-    handle.close().await?;
+    let start_url = resolve_start_url()?;
+    let database_url = std::env::var("DATABASE_URL")
+        .map_err(|_| anyhow!("DATABASE_URL must be set"))?;
+    let storage_dir: PathBuf = std::env::var("STORAGE_DIR")
+        .unwrap_or_else(|_| "./data/storage".to_string())
+        .into();
+    let rate_ms = env_u64("CRAWLER_RATE_MS", 1000);
+    let limit = env_u64("CRAWLER_LIMIT", 0) as usize;
+    let skip_chapters = env_bool("CRAWLER_SKIP_CHAPTERS", false);
+    let proxy_url = std::env::var("CRAWLER_PROXY")
+        .ok()
+        .filter(|s| !s.trim().is_empty());
+
+    let db = PgPoolOptions::new()
+        .max_connections(5)
+        .connect(&database_url)
+        .await
+        .context("connect to database")?;
+    sqlx::migrate!("./migrations").run(&db).await?;
+
+    let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(&storage_dir));
+
+    // `no_proxy()` disables reqwest's own env-based detection so the
+    // single `CRAWLER_PROXY` knob is the only thing that influences
+    // routing. Otherwise an unrelated `HTTPS_PROXY` in the shell would
+    // silently route cover downloads while the browser stayed direct.
+    let mut http_builder = reqwest::Client::builder()
+        .timeout(Duration::from_secs(30))
+        .no_proxy();
+    if let Some(proxy) = &proxy_url {
+        http_builder = http_builder
+            .proxy(reqwest::Proxy::all(proxy).with_context(|| format!("parse proxy URL: {proxy}"))?);
+    }
+    let http = http_builder.build().context("build http client")?;
+
+    let mut options = LaunchOptions::from_env();
+    if let Some(proxy) = &proxy_url {
+        options.extra_args.push(format!("--proxy-server={proxy}"));
+    }
+    tracing::info!(
+        ?options,
+        %start_url,
+        rate_ms,
+        limit,
+        skip_chapters,
+        proxy = ?proxy_url,
+        storage_dir = %storage_dir.display(),
+        "starting crawler"
+    );
+
+    let handle = browser::launch(options).await.context("launch browser")?;
+    let result = run(
+        handle.browser(),
+        &db,
+        storage.as_ref(),
+        &http,
+        &start_url,
+        rate_ms,
+        limit,
+        skip_chapters,
+    )
+    .await;
+    handle.close().await.ok();
+    result
+}
+
+/// Runs one sync pass against the source.
+///
+/// Registers the source row, discovers manga refs in backfill order
+/// (capped at `limit` when it is non-zero), then for each ref: fetches
+/// metadata, upserts it, downloads the cover when needed and — unless
+/// `skip_chapters` is set — syncs the chapter rows. Per-manga failures are
+/// logged and skipped so one bad page does not abort the run.
+///
+/// The final drop pass only runs after an uncapped walk in which every
+/// manga was fetched and upserted successfully.
+///
+/// # Errors
+/// Returns an error if `ensure_source` or discovery fails.
+async fn run(
+    browser: &chromiumoxide::Browser,
+    db: &PgPool,
+    storage: &dyn Storage,
+    http: &reqwest::Client,
+    start_url: &str,
+    rate_ms: u64,
+    limit: usize,
+    skip_chapters: bool,
+) -> anyhow::Result<()> {
+    let rate = Mutex::new(RateLimiter::new(Duration::from_millis(rate_ms)));
+    let source = {
+        let s = TargetSource::new(start_url.to_string());
+        if skip_chapters {
+            s.without_chapter_parsing()
+        } else {
+            s
+        }
+    };
+    let ctx = FetchContext {
+        browser,
+        rate: &rate,
+    };
+
+    let source_id = source.id();
+    repo::crawler::ensure_source(
+        db,
+        source_id,
+        "Target Site",
+        &origin_of(start_url).unwrap_or_else(|| start_url.to_string()),
+    )
+    .await
+    .context("ensure_source")?;
+
+    // Taken before any manga is written; handed to the drop pass below as
+    // the cut-off for this run.
+    let run_started_at = chrono::Utc::now();
+
+    let max_refs = (limit > 0).then_some(limit);
+    tracing::info!(?max_refs, "discovering manga list");
+    let refs = source
+        .discover(&ctx, DiscoverMode::Backfill, max_refs)
+        .await
+        .context("discover failed")?;
+    tracing::info!(count = refs.len(), "discovered manga list");
+
+    let total = refs.len();
+    // Manga that were listed but could not be fetched or upserted this run.
+    let mut failed = 0usize;
+
+    for (i, r) in refs.iter().enumerate() {
+        tracing::info!(idx = i + 1, total, key = %r.source_manga_key, "fetching metadata");
+        let manga = match source.fetch_manga(&ctx, r).await {
+            Ok(m) => m,
+            Err(e) => {
+                tracing::warn!(key = %r.source_manga_key, url = %r.url, error = ?e, "fetch_manga failed");
+                failed += 1;
+                continue;
+            }
+        };
+
+        let upsert = match repo::crawler::upsert_manga_from_source(db, source_id, &r.url, &manga)
+            .await
+        {
+            Ok(u) => u,
+            Err(e) => {
+                tracing::error!(key = %r.source_manga_key, error = ?e, "upsert_manga_from_source failed");
+                failed += 1;
+                continue;
+            }
+        };
+        tracing::info!(
+            key = %manga.source_manga_key,
+            manga_id = %upsert.manga_id,
+            status = ?upsert.status,
+            title = %manga.title,
+            "manga upserted"
+        );
+
+        // Cover image: download when missing in storage (backfill for
+        // mangas synced before cover-download support, plus the New
+        // path) or when metadata changed (cover URL is part of
+        // metadata_hash, so an Updated status implies the URL may
+        // have moved). Failures are non-fatal.
+        let needs_cover = upsert.cover_image_path.is_none()
+            || matches!(upsert.status, repo::crawler::UpsertStatus::Updated);
+        if needs_cover {
+            if let Some(cover_url) = manga.cover_url.as_deref() {
+                if let Err(e) = download_and_store_cover(
+                    db,
+                    storage,
+                    http,
+                    &rate,
+                    &r.url,
+                    upsert.manga_id,
+                    cover_url,
+                )
+                .await
+                {
+                    tracing::warn!(manga_id = %upsert.manga_id, error = ?e, "cover download failed");
+                }
+            }
+        }
+
+        if !skip_chapters {
+            match repo::crawler::sync_manga_chapters(
+                db,
+                source_id,
+                upsert.manga_id,
+                &manga.chapters,
+            )
+            .await
+            {
+                Ok(diff) => tracing::info!(
+                    manga_id = %upsert.manga_id,
+                    new = diff.new,
+                    refreshed = diff.refreshed,
+                    dropped = diff.dropped,
+                    "chapters synced"
+                ),
+                Err(e) => tracing::warn!(manga_id = %upsert.manga_id, error = ?e, "chapter sync failed"),
+            }
+        }
+    }
+
+    // The drop pass is only safe after a complete walk. Presumably
+    // `mark_dropped_mangas` flags every manga of this source not touched
+    // since `run_started_at` (verify against `repo::crawler`); a manga whose
+    // fetch or upsert failed above would then be dropped even though the
+    // source still lists it, so any failure skips the pass.
+    if limit > 0 {
+        tracing::info!(limit, "partial sync — skipping drop pass");
+    } else if failed > 0 {
+        tracing::warn!(failed, total, "some manga failed to sync — skipping drop pass");
+    } else {
+        match repo::crawler::mark_dropped_mangas(db, source_id, run_started_at).await {
+            Ok(n) => tracing::info!(dropped = n, "marked unseen manga as dropped"),
+            Err(e) => tracing::warn!(error = ?e, "drop-pass failed"),
+        }
+    }
+
    Ok(())
 }
+
+/// Downloads a manga's cover and records it in storage and the DB.
+///
+/// `cover_url` may be relative; it is resolved against `manga_url`. The
+/// request waits on the shared `rate` limiter first. The payload type is
+/// sniffed from its magic bytes and must be an image; it is stored under
+/// `mangas/{manga_id}/cover.{ext}` and that key is written to the manga row.
+///
+/// # Errors
+/// Fails on URL resolution, a transport error or non-2xx response, a body
+/// that is not a recognised image, or a storage/DB write failure.
+async fn download_and_store_cover(
+    db: &PgPool,
+    storage: &dyn Storage,
+    http: &reqwest::Client,
+    rate: &Mutex<RateLimiter>,
+    manga_url: &str,
+    manga_id: Uuid,
+    cover_url: &str,
+) -> anyhow::Result<()> {
+    let absolute = reqwest::Url::parse(manga_url)
+        .context("parse manga URL")?
+        .join(cover_url)
+        .context("join cover URL onto manga URL")?;
+
+    rate.lock().await.wait().await;
+    let resp = http
+        .get(absolute.clone())
+        .send()
+        .await
+        .with_context(|| format!("GET {absolute}"))?
+        .error_for_status()
+        .with_context(|| format!("non-2xx for {absolute}"))?;
+    let bytes = resp.bytes().await.context("read cover body")?;
+
+    // `infer` sniffs the magic bytes — same crate the upload handler
+    // uses, so we don't trust the URL's extension. Anything that is not a
+    // recognised image (empty body, HTML error page served with 200, …) is
+    // rejected rather than being stored and recorded as the cover.
+    let kind = infer::get(&bytes)
+        .filter(|k| matches!(k.matcher_type(), infer::MatcherType::Image))
+        .ok_or_else(|| {
+            anyhow!(
+                "cover at {absolute} is not a recognised image ({} bytes)",
+                bytes.len()
+            )
+        })?;
+    let key = format!("mangas/{manga_id}/cover.{}", kind.extension());
+
+    storage
+        .put(&key, &bytes)
+        .await
+        .with_context(|| format!("store cover at {key}"))?;
+    repo::manga::set_cover_image_path(db, manga_id, &key)
+        .await
+        .with_context(|| format!("update cover_image_path for {manga_id}"))?;
+    tracing::info!(manga_id = %manga_id, key = %key, bytes = bytes.len(), %absolute, "cover stored");
+    Ok(())
+}
+
+/// Resolves the listing URL to crawl.
+///
+/// The first CLI positional argument wins; otherwise `$CRAWLER_START_URL`
+/// is used. Blank values are treated as absent (the same way
+/// `CRAWLER_PROXY` is read), so an empty argument falls through to the
+/// environment and an empty environment entry yields the "required" error
+/// instead of an empty URL reaching the crawler.
+///
+/// # Errors
+/// Returns an error when neither source provides a non-blank value.
+fn resolve_start_url() -> anyhow::Result<String> {
+    /// Trims `raw` and returns it only when something is left.
+    fn non_blank(raw: String) -> Option<String> {
+        let trimmed = raw.trim();
+        (!trimmed.is_empty()).then(|| trimmed.to_string())
+    }
+
+    std::env::args()
+        .nth(1)
+        .and_then(non_blank)
+        .or_else(|| std::env::var("CRAWLER_START_URL").ok().and_then(non_blank))
+        .ok_or_else(|| {
+            anyhow!(
+                "start URL is required — pass as first CLI arg or set $CRAWLER_START_URL"
+            )
+        })
+}
+
+/// Extracts the `scheme://host[:port]` origin from an absolute URL.
+///
+/// The authority ends at the first `/`, `?` or `#`, and any `user:pass@`
+/// userinfo is dropped so credentials never end up in the stored origin.
+/// Returns `None` when there is no `://` separator or when the scheme or
+/// host is empty.
+fn origin_of(url: &str) -> Option<String> {
+    let (scheme, rest) = url.split_once("://")?;
+    // A URL may go straight from the host into a query or fragment
+    // (`https://host?page=1`), so `/` is not the only terminator.
+    let authority = rest
+        .split(|c: char| matches!(c, '/' | '?' | '#'))
+        .next()?;
+    // Userinfo, when present, is everything up to the last `@`.
+    let host = authority
+        .rsplit_once('@')
+        .map_or(authority, |(_, host)| host);
+    if scheme.is_empty() || host.is_empty() {
+        return None;
+    }
+    Some(format!("{scheme}://{host}"))
+}
+
+/// Reads environment variable `name` as a `u64`.
+///
+/// Surrounding whitespace is ignored. Falls back to `default` when the
+/// variable is unset, not valid Unicode, or does not parse as a `u64`.
+fn env_u64(name: &str, default: u64) -> u64 {
+    std::env::var(name)
+        .ok()
+        .and_then(|raw| raw.trim().parse().ok())
+        .unwrap_or(default)
+}
+
+/// Reads environment variable `name` as a boolean flag.
+///
+/// Accepts `1`/`true`/`yes` and `0`/`false`/`no`, ignoring ASCII case and
+/// surrounding whitespace. Falls back to `default` when the variable is
+/// unset, not valid Unicode, or holds any other value.
+fn env_bool(name: &str, default: bool) -> bool {
+    match std::env::var(name) {
+        Ok(raw) => match raw.trim().to_ascii_lowercase().as_str() {
+            "1" | "true" | "yes" => true,
+            "0" | "false" | "no" => false,
+            _ => default,
+        },
+        Err(_) => default,
+    }
+}