feat: crawler scaffold with chromium launcher (0.22.0)

- crawler module (browser, source trait, jobs, diff) + binary - chromiumoxide launcher with fetcher feature (auto-downloads Chromium on first run, caches under ~/.cache/mangalord/chromium) - LaunchOptions struct with extra_args, parseable from CRAWLER_BROWSER_MODE and CRAWLER_BROWSER_ARGS - migration 0012 introduces sources, manga_sources, chapter_sources, crawler_jobs - integration tests for headed + headless launch, ipify load+parse, and extra-args propagation (all #[ignore], opt-in)
2026-05-20 22:07:56 +02:00
parent 89b8785a40
commit 26eccd0abe
12 changed files with 1951 additions and 27 deletions
--- a/backend/Cargo.lock
+++ b/backend/Cargo.lock
--- a/backend/Cargo.toml
+++ b/backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "mangalord"
-version = "0.21.3"
+version = "0.22.0"
 edition = "2021"

 [lib]
@@ -10,6 +10,10 @@ path = "src/lib.rs"
 name = "mangalord"
 path = "src/main.rs"

+[[bin]]
+name = "crawler"
+path = "src/bin/crawler.rs"
+
 [dependencies]
 axum = { version = "0.7", features = ["macros", "multipart"] }
 tokio = { version = "1", features = ["full"] }
@@ -36,7 +40,10 @@ time = "0.3"
 infer = "0.16"
 tokio-util = { version = "0.7", features = ["io"] }
 futures-core = "0.3"
+futures-util = "0.3"
 bytes = "1"
+chromiumoxide = { version = "0.7", features = ["tokio-runtime", "_fetcher-rusttls-tokio"], default-features = false }
+scraper = "0.20"

 [dev-dependencies]
 tempfile = "3"
--- a/backend/migrations/0012_crawler.sql
+++ b/backend/migrations/0012_crawler.sql
@@ -0,0 +1,72 @@
+-- Crawler tables.
+--
+-- Same philosophy as 0001_init.sql: new concepts go in new tables
+-- joined to existing ones, not jammed onto `mangas`/`chapters`. A
+-- crawled manga IS a manga; the only thing the source-link tables
+-- carry is "where did this come from and when did we last see it".
+-- That keeps the API and frontend source-agnostic.
+
+-- 1. Source registry. One row per site the crawler knows about.
+--    `config` carries per-site knobs (base URL, rate limits, custom
+--    selectors) so adding a source is a row insert plus a `Source`
+--    trait impl — no schema change.
+CREATE TABLE sources (
+    id          text PRIMARY KEY,
+    name        text NOT NULL,
+    base_url    text NOT NULL,
+    enabled     boolean NOT NULL DEFAULT true,
+    config      jsonb NOT NULL DEFAULT '{}'::jsonb,
+    created_at  timestamptz NOT NULL DEFAULT now()
+);
+
+-- 2. Link tables. `(source_id, source_*_key)` is the natural key the
+--    source itself exposes; the FK to `mangas`/`chapters` is what
+--    threads it back into our domain. `metadata_hash` is the signal
+--    used by `crawler::diff` to detect updates without re-comparing
+--    every field. `last_seen_at` + `dropped_at` is the soft-drop pair.
+CREATE TABLE manga_sources (
+    source_id           text NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
+    source_manga_key    text NOT NULL,
+    manga_id            uuid NOT NULL REFERENCES mangas(id) ON DELETE CASCADE,
+    source_url          text NOT NULL,
+    metadata_hash       text,
+    first_seen_at       timestamptz NOT NULL DEFAULT now(),
+    last_seen_at        timestamptz NOT NULL DEFAULT now(),
+    dropped_at          timestamptz,
+    PRIMARY KEY (source_id, source_manga_key)
+);
+CREATE INDEX manga_sources_manga_idx ON manga_sources (manga_id);
+CREATE INDEX manga_sources_last_seen_idx ON manga_sources (source_id, last_seen_at);
+
+CREATE TABLE chapter_sources (
+    source_id           text NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
+    source_chapter_key  text NOT NULL,
+    chapter_id          uuid NOT NULL REFERENCES chapters(id) ON DELETE CASCADE,
+    source_url          text NOT NULL,
+    first_seen_at       timestamptz NOT NULL DEFAULT now(),
+    last_seen_at        timestamptz NOT NULL DEFAULT now(),
+    dropped_at          timestamptz,
+    PRIMARY KEY (source_id, source_chapter_key)
+);
+CREATE INDEX chapter_sources_chapter_idx ON chapter_sources (chapter_id);
+
+-- 3. Persistent job queue. Workers lease with
+--    `FOR UPDATE SKIP LOCKED`, heartbeat via `leased_until`, and ack
+--    by transitioning state. The partial index keeps the hot path
+--    (pick the next ready job) off the bulk of done/dead rows.
+CREATE TABLE crawler_jobs (
+    id              uuid PRIMARY KEY DEFAULT gen_random_uuid(),
+    payload         jsonb NOT NULL,
+    state           text NOT NULL DEFAULT 'pending'
+                       CHECK (state IN ('pending','running','done','failed','dead')),
+    attempts        integer NOT NULL DEFAULT 0,
+    max_attempts    integer NOT NULL DEFAULT 5,
+    scheduled_at    timestamptz NOT NULL DEFAULT now(),
+    leased_until    timestamptz,
+    last_error      text,
+    created_at      timestamptz NOT NULL DEFAULT now(),
+    updated_at      timestamptz NOT NULL DEFAULT now()
+);
+CREATE INDEX crawler_jobs_ready_idx
+    ON crawler_jobs (scheduled_at)
+    WHERE state IN ('pending', 'failed');
--- a/backend/src/bin/crawler.rs
+++ b/backend/src/bin/crawler.rs
@@ -0,0 +1,29 @@
+//! Crawler binary.
+//!
+//! Today: a thin shell that launches Chromium via the shared
+//! `crawler::browser` module and exits. Useful as an ad-hoc smoke test
+//! for the launcher in addition to the integration test in
+//! `tests/crawler_browser_smoke.rs`.
+//!
+//! Future: reads config, picks `Source` impls, runs the job loop.
+
+use mangalord::crawler::browser::{self, LaunchOptions};
+use tracing_subscriber::EnvFilter;
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    dotenvy::dotenv().ok();
+    tracing_subscriber::fmt()
+        .with_env_filter(
+            EnvFilter::try_from_default_env()
+                .unwrap_or_else(|_| "info,mangalord=debug".into()),
+        )
+        .init();
+
+    let options = LaunchOptions::from_env();
+    tracing::info!(?options, "launching browser");
+    let handle = browser::launch(options).await?;
+    tracing::info!("browser launched; closing");
+    handle.close().await?;
+    Ok(())
+}
--- a/backend/src/crawler/browser.rs
+++ b/backend/src/crawler/browser.rs
@@ -0,0 +1,217 @@
+//! Chromium launcher and lifecycle.
+//!
+//! Uses `chromiumoxide`'s `fetcher` feature so we don't depend on a
+//! system Chrome install — first call downloads a known-good revision
+//! into a cache dir and reuses it forever after. `BrowserMode` toggles
+//! headed vs headless; the headed path needs a display (real `$DISPLAY`
+//! or `xvfb-run`).
+//!
+//! Extra Chromium command-line flags can be supplied through
+//! [`LaunchOptions::extra_args`] in code, or via the
+//! `CRAWLER_BROWSER_ARGS` env var (whitespace-separated) when going
+//! through [`LaunchOptions::from_env`]. The launcher always also
+//! injects `--no-sandbox` and `--disable-dev-shm-usage` because they're
+//! near-mandatory for containerized Chromium; everything else is
+//! caller-provided.
+
+use std::path::PathBuf;
+
+use anyhow::Context;
+use chromiumoxide::browser::{Browser, BrowserConfig};
+use chromiumoxide::fetcher::{BrowserFetcher, BrowserFetcherOptions};
+use futures_util::StreamExt;
+use tokio::task::JoinHandle;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum BrowserMode {
+    /// Real window. Needs `$DISPLAY` (or `xvfb-run` wrapping the
+    /// binary). This is the default the old Puppeteer crawler used and
+    /// the assumed mode for the target site until we prove headless
+    /// works against it.
+    Headed,
+    /// No window. Faster, lower resource use, but more likely to trip
+    /// fingerprinting on hostile sites.
+    Headless,
+}
+
+/// Configuration for a single browser launch.
+///
+/// Public fields rather than a builder — there are only two of them
+/// and callers benefit from struct literal syntax for clarity.
+#[derive(Clone, Debug)]
+pub struct LaunchOptions {
+    pub mode: BrowserMode,
+    /// Extra Chromium flags, appended after the launcher's own
+    /// defaults. Example: `vec!["--lang=de-DE".into(),
+    /// "--window-size=1280,800".into()]`.
+    pub extra_args: Vec<String>,
+}
+
+impl LaunchOptions {
+    pub fn headed() -> Self {
+        Self {
+            mode: BrowserMode::Headed,
+            extra_args: Vec::new(),
+        }
+    }
+
+    pub fn headless() -> Self {
+        Self {
+            mode: BrowserMode::Headless,
+            extra_args: Vec::new(),
+        }
+    }
+
+    /// Reads `CRAWLER_BROWSER_MODE` (exactly `headless` selects headless;
+    /// unset or any other value means `headed`) and `CRAWLER_BROWSER_ARGS`
+    /// (whitespace-separated Chromium flags). Flags containing whitespace
+    /// aren't supported through the env var — use the programmatic API.
+    pub fn from_env() -> Self {
+        let mode = match std::env::var("CRAWLER_BROWSER_MODE").as_deref() {
+            Ok("headless") => BrowserMode::Headless,
+            _ => BrowserMode::Headed,
+        };
+        let extra_args = std::env::var("CRAWLER_BROWSER_ARGS")
+            .map(|s| parse_args(&s))
+            .unwrap_or_default();
+        Self { mode, extra_args }
+    }
+}
+
+impl Default for LaunchOptions {
+    fn default() -> Self {
+        Self::headed()
+    }
+}
+
+/// Whitespace-split a CRAWLER_BROWSER_ARGS-style string. Exposed
+/// separately from `from_env` so it can be unit-tested without
+/// touching process environment.
+pub(crate) fn parse_args(s: &str) -> Vec<String> {
+    s.split_whitespace().map(str::to_string).collect()
+}
+
+/// Owned browser plus the spawned task that drives its CDP event loop.
+/// Dropping `Handle` without calling `close` leaks the Chromium process
+/// — always call `close().await` in production paths.
+pub struct Handle {
+    browser: Browser,
+    driver: JoinHandle<()>,
+}
+
+impl Handle {
+    pub fn browser(&self) -> &Browser {
+        &self.browser
+    }
+
+    pub fn browser_mut(&mut self) -> &mut Browser {
+        &mut self.browser
+    }
+
+    /// Closes the browser and awaits the driver task. Consumes the
+    /// handle, and always returns `Ok`: errors from each step are ignored.
+    pub async fn close(mut self) -> anyhow::Result<()> {
+        let _ = self.browser.close().await;
+        let _ = self.browser.wait().await;
+        let _ = self.driver.await;
+        Ok(())
+    }
+}
+
+/// Launches Chromium. Downloads it on first run via the `fetcher`
+/// feature; subsequent runs hit the cache. The cache dir is
+/// `$CRAWLER_CHROMIUM_DIR` if set, else `$HOME/.cache/mangalord/chromium`,
+/// else `./.chromium-cache` as a last-resort repo-local fallback.
+pub async fn launch(options: LaunchOptions) -> anyhow::Result<Handle> {
+    let cache = cache_dir()?;
+    tokio::fs::create_dir_all(&cache)
+        .await
+        .with_context(|| format!("create cache dir {}", cache.display()))?;
+
+    let fetcher = BrowserFetcher::new(
+        BrowserFetcherOptions::builder()
+            .with_path(&cache)
+            .build()
+            .map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?,
+    );
+    tracing::info!(path = %cache.display(), "ensuring chromium revision is present");
+    let info = fetcher
+        .fetch()
+        .await
+        .context("download chromium via fetcher")?;
+    tracing::info!(executable = %info.executable_path.display(), "chromium ready");
+
+    let mut builder = BrowserConfig::builder()
+        .chrome_executable(info.executable_path)
+        // Linux containers / CI commonly lack the user namespaces
+        // Chromium's sandbox wants. Disable it; the crawler runs in its
+        // own container anyway.
+        .arg("--no-sandbox")
+        .arg("--disable-dev-shm-usage");
+    for arg in &options.extra_args {
+        builder = builder.arg(arg);
+    }
+    if matches!(options.mode, BrowserMode::Headed) {
+        builder = builder.with_head();
+    }
+    tracing::info!(
+        mode = ?options.mode,
+        extra_args = ?options.extra_args,
+        "building browser config"
+    );
+    let config = builder
+        .build()
+        .map_err(|e| anyhow::anyhow!("browser config: {e}"))?;
+
+    let (browser, mut handler) = Browser::launch(config)
+        .await
+        .context("launch chromium")?;
+
+    let driver = tokio::spawn(async move {
+        while let Some(event) = handler.next().await {
+            if let Err(err) = event {
+                tracing::warn!(?err, "chromium handler event error");
+            }
+        }
+    });
+
+    Ok(Handle { browser, driver })
+}
+
+fn cache_dir() -> anyhow::Result<PathBuf> {
+    if let Ok(dir) = std::env::var("CRAWLER_CHROMIUM_DIR") {
+        return Ok(PathBuf::from(dir));
+    }
+    if let Ok(home) = std::env::var("HOME") {
+        return Ok(PathBuf::from(home).join(".cache/mangalord/chromium"));
+    }
+    Ok(PathBuf::from("./.chromium-cache"))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parse_args_splits_on_whitespace() {
+        assert_eq!(
+            parse_args("--lang=de-DE --window-size=1280,800"),
+            vec!["--lang=de-DE", "--window-size=1280,800"]
+        );
+    }
+
+    #[test]
+    fn parse_args_tolerates_irregular_whitespace() {
+        // tabs, multiple spaces, leading/trailing — all collapsed.
+        assert_eq!(
+            parse_args("  --a\t--b   --c=1\n"),
+            vec!["--a", "--b", "--c=1"]
+        );
+    }
+
+    #[test]
+    fn parse_args_empty_string_yields_empty_vec() {
+        assert!(parse_args("").is_empty());
+        assert!(parse_args("   \t\n").is_empty());
+    }
+}
--- a/backend/src/crawler/diff.rs
+++ b/backend/src/crawler/diff.rs
@@ -0,0 +1,15 @@
+//! Change-detection rules between the source and our DB.
+//!
+//! | Event              | Signal                                                                                |
+//! |--------------------|----------------------------------------------------------------------------------------|
+//! | New manga          | `(source_id, source_manga_key)` not in `manga_sources`                                 |
+//! | Updated metadata   | freshly computed `metadata_hash` differs from the stored one                           |
+//! | Dropped manga      | `last_seen_at < discover_run_started_at` for N consecutive successful discover runs    |
+//! | New chapter        | `(source_id, source_chapter_key)` not in `chapter_sources`                             |
+//! | Dropped chapter    | present in DB but absent from the latest `fetch_chapter_list` for the same manga       |
+//!
+//! Dropped is always a soft flag (`dropped_at`), never a row delete —
+//! restoring is a matter of clearing the flag if the source brings the
+//! item back.
+//!
+//! Scaffold only — implementations land once `repo::crawler` exists.
--- a/backend/src/crawler/jobs.rs
+++ b/backend/src/crawler/jobs.rs
@@ -0,0 +1,55 @@
+//! Persistent job queue and the four job kinds.
+//!
+//! Backed by Postgres (the `crawler_jobs` table). Workers lease rows
+//! with `SELECT ... FOR UPDATE SKIP LOCKED`, heartbeat via
+//! `leased_until`, and ack by transitioning to `done` (or backoff /
+//! `dead`). Handlers are idempotent so a crash mid-run is recoverable
+//! by replay.
+//!
+//! Scaffold only — the actual queue wrapper and handler dispatch land
+//! once we have the first `Source` impl exercising the pipeline.
+
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
+
+use super::source::DiscoverMode;
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(tag = "kind", rename_all = "snake_case")]
+pub enum JobPayload {
+    /// Walk the source index and enqueue `SyncManga` jobs.
+    Discover {
+        source_id: String,
+        mode: DiscoverMode,
+    },
+    /// Fetch one manga's detail page, upsert metadata, enqueue
+    /// `SyncChapterList`.
+    SyncManga {
+        source_id: String,
+        source_manga_key: String,
+    },
+    /// Diff the chapter list, enqueue `SyncChapterContent` for new
+    /// chapters, soft-drop vanished ones.
+    SyncChapterList {
+        source_id: String,
+        manga_id: Uuid,
+        source_manga_key: String,
+    },
+    /// Download a single chapter's page images into storage.
+    SyncChapterContent {
+        source_id: String,
+        chapter_id: Uuid,
+        source_chapter_key: String,
+    },
+}
+
+#[derive(Clone, Copy, Debug, sqlx::Type, Serialize, Deserialize)]
+#[sqlx(type_name = "text", rename_all = "snake_case")]
+#[serde(rename_all = "snake_case")]
+pub enum JobState {
+    Pending,
+    Running,
+    Done,
+    Failed,
+    Dead,
+}
--- a/backend/src/crawler/mod.rs
+++ b/backend/src/crawler/mod.rs
@@ -0,0 +1,19 @@
+//! Crawler subsystem.
+//!
+//! Runs as its own binary (`src/bin/crawler.rs`) and shares `domain`,
+//! `repo`, and `storage` with the API binary. Layering mirrors the
+//! `Storage` trait pattern: callers depend on the `source::Source`
+//! trait, not on a concrete site; new sites plug in as additional
+//! impls without touching the job runner.
+//!
+//! Submodules:
+//! - [`browser`]: launches Chromium via `chromiumoxide` (no pooling yet).
+//!   First run downloads a known-good build via the `fetcher` feature.
+//! - [`source`]: the `Source` trait. Per-site impls live alongside it.
+//! - [`jobs`]: job kinds, queue wrapper, handler dispatch.
+//! - [`diff`]: change detection — new / updated / dropped semantics.
+
+pub mod browser;
+pub mod diff;
+pub mod jobs;
+pub mod source;
--- a/backend/src/crawler/source.rs
+++ b/backend/src/crawler/source.rs
@@ -0,0 +1,105 @@
+//! `Source` trait — the per-site abstraction.
+//!
+//! Job handlers depend on this trait, not on a concrete site. Adding a
+//! new site is: implement `Source`, register it in a `sources` table
+//! row, and the existing job pipeline picks it up unchanged.
+//!
+//! Scaffold only — the first concrete impl lands in a follow-up commit
+//! once the target site is locked in.
+
+use async_trait::async_trait;
+use chromiumoxide::browser::Browser;
+use serde::{Deserialize, Serialize};
+
+/// How a `discover` job should walk the source's index.
+#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
+pub enum DiscoverMode {
+    /// Walk every index page from last back to first. Used for the
+    /// initial seed of a source.
+    Backfill,
+    /// Walk index pages from page 1 forward, stopping after
+    /// `stop_after_unchanged` consecutive mangas whose `metadata_hash`
+    /// matches storage. Used for the recurring cron tick.
+    Incremental { stop_after_unchanged: usize },
+}
+
+/// Pointer at a manga in the source's index, before we've fetched the
+/// detail page. The `source_manga_key` is whatever stable id the source
+/// uses (slug, numeric id, etc).
+#[derive(Clone, Debug)]
+pub struct SourceMangaRef {
+    pub source_manga_key: String,
+    pub title: String,
+    pub url: String,
+}
+
+/// Full metadata returned by `fetch_manga`. The hash is computed by the
+/// source impl (typically over the normalized field set) and is the
+/// signal `diff` uses to detect metadata updates.
+#[derive(Clone, Debug)]
+pub struct SourceManga {
+    pub source_manga_key: String,
+    pub title: String,
+    pub alternative_titles: Vec<String>,
+    pub authors: Vec<String>,
+    pub genres: Vec<String>,
+    pub tags: Vec<String>,
+    pub status: Option<String>,
+    pub summary: Option<String>,
+    pub cover_url: Option<String>,
+    pub metadata_hash: String,
+}
+
+#[derive(Clone, Debug)]
+pub struct SourceChapterRef {
+    pub source_chapter_key: String,
+    pub number: i32,
+    pub title: Option<String>,
+    pub url: String,
+}
+
+#[derive(Clone, Debug)]
+pub struct SourceChapter {
+    pub source_chapter_key: String,
+    pub number: i32,
+    pub title: Option<String>,
+    /// Ordered list of page image URLs, ready to be fetched and put
+    /// into `Storage`.
+    pub page_urls: Vec<String>,
+}
+
+/// Context passed to every `Source` call. Borrows the browser, so
+/// impls can `browser.new_page(...)` without bringing their own.
+pub struct FetchContext<'a> {
+    pub browser: &'a Browser,
+}
+
+#[async_trait]
+pub trait Source: Send + Sync {
+    /// Stable identifier — also the row key in the `sources` table.
+    fn id(&self) -> &'static str;
+
+    async fn discover(
+        &self,
+        ctx: &FetchContext<'_>,
+        mode: DiscoverMode,
+    ) -> anyhow::Result<Vec<SourceMangaRef>>;
+
+    async fn fetch_manga(
+        &self,
+        ctx: &FetchContext<'_>,
+        r: &SourceMangaRef,
+    ) -> anyhow::Result<SourceManga>;
+
+    async fn fetch_chapter_list(
+        &self,
+        ctx: &FetchContext<'_>,
+        manga: &SourceManga,
+    ) -> anyhow::Result<Vec<SourceChapterRef>>;
+
+    async fn fetch_chapter(
+        &self,
+        ctx: &FetchContext<'_>,
+        r: &SourceChapterRef,
+    ) -> anyhow::Result<SourceChapter>;
+}
--- a/backend/src/lib.rs
+++ b/backend/src/lib.rs
@@ -2,6 +2,7 @@ pub mod api;
 pub mod app;
 pub mod auth;
 pub mod config;
+pub mod crawler;
 pub mod domain;
 pub mod error;
 pub mod repo;
--- a/backend/tests/crawler_browser_smoke.rs
+++ b/backend/tests/crawler_browser_smoke.rs
@@ -0,0 +1,157 @@
+//! Smoke tests for the Chromium launcher.
+//!
+//! All marked `#[ignore]`: they download ~150 MB of Chromium on first
+//! run via the `fetcher` feature, the headed test requires a real
+//! `$DISPLAY`, and the ipify test needs network. Run them explicitly:
+//!
+//! ```sh
+//! cargo test --test crawler_browser_smoke -- --ignored --nocapture
+//! ```
+//!
+//! Override the cache location with `CRAWLER_CHROMIUM_DIR=/some/path` if
+//! `$HOME/.cache/mangalord/chromium` isn't writable.
+
+use mangalord::crawler::browser::{self, LaunchOptions};
+
+#[tokio::test]
+#[ignore = "downloads Chromium and needs a display; run with --ignored"]
+async fn headed_browser_can_navigate_and_read_title() {
+    // A data URL avoids any network dependency — we're testing the
+    // browser launcher, not connectivity.
+    const PAGE: &str = "data:text/html,<html><head><title>Mangalord%20Smoke</title></head><body>OK</body></html>";
+
+    let handle = browser::launch(LaunchOptions::headed())
+        .await
+        .expect("launch headed chromium");
+
+    let page = handle
+        .browser()
+        .new_page(PAGE)
+        .await
+        .expect("open new page");
+    page.wait_for_navigation()
+        .await
+        .expect("wait for navigation");
+
+    let title = page.get_title().await.expect("get title");
+    assert_eq!(title.as_deref(), Some("Mangalord Smoke"));
+
+    handle.close().await.expect("close cleanly");
+}
+
+#[tokio::test]
+#[ignore = "downloads Chromium; run with --ignored"]
+async fn headless_browser_can_navigate_and_read_title() {
+    const PAGE: &str = "data:text/html,<html><head><title>Headless%20OK</title></head><body></body></html>";
+
+    let handle = browser::launch(LaunchOptions::headless())
+        .await
+        .expect("launch headless chromium");
+
+    let page = handle.browser().new_page(PAGE).await.expect("open new page");
+    page.wait_for_navigation().await.expect("wait for navigation");
+
+    let title = page.get_title().await.expect("get title");
+    assert_eq!(title.as_deref(), Some("Headless OK"));
+
+    handle.close().await.expect("close cleanly");
+}
+
+/// Live end-to-end: navigate to a real page, get the rendered HTML, and
+/// parse it with `scraper`. ipify.org renders the visitor's public IP
+/// into the page DOM, so a successful run proves browser → render →
+/// `Html::parse_document` → selector → text extraction all work
+/// against a real site. This is the same path each future `Source`
+/// impl will take.
+#[tokio::test]
+#[ignore = "needs network; run with --ignored"]
+async fn fetches_public_ip_from_ipify() {
+    use std::time::Duration;
+
+    let handle = browser::launch(LaunchOptions::headless())
+        .await
+        .expect("launch headless chromium");
+
+    let page = handle
+        .browser()
+        .new_page("https://www.ipify.org")
+        .await
+        .expect("open ipify");
+    page.wait_for_navigation().await.expect("wait for navigation");
+    // ipify injects the IP via JS after load, so the navigation event
+    // alone isn't enough — give the script a beat to run.
+    tokio::time::sleep(Duration::from_secs(2)).await;
+
+    let html = page.content().await.expect("get rendered html");
+    let doc = scraper::Html::parse_document(&html);
+    let body_sel = scraper::Selector::parse("body").unwrap();
+    let body_text: String = doc
+        .select(&body_sel)
+        .next()
+        .map(|n| n.text().collect::<Vec<_>>().join(" "))
+        .unwrap_or_default();
+
+    let ip = extract_ipv4(&body_text)
+        .unwrap_or_else(|| panic!("no IPv4 found in ipify body: {body_text}"));
+    eprintln!("ipify says our public IP is: {ip}");
+
+    handle.close().await.expect("close cleanly");
+}
+
+/// Proves that `LaunchOptions::extra_args` actually reach Chromium and
+/// influence its runtime. `--user-agent=...` overrides `navigator.userAgent`,
+/// observable from JS — read it back via `page.evaluate`.
+#[tokio::test]
+#[ignore = "downloads Chromium; run with --ignored"]
+async fn extra_args_reach_chromium() {
+    const UA: &str = "MangalordCrawlerTest/1.0";
+    let options = LaunchOptions {
+        mode: browser::BrowserMode::Headless,
+        extra_args: vec![format!("--user-agent={UA}")],
+    };
+    let handle = browser::launch(options).await.expect("launch with extra args");
+
+    let page = handle
+        .browser()
+        .new_page("about:blank")
+        .await
+        .expect("open page");
+    page.wait_for_navigation().await.expect("wait");
+
+    let ua: String = page
+        .evaluate("navigator.userAgent")
+        .await
+        .expect("evaluate navigator.userAgent")
+        .into_value()
+        .expect("string value");
+    assert_eq!(
+        ua, UA,
+        "extra --user-agent flag should override navigator.userAgent"
+    );
+
+    handle.close().await.expect("close cleanly");
+}
+
+/// Tiny dotted-quad finder — avoids pulling `regex` in just for one
+/// test. Returns the first maximal digit-and-dot run that is exactly
+/// four 0..=255 octets; "1.2.3.4." (trailing dot) does not match.
+fn extract_ipv4(s: &str) -> Option<String> {
+    let bytes = s.as_bytes();
+    let mut i = 0;
+    while i < bytes.len() {
+        if !bytes[i].is_ascii_digit() {
+            i += 1;
+            continue;
+        }
+        let start = i;
+        while i < bytes.len() && (bytes[i].is_ascii_digit() || bytes[i] == b'.') {
+            i += 1;
+        }
+        let candidate = &s[start..i];
+        let parts: Vec<&str> = candidate.split('.').collect();
+        if parts.len() == 4 && parts.iter().all(|p| p.parse::<u8>().is_ok()) {
+            return Some(candidate.to_string());
+        }
+    }
+    None
+}
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,6 +1,6 @@
 {
  "name": "mangalord-frontend",
-  "version": "0.21.3",
+  "version": "0.22.0",
  "private": true,
  "type": "module",
  "scripts": {