feat: crawler scaffold with chromium launcher (0.22.0)

- crawler module (browser, source trait, jobs, diff) + binary - chromiumoxide launcher with fetcher feature (auto-downloads Chromium on first run, caches under ~/.cache/mangalord/chromium) - LaunchOptions struct with extra_args, parseable from CRAWLER_BROWSER_MODE and CRAWLER_BROWSER_ARGS - migration 0012 introduces sources, manga_sources, chapter_sources, crawler_jobs - integration tests for headed + headless launch, ipify load+parse, and extra-args propagation (all #[ignore], opt-in)
2026-05-20 22:07:56 +02:00
parent 89b8785a40
commit 26eccd0abe
12 changed files with 1951 additions and 27 deletions
--- a/backend/Cargo.lock
+++ b/backend/Cargo.lock
--- a/backend/Cargo.toml
+++ b/backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "mangalord"
-version = "0.21.3"
+version = "0.22.0"
 edition = "2021"
 [lib]
@@ -10,6 +10,10 @@ path = "src/lib.rs"
 name = "mangalord"
 path = "src/main.rs"
 [[bin]]
 name = "crawler"
 path = "src/bin/crawler.rs"
 [dependencies]
 axum = { version = "0.7", features = ["macros", "multipart"] }
 tokio = { version = "1", features = ["full"] }
@@ -36,7 +40,10 @@ time = "0.3"
 infer = "0.16"
 tokio-util = { version = "0.7", features = ["io"] }
 futures-core = "0.3"
 futures-util = "0.3"
 bytes = "1"
 chromiumoxide = { version = "0.7", features = ["tokio-runtime", "_fetcher-rusttls-tokio"], default-features = false }
 scraper = "0.20"
 [dev-dependencies]
 tempfile = "3"
--- a/backend/migrations/0012_crawler.sql
+++ b/backend/migrations/0012_crawler.sql
@@ -0,0 +1,72 @@
 -- Crawler tables.
 --
 -- Same philosophy as 0001_init.sql: new concepts go in new tables
 -- joined to existing ones, not jammed onto `mangas`/`chapters`. A
 -- crawled manga IS a manga; the only thing the source-link tables
 -- carry is "where did this come from and when did we last see it".
 -- That keeps the API and frontend source-agnostic.
 -- 1. Source registry. One row per site the crawler knows about.
 --    `config` carries per-site knobs (base URL, rate limits, custom
 --    selectors) so adding a source is a row insert plus a `Source`
 --    trait impl — no schema change.
 CREATE TABLE sources (
    -- Stable source identifier; the crawler's `Source::id()` is documented
    -- as returning this row key.
    id          text PRIMARY KEY,
    -- Display name of the site.
    name        text NOT NULL,
    base_url    text NOT NULL,
    -- NOTE(review): presumably lets a source be switched off without
    -- deleting its row — confirm once the job runner reads it.
    enabled     boolean NOT NULL DEFAULT true,
    -- Per-site knobs (rate limits, custom selectors, ...); defaults to an
    -- empty JSON object.
    config      jsonb NOT NULL DEFAULT '{}'::jsonb,
    created_at  timestamptz NOT NULL DEFAULT now()
 );
 -- 2. Link tables. `(source_id, source_*_key)` is the natural key the
 --    source itself exposes; the FK to `mangas`/`chapters` is what
 --    threads it back into our domain. `metadata_hash` is the signal
 --    used by `crawler::diff` to detect updates without re-comparing
 --    every field. `last_seen_at` + `dropped_at` is the soft-drop pair.
 CREATE TABLE manga_sources (
    -- Owning source; link rows are removed when the source row is deleted.
    source_id           text NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
    -- Identifier the source itself uses for this manga.
    source_manga_key    text NOT NULL,
    -- Our domain row; link rows are removed when the manga is deleted.
    manga_id            uuid NOT NULL REFERENCES mangas(id) ON DELETE CASCADE,
    source_url          text NOT NULL,
    -- Nullable: no hash is stored until one has been computed.
    metadata_hash       text,
    first_seen_at       timestamptz NOT NULL DEFAULT now(),
    last_seen_at        timestamptz NOT NULL DEFAULT now(),
    -- Soft-drop flag: NULL while the item is live.
    dropped_at          timestamptz,
    PRIMARY KEY (source_id, source_manga_key)
 );
 -- Reverse lookup: all source links for one of our mangas.
 CREATE INDEX manga_sources_manga_idx ON manga_sources (manga_id);
 -- Per-source scans ordered/filtered by `last_seen_at`.
 CREATE INDEX manga_sources_last_seen_idx ON manga_sources (source_id, last_seen_at);
 CREATE TABLE chapter_sources (
    -- Owning source; link rows are removed when the source row is deleted.
    source_id           text NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
    -- Identifier the source itself uses for this chapter.
    -- NOTE(review): the primary key assumes this is unique per source, not
    -- merely per manga — confirm for each site before adding its impl.
    source_chapter_key  text NOT NULL,
    -- Our domain row; link rows are removed when the chapter is deleted.
    chapter_id          uuid NOT NULL REFERENCES chapters(id) ON DELETE CASCADE,
    source_url          text NOT NULL,
    first_seen_at       timestamptz NOT NULL DEFAULT now(),
    last_seen_at        timestamptz NOT NULL DEFAULT now(),
    -- Soft-drop flag: NULL while the item is live.
    dropped_at          timestamptz,
    PRIMARY KEY (source_id, source_chapter_key)
 );
 -- Reverse lookup: all source links for one of our chapters.
 CREATE INDEX chapter_sources_chapter_idx ON chapter_sources (chapter_id);
 -- 3. Persistent job queue. Workers lease with
 --    `FOR UPDATE SKIP LOCKED`, heartbeat via `leased_until`, and ack
 --    by transitioning state. The partial index keeps the hot path
 --    (pick the next ready job) off the bulk of done/dead rows.
 CREATE TABLE crawler_jobs (
    id              uuid PRIMARY KEY DEFAULT gen_random_uuid(),
    -- NOTE(review): presumably the JSON-serialized `JobPayload` from
    -- `crawler::jobs` — confirm once the queue wrapper lands.
    payload         jsonb NOT NULL,
    -- Allowed values match the variants of `crawler::jobs::JobState`
    -- (snake_case).
    state           text NOT NULL DEFAULT 'pending'
                        CHECK (state IN ('pending','running','done','failed','dead')),
    -- NOTE(review): presumably a retry counter and its cap before a job
    -- becomes 'dead' — nothing in this migration enforces that.
    attempts        integer NOT NULL DEFAULT 0,
    max_attempts    integer NOT NULL DEFAULT 5,
    -- Earliest time the job may be picked up; the ready index is keyed on it.
    scheduled_at    timestamptz NOT NULL DEFAULT now(),
    -- Lease expiry used for worker heartbeats; NULL when not leased.
    leased_until    timestamptz,
    last_error      text,
    created_at      timestamptz NOT NULL DEFAULT now(),
    -- NOTE(review): only defaulted at insert; no trigger here keeps it
    -- current, so writers must set it explicitly.
    updated_at      timestamptz NOT NULL DEFAULT now()
 );
 -- Partial index covering only rows that can still be picked up
 -- ('pending' or 'failed'), ordered by when they become due.
 CREATE INDEX crawler_jobs_ready_idx
    ON crawler_jobs (scheduled_at)
    WHERE state IN ('pending', 'failed');
--- a/backend/src/bin/crawler.rs
+++ b/backend/src/bin/crawler.rs
@@ -0,0 +1,29 @@
 //! Crawler binary.
 //!
 //! Today: a thin shell that launches Chromium via the shared
 //! `crawler::browser` module and exits. Useful as an ad-hoc smoke test
 //! for the launcher in addition to the integration test in
 //! `tests/crawler_browser_smoke.rs`.
 //!
 //! Future: reads config, picks `Source` impls, runs the job loop.
 use mangalord::crawler::browser::{self, LaunchOptions};
 use tracing_subscriber::EnvFilter;
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    dotenvy::dotenv().ok();
    tracing_subscriber::fmt()
        .with_env_filter(
            EnvFilter::try_from_default_env()
                .unwrap_or_else(|_| "info,mangalord=debug".into()),
        )
        .init();
    let options = LaunchOptions::from_env();
    tracing::info!(?options, "launching browser");
    let handle = browser::launch(options).await?;
    tracing::info!("browser launched; closing");
    handle.close().await?;
    Ok(())
 }
--- a/backend/src/crawler/browser.rs
+++ b/backend/src/crawler/browser.rs
@@ -0,0 +1,217 @@
 //! Chromium launcher and lifecycle.
 //!
 //! Uses `chromiumoxide`'s `fetcher` feature so we don't depend on a
 //! system Chrome install — first call downloads a known-good revision
 //! into a cache dir and reuses it forever after. `BrowserMode` toggles
 //! headed vs headless; the headed path needs a display (real `$DISPLAY`
 //! or `xvfb-run`).
 //!
 //! Extra Chromium command-line flags can be supplied through
 //! [`LaunchOptions::extra_args`] in code, or via the
 //! `CRAWLER_BROWSER_ARGS` env var (whitespace-separated) when going
 //! through [`LaunchOptions::from_env`]. The launcher always also
 //! injects `--no-sandbox` and `--disable-dev-shm-usage` because they're
 //! near-mandatory for containerized Chromium; everything else is
 //! caller-provided.
 use std::path::PathBuf;
 use anyhow::Context;
 use chromiumoxide::browser::{Browser, BrowserConfig};
 use chromiumoxide::fetcher::{BrowserFetcher, BrowserFetcherOptions};
 use futures_util::StreamExt;
 use tokio::task::JoinHandle;
 /// Whether Chromium is launched with a visible window
 /// ([`BrowserMode::Headed`]) or without one ([`BrowserMode::Headless`]).
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum BrowserMode {
    /// Real window. Needs `$DISPLAY` (or `xvfb-run` wrapping the
    /// binary). This is the default the old Puppeteer crawler used and
    /// the assumed mode for the target site until we prove headless
    /// works against it.
    Headed,
    /// No window. Faster, lower resource use, but more likely to trip
    /// fingerprinting on hostile sites.
    Headless,
 }
 /// Configuration for a single browser launch.
 ///
 /// Public fields rather than a builder — there are only two of them
 /// and callers benefit from struct literal syntax for clarity.
 #[derive(Clone, Debug)]
 pub struct LaunchOptions {
    /// Headed or headless launch; `launch` only requests a window for
    /// [`BrowserMode::Headed`].
    pub mode: BrowserMode,
    /// Extra Chromium flags, appended after the launcher's own
    /// defaults. Example: `vec!["--lang=de-DE".into(),
    /// "--window-size=1280,800".into()]`.
    pub extra_args: Vec<String>,
 }
 impl LaunchOptions {
    pub fn headed() -> Self {
        Self {
            mode: BrowserMode::Headed,
            extra_args: Vec::new(),
        }
    }
    pub fn headless() -> Self {
        Self {
            mode: BrowserMode::Headless,
            extra_args: Vec::new(),
        }
    }
    /// Reads `CRAWLER_BROWSER_MODE` (`headless`|`headed`, default
    /// `headed`) and `CRAWLER_BROWSER_ARGS` (whitespace-separated
    /// Chromium flags). Flags containing whitespace aren't supported
    /// through the env var — use the programmatic API for those.
    pub fn from_env() -> Self {
        let mode = match std::env::var("CRAWLER_BROWSER_MODE").as_deref() {
            Ok("headless") => BrowserMode::Headless,
            _ => BrowserMode::Headed,
        };
        let extra_args = std::env::var("CRAWLER_BROWSER_ARGS")
            .map(|s| parse_args(&s))
            .unwrap_or_default();
        Self { mode, extra_args }
    }
 }
 impl Default for LaunchOptions {
    fn default() -> Self {
        Self::headed()
    }
 }
 /// Splits a `CRAWLER_BROWSER_ARGS`-style string into individual flags.
 ///
 /// Any run of whitespace separates flags; leading and trailing
 /// whitespace is ignored, so an empty or blank input yields an empty
 /// vector. Kept apart from `from_env` so it can be unit-tested without
 /// touching the process environment.
 pub(crate) fn parse_args(s: &str) -> Vec<String> {
    let mut flags = Vec::new();
    for token in s.split_whitespace() {
        flags.push(String::from(token));
    }
    flags
 }
 /// Owned browser plus the spawned task that drives its CDP event loop.
 /// Dropping `Handle` without calling `close` leaks the Chromium process
 /// — always call `close().await` in production paths.
 pub struct Handle {
    /// The launched `chromiumoxide` browser.
    browser: Browser,
    /// Task spawned by `launch` that polls the browser's handler stream
    /// until it ends.
    driver: JoinHandle<()>,
 }
 impl Handle {
    pub fn browser(&self) -> &Browser {
        &self.browser
    }
    pub fn browser_mut(&mut self) -> &mut Browser {
        &mut self.browser
    }
    /// Closes the browser and awaits the driver task. Safe to call
    /// multiple times — subsequent calls are no-ops.
    pub async fn close(mut self) -> anyhow::Result<()> {
        let _ = self.browser.close().await;
        let _ = self.browser.wait().await;
        let _ = self.driver.await;
        Ok(())
    }
 }
 /// Launches Chromium. Downloads it on first run via the `fetcher`
 /// feature; subsequent runs hit the cache. The cache dir is
 /// `$CRAWLER_CHROMIUM_DIR` if set, else `$HOME/.cache/mangalord/chromium`,
 /// else `./.chromium-cache` as a last-resort repo-local fallback.
 pub async fn launch(options: LaunchOptions) -> anyhow::Result<Handle> {
    let cache = cache_dir()?;
    tokio::fs::create_dir_all(&cache)
        .await
        .with_context(|| format!("create cache dir {}", cache.display()))?;
    let fetcher = BrowserFetcher::new(
        BrowserFetcherOptions::builder()
            .with_path(&cache)
            .build()
            .map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?,
    );
    tracing::info!(path = %cache.display(), "ensuring chromium revision is present");
    let info = fetcher
        .fetch()
        .await
        .context("download chromium via fetcher")?;
    tracing::info!(executable = %info.executable_path.display(), "chromium ready");
    let mut builder = BrowserConfig::builder()
        .chrome_executable(info.executable_path)
        // Linux containers / CI commonly lack the user namespaces
        // Chromium's sandbox wants. Disable it; the crawler runs in its
        // own container anyway.
        .arg("--no-sandbox")
        .arg("--disable-dev-shm-usage");
    for arg in &options.extra_args {
        builder = builder.arg(arg);
    }
    if matches!(options.mode, BrowserMode::Headed) {
        builder = builder.with_head();
    }
    tracing::info!(
        mode = ?options.mode,
        extra_args = ?options.extra_args,
        "building browser config"
    );
    let config = builder
        .build()
        .map_err(|e| anyhow::anyhow!("browser config: {e}"))?;
    let (browser, mut handler) = Browser::launch(config)
        .await
        .context("launch chromium")?;
    let driver = tokio::spawn(async move {
        while let Some(event) = handler.next().await {
            if let Err(err) = event {
                tracing::warn!(?err, "chromium handler event error");
            }
        }
    });
    Ok(Handle { browser, driver })
 }
 /// Resolves the directory Chromium is downloaded into and cached in.
 ///
 /// Order of precedence: `$CRAWLER_CHROMIUM_DIR`, then
 /// `$HOME/.cache/mangalord/chromium`, then the repo-local
 /// `./.chromium-cache`. Variables are read as raw OS strings so a
 /// non-UTF-8 path is honored rather than skipped, and an empty value is
 /// treated as unset. Currently infallible; the `Result` is kept so
 /// callers do not change if resolution ever needs to fail.
 fn cache_dir() -> anyhow::Result<PathBuf> {
    let non_empty = |name: &str| std::env::var_os(name).filter(|value| !value.is_empty());

    if let Some(dir) = non_empty("CRAWLER_CHROMIUM_DIR") {
        return Ok(PathBuf::from(dir));
    }
    if let Some(home) = non_empty("HOME") {
        return Ok(PathBuf::from(home).join(".cache/mangalord/chromium"));
    }
    Ok(PathBuf::from("./.chromium-cache"))
 }
 #[cfg(test)]
 mod tests {
    use super::parse_args;

    /// Two flags separated by a single space come back as two entries.
    #[test]
    fn parse_args_splits_on_whitespace() {
        let parsed = parse_args("--lang=de-DE --window-size=1280,800");
        assert_eq!(parsed, ["--lang=de-DE", "--window-size=1280,800"]);
    }

    /// Tabs, repeated spaces, and leading/trailing whitespace all act as
    /// plain separators.
    #[test]
    fn parse_args_tolerates_irregular_whitespace() {
        let parsed = parse_args("  --a\t--b   --c=1\n");
        assert_eq!(parsed, ["--a", "--b", "--c=1"]);
    }

    /// Empty and whitespace-only input both produce no flags.
    #[test]
    fn parse_args_empty_string_yields_empty_vec() {
        for blank in ["", "   \t\n"] {
            assert!(parse_args(blank).is_empty());
        }
    }
 }
--- a/backend/src/crawler/diff.rs
+++ b/backend/src/crawler/diff.rs
@@ -0,0 +1,15 @@
 //! Change-detection rules between the source and our DB.
 //!
 //! | Event              | Signal                                                                                |
 //! |--------------------|----------------------------------------------------------------------------------------|
 //! | New manga          | `(source_id, source_manga_key)` not in `manga_sources`                                 |
 //! | Updated metadata   | freshly computed `metadata_hash` differs from the stored one                           |
 //! | Dropped manga      | `last_seen_at < discover_run_started_at` for N consecutive successful discover runs    |
 //! | New chapter        | `(source_id, source_chapter_key)` not in `chapter_sources`                             |
 //! | Dropped chapter    | present in DB but absent from the latest `fetch_chapter_list` for the same manga       |
 //!
 //! Dropped is always a soft flag (`dropped_at`), never a row delete —
 //! restoring is a matter of clearing the flag if the source brings the
 //! item back.
 //!
 //! Scaffold only — implementations land once `repo::crawler` exists.
--- a/backend/src/crawler/jobs.rs
+++ b/backend/src/crawler/jobs.rs
@@ -0,0 +1,55 @@
 //! Persistent job queue and the four job kinds.
 //!
 //! Backed by Postgres (the `crawler_jobs` table). Workers lease rows
 //! with `SELECT ... FOR UPDATE SKIP LOCKED`, heartbeat via
 //! `leased_until`, and ack by transitioning to `done` (or backoff /
 //! `dead`). Handlers are idempotent so a crash mid-run is recoverable
 //! by replay.
 //!
 //! Scaffold only — the actual queue wrapper and handler dispatch land
 //! once we have the first `Source` impl exercising the pipeline.
 use serde::{Deserialize, Serialize};
 use uuid::Uuid;
 use super::source::DiscoverMode;
 /// The kinds of work the crawler queue carries.
 ///
 /// Serialized as an internally tagged enum: the variant name goes into
 /// a `kind` field in snake_case (e.g. `sync_manga`) next to the
 /// variant's own fields.
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[serde(tag = "kind", rename_all = "snake_case")]
 pub enum JobPayload {
    /// Walk the source index and enqueue `SyncManga` jobs.
    Discover {
        source_id: String,
        mode: DiscoverMode,
    },
    /// Fetch one manga's detail page, upsert metadata, enqueue
    /// `SyncChapterList`.
    SyncManga {
        source_id: String,
        source_manga_key: String,
    },
    /// Diff the chapter list, enqueue `SyncChapterContent` for new
    /// chapters, soft-drop vanished ones.
    SyncChapterList {
        source_id: String,
        manga_id: Uuid,
        source_manga_key: String,
    },
    /// Download a single chapter's page images into storage.
    SyncChapterContent {
        source_id: String,
        chapter_id: Uuid,
        source_chapter_key: String,
    },
 }
 /// Lifecycle state of a queued job.
 ///
 /// The snake_case names of these variants are the values allowed by the
 /// `CHECK` constraint on `crawler_jobs.state` in migration 0012; keep
 /// the two in sync when adding a state.
 #[derive(Clone, Copy, Debug, sqlx::Type, Serialize, Deserialize)]
 #[sqlx(type_name = "text", rename_all = "snake_case")]
 #[serde(rename_all = "snake_case")]
 pub enum JobState {
    /// Column default for newly inserted jobs.
    Pending,
    Running,
    Done,
    Failed,
    Dead,
 }
--- a/backend/src/crawler/mod.rs
+++ b/backend/src/crawler/mod.rs
@@ -0,0 +1,19 @@
 //! Crawler subsystem.
 //!
 //! Runs as its own binary (`src/bin/crawler.rs`) and shares `domain`,
 //! `repo`, and `storage` with the API binary. Layering mirrors the
 //! `Storage` trait pattern: callers depend on the `source::Source`
 //! trait, not on a concrete site; new sites plug in as additional
 //! impls without touching the job runner.
 //!
 //! Submodules:
 //! - [`browser`]: launches Chromium via `chromiumoxide` and manages its lifecycle.
 //!   First run downloads a known-good build via the `fetcher` feature.
 //! - [`source`]: the `Source` trait. Per-site impls live alongside it.
 //! - [`jobs`]: job kinds, queue wrapper, handler dispatch.
 //! - [`diff`]: change detection — new / updated / dropped semantics.
 pub mod browser;
 pub mod diff;
 pub mod jobs;
 pub mod source;
--- a/backend/src/crawler/source.rs
+++ b/backend/src/crawler/source.rs
@@ -0,0 +1,105 @@
 //! `Source` trait — the per-site abstraction.
 //!
 //! Job handlers depend on this trait, not on a concrete site. Adding a
 //! new site is: implement `Source`, register it in a `sources` table
 //! row, and the existing job pipeline picks it up unchanged.
 //!
 //! Scaffold only — the first concrete impl lands in a follow-up commit
 //! once the target site is locked in.
 use async_trait::async_trait;
 use chromiumoxide::browser::Browser;
 use serde::{Deserialize, Serialize};
 /// How a `discover` job should walk the source's index.
 ///
 /// NOTE(review): unlike `JobPayload`, this enum has no serde
 /// `rename_all`, so its variants serialize under their Rust names
 /// (`Backfill`, `Incremental`) inside an otherwise snake_case payload —
 /// confirm that is intended before payloads are persisted.
 #[derive(Clone, Copy, Debug, Serialize, Deserialize)]
 pub enum DiscoverMode {
    /// Walk every index page from last back to first. Used for the
    /// initial seed of a source.
    Backfill,
    /// Walk index pages from page 1 forward, stopping after
    /// `stop_after_unchanged` consecutive mangas whose `metadata_hash`
    /// matches storage. Used for the recurring cron tick.
    Incremental { stop_after_unchanged: usize },
 }
 /// Pointer at a manga in the source's index, before we've fetched the
 /// detail page. The `source_manga_key` is whatever stable id the source
 /// uses (slug, numeric id, etc).
 #[derive(Clone, Debug)]
 pub struct SourceMangaRef {
    /// Stable per-source identifier for the manga.
    pub source_manga_key: String,
    /// Title as listed in the index.
    pub title: String,
    /// NOTE(review): presumably the manga's detail-page URL — confirm
    /// with the first `Source` impl.
    pub url: String,
 }
 /// Full metadata returned by `fetch_manga`. The hash is computed by the
 /// source impl (typically over the normalized field set) and is the
 /// signal `diff` uses to detect metadata updates.
 #[derive(Clone, Debug)]
 pub struct SourceManga {
    /// Stable per-source identifier for the manga.
    pub source_manga_key: String,
    pub title: String,
    pub alternative_titles: Vec<String>,
    pub authors: Vec<String>,
    pub genres: Vec<String>,
    pub tags: Vec<String>,
    /// `None` when the source provides no status.
    /// NOTE(review): free-form text here; the vocabulary is up to each
    /// source impl.
    pub status: Option<String>,
    pub summary: Option<String>,
    pub cover_url: Option<String>,
    /// Compared against the stored `metadata_hash` to detect metadata
    /// updates.
    pub metadata_hash: String,
 }
 /// Entry in a manga's chapter list, as returned by
 /// `Source::fetch_chapter_list` and consumed by `Source::fetch_chapter`.
 #[derive(Clone, Debug)]
 pub struct SourceChapterRef {
    /// Identifier the source uses for this chapter.
    pub source_chapter_key: String,
    /// Chapter number.
    /// NOTE(review): an integer cannot represent fractional chapters
    /// (e.g. 10.5) — confirm how sources that use them should map here.
    pub number: i32,
    pub title: Option<String>,
    /// NOTE(review): presumably the chapter's reader-page URL — confirm
    /// with the first `Source` impl.
    pub url: String,
 }
 /// A fully fetched chapter, as returned by `Source::fetch_chapter`.
 #[derive(Clone, Debug)]
 pub struct SourceChapter {
    /// Identifier the source uses for this chapter.
    pub source_chapter_key: String,
    /// Chapter number.
    pub number: i32,
    pub title: Option<String>,
    /// Ordered list of page image URLs, ready to be fetched and put
    /// into `Storage`.
    pub page_urls: Vec<String>,
 }
 /// Context passed to every `Source` call. Borrows the shared browser
 /// for the duration of the call, so impls can `browser.new_page(...)`
 /// without bringing their own.
 pub struct FetchContext<'a> {
    /// Browser lent to the source impl; the caller keeps ownership.
    pub browser: &'a Browser,
 }
 /// Per-site crawler abstraction. Implementations must be `Send + Sync`;
 /// every fetch method receives a [`FetchContext`] with the browser to
 /// use and reports failure through `anyhow::Result`.
 #[async_trait]
 pub trait Source: Send + Sync {
    /// Stable identifier — also the row key in the `sources` table.
    fn id(&self) -> &'static str;
    /// Walks the source's index as directed by `mode` and returns the
    /// manga pointers found.
    async fn discover(
        &self,
        ctx: &FetchContext<'_>,
        mode: DiscoverMode,
    ) -> anyhow::Result<Vec<SourceMangaRef>>;
    /// Fetches the full metadata for the manga that `r` points at.
    async fn fetch_manga(
        &self,
        ctx: &FetchContext<'_>,
        r: &SourceMangaRef,
    ) -> anyhow::Result<SourceManga>;
    /// Returns the chapter entries the source lists for `manga`.
    async fn fetch_chapter_list(
        &self,
        ctx: &FetchContext<'_>,
        manga: &SourceManga,
    ) -> anyhow::Result<Vec<SourceChapterRef>>;
    /// Fetches one chapter, including its ordered page image URLs.
    async fn fetch_chapter(
        &self,
        ctx: &FetchContext<'_>,
        r: &SourceChapterRef,
    ) -> anyhow::Result<SourceChapter>;
 }
--- a/backend/src/lib.rs
+++ b/backend/src/lib.rs
@@ -2,6 +2,7 @@ pub mod api;
 pub mod app;
 pub mod auth;
 pub mod config;
 pub mod crawler;
 pub mod domain;
 pub mod error;
 pub mod repo;
--- a/backend/tests/crawler_browser_smoke.rs
+++ b/backend/tests/crawler_browser_smoke.rs
@@ -0,0 +1,157 @@
 //! Smoke test for the Chromium launcher.
 //!
 //! Marked `#[ignore]` because it (a) downloads ~150 MB of Chromium on
 //! first run via the `fetcher` feature and (b) requires a real `$DISPLAY`
 //! for the headed path. Run it explicitly:
 //!
 //! ```sh
 //! cargo test --test crawler_browser_smoke -- --ignored --nocapture
 //! ```
 //!
 //! Override the cache location with `CRAWLER_CHROMIUM_DIR=/some/path` if
 //! `$HOME/.cache/mangalord/chromium` isn't writable.
 use mangalord::crawler::browser::{self, LaunchOptions};
 /// Launches headed Chromium, opens an inline page, and checks the
 /// title it reports.
 #[tokio::test]
 #[ignore = "downloads Chromium and needs a display; run with --ignored"]
 async fn headed_browser_can_navigate_and_read_title() {
    // Inline data URL: this exercises the launcher, not connectivity.
    const PAGE: &str = "data:text/html,<html><head><title>Mangalord%20Smoke</title></head><body>OK</body></html>";

    let chromium = browser::launch(LaunchOptions::headed())
        .await
        .expect("launch headed chromium");

    let tab = chromium.browser().new_page(PAGE).await.expect("open new page");
    tab.wait_for_navigation().await.expect("wait for navigation");

    let observed = tab.get_title().await.expect("get title");
    assert_eq!(observed.as_deref(), Some("Mangalord Smoke"));

    chromium.close().await.expect("close cleanly");
 }
 /// Launches headless Chromium, opens an inline page, and checks the
 /// title it reports.
 #[tokio::test]
 #[ignore = "downloads Chromium; run with --ignored"]
 async fn headless_browser_can_navigate_and_read_title() {
    const PAGE: &str = "data:text/html,<html><head><title>Headless%20OK</title></head><body></body></html>";

    let chromium = browser::launch(LaunchOptions::headless())
        .await
        .expect("launch headless chromium");

    let tab = chromium
        .browser()
        .new_page(PAGE)
        .await
        .expect("open new page");
    tab.wait_for_navigation()
        .await
        .expect("wait for navigation");

    let observed = tab.get_title().await.expect("get title");
    assert_eq!(observed.as_deref(), Some("Headless OK"));

    chromium.close().await.expect("close cleanly");
 }
 /// Live end-to-end check against a real site: load ipify.org in
 /// headless Chromium, take the rendered HTML, parse it with `scraper`,
 /// and pull an IPv4 address out of the body text. A pass shows that
 /// browser → render → `Html::parse_document` → selector → text
 /// extraction all work together, which is the path each future
 /// `Source` impl will take.
 #[tokio::test]
 #[ignore = "needs network; run with --ignored"]
 async fn fetches_public_ip_from_ipify() {
    use std::time::Duration;

    let chromium = browser::launch(LaunchOptions::headless())
        .await
        .expect("launch headless chromium");
    let tab = chromium
        .browser()
        .new_page("https://www.ipify.org")
        .await
        .expect("open ipify");
    tab.wait_for_navigation().await.expect("wait for navigation");

    // The IP is injected by JS after load, so the navigation event alone
    // isn't enough — give the script a moment to run.
    tokio::time::sleep(Duration::from_secs(2)).await;

    let rendered = tab.content().await.expect("get rendered html");
    let document = scraper::Html::parse_document(&rendered);
    let body_selector = scraper::Selector::parse("body").unwrap();
    let body_text: String = match document.select(&body_selector).next() {
        Some(body) => body.text().collect::<Vec<_>>().join(" "),
        None => String::new(),
    };

    let ip = match extract_ipv4(&body_text) {
        Some(found) => found,
        None => panic!("no IPv4 found in ipify body: {body_text}"),
    };
    eprintln!("ipify says our public IP is: {ip}");

    chromium.close().await.expect("close cleanly");
 }
 /// Checks that `LaunchOptions::extra_args` are really handed to
 /// Chromium: a `--user-agent=...` flag is passed at launch and the
 /// value is read back from `navigator.userAgent` via `page.evaluate`.
 #[tokio::test]
 #[ignore = "downloads Chromium; run with --ignored"]
 async fn extra_args_reach_chromium() {
    const UA: &str = "MangalordCrawlerTest/1.0";

    let chromium = browser::launch(LaunchOptions {
        mode: browser::BrowserMode::Headless,
        extra_args: vec![format!("--user-agent={UA}")],
    })
    .await
    .expect("launch with extra args");

    let tab = chromium
        .browser()
        .new_page("about:blank")
        .await
        .expect("open page");
    tab.wait_for_navigation().await.expect("wait");

    let evaluated = tab
        .evaluate("navigator.userAgent")
        .await
        .expect("evaluate navigator.userAgent");
    let reported: String = evaluated.into_value().expect("string value");
    assert_eq!(
        reported, UA,
        "extra --user-agent flag should override navigator.userAgent"
    );

    chromium.close().await.expect("close cleanly");
 }
 /// Tiny dotted-quad finder — avoids pulling `regex` in just for one
 /// test.
 ///
 /// Splits `s` into maximal runs of ASCII digits and dots, strips dots
 /// from both ends of each run (so an address that ends a sentence, as
 /// in `"... is 1.2.3.4."`, is still recognized), and returns the first
 /// run made of exactly four dot-separated octets in `0..=255`. Runs
 /// with more or fewer than four parts, or with an empty or
 /// out-of-range part, are skipped. Returns `None` when no run
 /// qualifies.
 fn extract_ipv4(s: &str) -> Option<String> {
    s.split(|c: char| !(c.is_ascii_digit() || c == '.'))
        // Surrounding dots are punctuation, not part of the address.
        .map(|run| run.trim_matches('.'))
        .find(|run| {
            run.split('.').count() == 4
                && run.split('.').all(|octet| octet.parse::<u8>().is_ok())
        })
        .map(str::to_string)
 }
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,6 +1,6 @@
 {
  "name": "mangalord-frontend",
-  "version": "0.21.3",
+  "version": "0.22.0",
  "private": true,
  "type": "module",
  "scripts": {