Mangalord/backend/tests/crawler_browser_smoke.rs

//! Smoke test for the Chromium launcher.
//!
//! Every test here is marked `#[ignore]` because the suite (a) downloads
//! ~150 MB of Chromium on first run via the `fetcher` feature, (b) requires
//! a real `$DISPLAY` for the headed path, and (c) needs network access for
//! the ipify test. Run it explicitly:
//!
//! ```sh
//! cargo test --test crawler_browser_smoke -- --ignored --nocapture
//! ```
//!
//! Override the cache location with `CRAWLER_CHROMIUM_DIR=/some/path` if
//! `$HOME/.cache/mangalord/chromium` isn't writable.
//!
//! Set `CRAWLER_CHROMIUM_BINARY=/usr/bin/chromium-headless-shell` (or
//! another system chromium path) to exercise the system-chromium
//! launch path instead of the fetcher download — this is the path the
//! Raspberry Pi deployment takes.

use mangalord::crawler::browser::{self, LaunchOptions};

/// Launches Chromium in headed mode, opens an inline `data:` document,
/// and checks that the title read back through the browser matches the
/// one embedded in the markup (with `%20` showing up as a space).
#[tokio::test]
#[ignore = "downloads Chromium and needs a display; run with --ignored"]
async fn headed_browser_can_navigate_and_read_title() {
    // The launcher is what's under test, not connectivity, so the page
    // is carried inline in a data URL instead of being fetched.
    const INLINE_DOC: &str = "data:text/html,<html><head><title>Mangalord%20Smoke</title></head><body>OK</body></html>";

    let chromium = browser::launch(LaunchOptions::headed()).await.expect("launch headed chromium");

    let tab = chromium.browser().new_page(INLINE_DOC).await.expect("open new page");
    tab.wait_for_navigation().await.expect("wait for navigation");

    let observed_title = tab.get_title().await.expect("get title");
    assert_eq!(observed_title.as_deref(), Some("Mangalord Smoke"));

    // Shut the browser down explicitly so a failed shutdown fails the test.
    chromium.close().await.expect("close cleanly");
}

/// Headless counterpart of the headed smoke test: launch, open an
/// inline `data:` document, read the title back, and close.
#[tokio::test]
#[ignore = "downloads Chromium; run with --ignored"]
async fn headless_browser_can_navigate_and_read_title() {
    // Inline document, so no network is involved.
    const INLINE_DOC: &str = "data:text/html,<html><head><title>Headless%20OK</title></head><body></body></html>";

    let chromium = browser::launch(LaunchOptions::headless())
        .await
        .expect("launch headless chromium");

    let tab = chromium
        .browser()
        .new_page(INLINE_DOC)
        .await
        .expect("open new page");
    tab.wait_for_navigation()
        .await
        .expect("wait for navigation");

    let observed_title = tab
        .get_title()
        .await
        .expect("get title");
    assert_eq!(observed_title.as_deref(), Some("Headless OK"));

    chromium
        .close()
        .await
        .expect("close cleanly");
}

/// Live end-to-end check against a real site: load ipify.org in a
/// headless browser, take the rendered HTML, parse it with `scraper`,
/// pull the `<body>` text, and look for an IPv4 address in it.
///
/// A passing run shows that browser → render → `Html::parse_document`
/// → selector → text extraction works outside of inline test pages.
/// The address found is only printed, never compared to a fixed value.
#[tokio::test]
#[ignore = "needs network; run with --ignored"]
async fn fetches_public_ip_from_ipify() {
    use std::time::Duration;

    let chromium = browser::launch(LaunchOptions::headless()).await.expect("launch headless chromium");

    let tab = chromium.browser().new_page("https://www.ipify.org").await.expect("open ipify");
    tab.wait_for_navigation().await.expect("wait for navigation");

    // The navigation event fires before ipify's script has written the
    // address into the DOM, so pause briefly before reading the page.
    let settle_time = Duration::from_secs(2);
    tokio::time::sleep(settle_time).await;

    let rendered = tab.content().await.expect("get rendered html");
    let body_sel = scraper::Selector::parse("body").unwrap();
    let doc = scraper::Html::parse_document(&rendered);

    // Text nodes of the first <body>, space-joined; empty if there is none.
    let body_text: String = match doc.select(&body_sel).next() {
        Some(body) => body.text().collect::<Vec<_>>().join(" "),
        None => String::new(),
    };

    let ip = match extract_ipv4(&body_text) {
        Some(found) => found,
        None => panic!("no IPv4 found in ipify body: {body_text}"),
    };
    eprintln!("ipify says our public IP is: {ip}");

    chromium.close().await.expect("close cleanly");
}

/// Checks that `LaunchOptions::extra_args` are passed through to
/// Chromium and take effect: the browser is launched with a
/// `--user-agent=...` flag, and `navigator.userAgent` is then read back
/// via `page.evaluate` and compared with the value that was passed.
#[tokio::test]
#[ignore = "downloads Chromium; run with --ignored"]
async fn extra_args_reach_chromium() {
    const UA: &str = "MangalordCrawlerTest/1.0";

    let user_agent_flag = format!("--user-agent={UA}");
    let chromium = browser::launch(LaunchOptions {
        mode: browser::BrowserMode::Headless,
        extra_args: vec![user_agent_flag],
    })
    .await
    .expect("launch with extra args");

    // A blank page is enough: only the JS-visible user agent matters.
    let tab = chromium.browser().new_page("about:blank").await.expect("open page");
    tab.wait_for_navigation().await.expect("wait");

    let evaluated = tab
        .evaluate("navigator.userAgent")
        .await
        .expect("evaluate navigator.userAgent");
    let ua: String = evaluated.into_value().expect("string value");
    assert_eq!(
        ua, UA,
        "extra --user-agent flag should override navigator.userAgent"
    );

    chromium.close().await.expect("close cleanly");
}

/// Tiny dotted-quad finder — avoids pulling `regex` in just for one
/// test. Returns the first valid IPv4 substring of `s` (four 0..=255
/// octets separated by single dots), or `None` if there isn't one.
///
/// The input is cut into maximal runs of ASCII digits and dots. Each
/// run is split on `.` and searched with a four-part sliding window, so
/// an address is still found when it is followed by a sentence-ending
/// period (`"1.2.3.4."`) or sits inside a longer dotted run. A digit
/// group is never split: `"1234.5.6.7"` yields nothing because `1234`
/// is not a valid octet.
fn extract_ipv4(s: &str) -> Option<String> {
    s.split(|c: char| !(c.is_ascii_digit() || c == '.'))
        .find_map(|run| {
            let parts: Vec<&str> = run.split('.').collect();
            // Empty parts (from leading, trailing, or doubled dots) fail
            // the `u8` parse, so a window containing one is rejected.
            parts
                .windows(4)
                .find(|quad| quad.iter().all(|part| part.parse::<u8>().is_ok()))
                .map(|quad| quad.join("."))
        })
}