Files
Mangalord/backend/tests/crawler_browser_smoke.rs
MechaCat02 26eccd0abe feat: crawler scaffold with chromium launcher (0.22.0)
- crawler module (browser, source trait, jobs, diff) + binary
- chromiumoxide launcher with fetcher feature (auto-downloads
  Chromium on first run, caches under ~/.cache/mangalord/chromium)
- LaunchOptions struct with extra_args, parseable from
  CRAWLER_BROWSER_MODE and CRAWLER_BROWSER_ARGS
- migration 0012 introduces sources, manga_sources,
  chapter_sources, crawler_jobs
- integration tests for headed + headless launch, ipify load+parse,
  and extra-args propagation (all #[ignore], opt-in)
2026-05-20 22:07:56 +02:00

158 lines
5.4 KiB
Rust

//! Smoke test for the Chromium launcher.
//!
//! Marked `#[ignore]` because it (a) downloads ~150 MB of Chromium on
//! first run via the `fetcher` feature and (b) requires a real `$DISPLAY`
//! for the headed path. Run it explicitly:
//!
//! ```sh
//! cargo test --test crawler_browser_smoke -- --ignored --nocapture
//! ```
//!
//! Override the cache location with `CRAWLER_CHROMIUM_DIR=/some/path` if
//! `$HOME/.cache/mangalord/chromium` isn't writable.
use mangalord::crawler::browser::{self, LaunchOptions};
/// Launches Chromium in headed mode, opens an inline page, and checks
/// that the document title can be read back before shutting down.
#[tokio::test]
#[ignore = "downloads Chromium and needs a display; run with --ignored"]
async fn headed_browser_can_navigate_and_read_title() {
    // The page is served from a `data:` URL so the test exercises only
    // the launcher and never touches the network.
    const PAGE: &str = "data:text/html,<html><head><title>Mangalord%20Smoke</title></head><body>OK</body></html>";

    let launched = browser::launch(LaunchOptions::headed()).await;
    let session = launched.expect("launch headed chromium");

    let opened = session.browser().new_page(PAGE).await;
    let tab = opened.expect("open new page");
    tab.wait_for_navigation()
        .await
        .expect("wait for navigation");

    let observed_title = tab.get_title().await.expect("get title");
    assert_eq!(observed_title.as_deref(), Some("Mangalord Smoke"));

    session.close().await.expect("close cleanly");
}
/// Headless counterpart of the headed smoke test: launch, open an
/// inline page, read its title, and close.
#[tokio::test]
#[ignore = "downloads Chromium; run with --ignored"]
async fn headless_browser_can_navigate_and_read_title() {
    // Inline `data:` document, so no network access is involved.
    const PAGE: &str = "data:text/html,<html><head><title>Headless%20OK</title></head><body></body></html>";

    let session = browser::launch(LaunchOptions::headless())
        .await
        .expect("launch headless chromium");

    let tab = session
        .browser()
        .new_page(PAGE)
        .await
        .expect("open new page");
    tab.wait_for_navigation()
        .await
        .expect("wait for navigation");

    let observed_title = tab.get_title().await.expect("get title");
    assert_eq!(observed_title.as_deref(), Some("Headless OK"));

    session.close().await.expect("close cleanly");
}
/// Live end-to-end: navigate to a real page, get the rendered HTML, and
/// parse it with `scraper`. ipify.org renders the visitor's public IP
/// into the page DOM, so a successful run proves browser → render →
/// `Html::parse_document` → selector → text extraction all work
/// against a real site. This is the same path each future `Source`
/// impl will take.
///
/// The IP is injected by JS after load, so the rendered DOM is polled
/// until an IPv4 address shows up or a deadline passes. A single fixed
/// sleep would be flaky on slow connections and needlessly slow on
/// fast ones.
///
/// # Panics
///
/// Panics if the browser cannot be launched, the page cannot be opened
/// or read, or no IPv4 address appears in the body before the deadline.
#[tokio::test]
#[ignore = "needs network; run with --ignored"]
async fn fetches_public_ip_from_ipify() {
    use std::time::{Duration, Instant};

    // Upper bound on how long to wait for the page script to render the IP.
    const RENDER_TIMEOUT: Duration = Duration::from_secs(10);
    // Pause between successive DOM snapshots while waiting.
    const POLL_INTERVAL: Duration = Duration::from_millis(250);

    let handle = browser::launch(LaunchOptions::headless())
        .await
        .expect("launch headless chromium");
    let page = handle
        .browser()
        .new_page("https://www.ipify.org")
        .await
        .expect("open ipify");
    page.wait_for_navigation().await.expect("wait for navigation");

    // Parsed once; the selector does not change between polls.
    let body_sel = scraper::Selector::parse("body").unwrap();

    let deadline = Instant::now() + RENDER_TIMEOUT;
    let ip = loop {
        let html = page.content().await.expect("get rendered html");
        let body_text: String = {
            // Keep the parsed document scoped to this snapshot only.
            let doc = scraper::Html::parse_document(&html);
            doc.select(&body_sel)
                .next()
                .map(|n| n.text().collect::<Vec<_>>().join(" "))
                .unwrap_or_default()
        };

        if let Some(found) = extract_ipv4(&body_text) {
            break found;
        }
        if Instant::now() >= deadline {
            panic!("no IPv4 found in ipify body after {RENDER_TIMEOUT:?}: {body_text}");
        }
        tokio::time::sleep(POLL_INTERVAL).await;
    };

    eprintln!("ipify says our public IP is: {ip}");
    handle.close().await.expect("close cleanly");
}
/// Checks that flags supplied through `LaunchOptions::extra_args` are
/// passed on to Chromium: a `--user-agent=...` flag is given at launch
/// and `navigator.userAgent` is then read back through `page.evaluate`
/// and compared with the value that was passed in.
#[tokio::test]
#[ignore = "downloads Chromium; run with --ignored"]
async fn extra_args_reach_chromium() {
    const UA: &str = "MangalordCrawlerTest/1.0";

    let user_agent_flag = format!("--user-agent={UA}");
    let options = LaunchOptions {
        mode: browser::BrowserMode::Headless,
        extra_args: vec![user_agent_flag],
    };

    let session = browser::launch(options)
        .await
        .expect("launch with extra args");
    let tab = session
        .browser()
        .new_page("about:blank")
        .await
        .expect("open page");
    tab.wait_for_navigation().await.expect("wait");

    // Ask the page itself which user agent it reports.
    let evaluated = tab
        .evaluate("navigator.userAgent")
        .await
        .expect("evaluate navigator.userAgent");
    let ua: String = evaluated.into_value().expect("string value");

    assert_eq!(
        ua, UA,
        "extra --user-agent flag should override navigator.userAgent"
    );

    session.close().await.expect("close cleanly");
}
/// Tiny dotted-quad finder — avoids pulling `regex` in just for one
/// test.
///
/// Splits `s` into maximal runs of ASCII digits and dots, strips dots
/// from both ends of each run, and returns the first run that consists
/// of exactly four dot-separated octets, each parseable as a `u8`
/// (0..=255). Stripping the surrounding dots means an address that ends
/// a sentence (`"... is 203.0.113.7."`) is still recognised.
///
/// Runs with more or fewer than four parts (e.g. `1.2.3` or
/// `1.2.3.4.5`) are skipped, as are runs with empty parts (`1..2.3`).
/// Octets are validated only by `u8` parsing, so leading zeros are
/// accepted.
///
/// Returns `None` when no such run exists.
fn extract_ipv4(s: &str) -> Option<String> {
    s.split(|c: char| !(c.is_ascii_digit() || c == '.'))
        // Punctuation adjacent to the address is not part of it.
        .map(|run| run.trim_matches('.'))
        .find(|run| {
            let octets: Vec<&str> = run.split('.').collect();
            octets.len() == 4 && octets.iter().all(|octet| octet.parse::<u8>().is_ok())
        })
        .map(str::to_owned)
}