- crawler module (browser, source trait, jobs, diff) + binary - chromiumoxide launcher with fetcher feature (auto-downloads Chromium on first run, caches under ~/.cache/mangalord/chromium) - LaunchOptions struct with extra_args, parseable from CRAWLER_BROWSER_MODE and CRAWLER_BROWSER_ARGS - migration 0012 introduces sources, manga_sources, chapter_sources, crawler_jobs - integration tests for headed + headless launch, ipify load+parse, and extra-args propagation (all #[ignore], opt-in)
158 lines
5.4 KiB
Rust
158 lines
5.4 KiB
Rust
//! Smoke test for the Chromium launcher.
|
|
//!
|
|
//! Marked `#[ignore]` because it (a) downloads ~150 MB of Chromium on
|
|
//! first run via the `fetcher` feature and (b) requires a real `$DISPLAY`
|
|
//! for the headed path. Run it explicitly:
|
|
//!
|
|
//! ```sh
|
|
//! cargo test --test crawler_browser_smoke -- --ignored --nocapture
|
|
//! ```
|
|
//!
|
|
//! Override the cache location with `CRAWLER_CHROMIUM_DIR=/some/path` if
|
|
//! `$HOME/.cache/mangalord/chromium` isn't writable.
|
|
|
|
use mangalord::crawler::browser::{self, LaunchOptions};
|
|
|
|
#[tokio::test]
|
|
#[ignore = "downloads Chromium and needs a display; run with --ignored"]
|
|
async fn headed_browser_can_navigate_and_read_title() {
|
|
// A data URL avoids any network dependency — we're testing the
|
|
// browser launcher, not connectivity.
|
|
const PAGE: &str = "data:text/html,<html><head><title>Mangalord%20Smoke</title></head><body>OK</body></html>";
|
|
|
|
let handle = browser::launch(LaunchOptions::headed())
|
|
.await
|
|
.expect("launch headed chromium");
|
|
|
|
let page = handle
|
|
.browser()
|
|
.new_page(PAGE)
|
|
.await
|
|
.expect("open new page");
|
|
page.wait_for_navigation()
|
|
.await
|
|
.expect("wait for navigation");
|
|
|
|
let title = page.get_title().await.expect("get title");
|
|
assert_eq!(title.as_deref(), Some("Mangalord Smoke"));
|
|
|
|
handle.close().await.expect("close cleanly");
|
|
}
|
|
|
|
#[tokio::test]
|
|
#[ignore = "downloads Chromium; run with --ignored"]
|
|
async fn headless_browser_can_navigate_and_read_title() {
|
|
const PAGE: &str = "data:text/html,<html><head><title>Headless%20OK</title></head><body></body></html>";
|
|
|
|
let handle = browser::launch(LaunchOptions::headless())
|
|
.await
|
|
.expect("launch headless chromium");
|
|
|
|
let page = handle.browser().new_page(PAGE).await.expect("open new page");
|
|
page.wait_for_navigation().await.expect("wait for navigation");
|
|
|
|
let title = page.get_title().await.expect("get title");
|
|
assert_eq!(title.as_deref(), Some("Headless OK"));
|
|
|
|
handle.close().await.expect("close cleanly");
|
|
}
|
|
|
|
/// Live end-to-end: navigate to a real page, get the rendered HTML, and
|
|
/// parse it with `scraper`. ipify.org renders the visitor's public IP
|
|
/// into the page DOM, so a successful run proves browser → render →
|
|
/// `Html::parse_document` → selector → text extraction all work
|
|
/// against a real site. This is the same path each future `Source`
|
|
/// impl will take.
|
|
#[tokio::test]
|
|
#[ignore = "needs network; run with --ignored"]
|
|
async fn fetches_public_ip_from_ipify() {
|
|
use std::time::Duration;
|
|
|
|
let handle = browser::launch(LaunchOptions::headless())
|
|
.await
|
|
.expect("launch headless chromium");
|
|
|
|
let page = handle
|
|
.browser()
|
|
.new_page("https://www.ipify.org")
|
|
.await
|
|
.expect("open ipify");
|
|
page.wait_for_navigation().await.expect("wait for navigation");
|
|
// ipify injects the IP via JS after load, so the navigation event
|
|
// alone isn't enough — give the script a beat to run.
|
|
tokio::time::sleep(Duration::from_secs(2)).await;
|
|
|
|
let html = page.content().await.expect("get rendered html");
|
|
let doc = scraper::Html::parse_document(&html);
|
|
let body_sel = scraper::Selector::parse("body").unwrap();
|
|
let body_text: String = doc
|
|
.select(&body_sel)
|
|
.next()
|
|
.map(|n| n.text().collect::<Vec<_>>().join(" "))
|
|
.unwrap_or_default();
|
|
|
|
let ip = extract_ipv4(&body_text)
|
|
.unwrap_or_else(|| panic!("no IPv4 found in ipify body: {body_text}"));
|
|
eprintln!("ipify says our public IP is: {ip}");
|
|
|
|
handle.close().await.expect("close cleanly");
|
|
}
|
|
|
|
/// Proves that `LaunchOptions::extra_args` actually reach Chromium and
|
|
/// influence its runtime. `--user-agent=...` overrides `navigator.userAgent`,
|
|
/// observable from JS — read it back via `page.evaluate`.
|
|
#[tokio::test]
|
|
#[ignore = "downloads Chromium; run with --ignored"]
|
|
async fn extra_args_reach_chromium() {
|
|
const UA: &str = "MangalordCrawlerTest/1.0";
|
|
let options = LaunchOptions {
|
|
mode: browser::BrowserMode::Headless,
|
|
extra_args: vec![format!("--user-agent={UA}")],
|
|
};
|
|
let handle = browser::launch(options).await.expect("launch with extra args");
|
|
|
|
let page = handle
|
|
.browser()
|
|
.new_page("about:blank")
|
|
.await
|
|
.expect("open page");
|
|
page.wait_for_navigation().await.expect("wait");
|
|
|
|
let ua: String = page
|
|
.evaluate("navigator.userAgent")
|
|
.await
|
|
.expect("evaluate navigator.userAgent")
|
|
.into_value()
|
|
.expect("string value");
|
|
assert_eq!(
|
|
ua, UA,
|
|
"extra --user-agent flag should override navigator.userAgent"
|
|
);
|
|
|
|
handle.close().await.expect("close cleanly");
|
|
}
|
|
|
|
/// Tiny dotted-quad finder — avoids pulling `regex` in just for one test.
///
/// Splits `s` into maximal runs of ASCII digits and dots, strips any dots
/// at the edges of each run, and returns the first run that consists of
/// exactly four `.`-separated octets, each parseable as a `u8` (0..=255).
/// Returns `None` when no run qualifies.
///
/// Edge dots are stripped so that an address followed by sentence
/// punctuation (`"Your IP is 203.0.113.7."`) is still recognised. Runs
/// with more than four components (`1.2.3.4.5`) are rejected whole.
fn extract_ipv4(s: &str) -> Option<String> {
    s.split(|c: char| !(c.is_ascii_digit() || c == '.'))
        .map(|run| run.trim_matches('.'))
        .find(|candidate| is_dotted_quad(candidate))
        .map(str::to_owned)
}

/// Returns `true` when `candidate` is exactly four dot-separated `u8` octets.
fn is_dotted_quad(candidate: &str) -> bool {
    let mut parts = candidate.split('.');
    // Consume at most four components; all four must parse as octets...
    let valid_octets = parts
        .by_ref()
        .take(4)
        .filter(|octet| octet.parse::<u8>().is_ok())
        .count();
    // ...and nothing may follow them.
    valid_octets == 4 && parts.next().is_none()
}
|