// Mangalord Smoke

//! Smoke test for the Chromium launcher. //! //! Marked `#[ignore]` because it (a) downloads ~150 MB of Chromium on //! first run via the `fetcher` feature and (b) requires a real `$DISPLAY` //! for the headed path. Run it explicitly: //! //! ```sh //! cargo test --test crawler_browser_smoke -- --ignored --nocapture //! ``` //! //! Override the cache location with `CRAWLER_CHROMIUM_DIR=/some/path` if //! `$HOME/.cache/mangalord/chromium` isn't writable. use mangalord::crawler::browser::{self, LaunchOptions}; #[tokio::test] #[ignore = "downloads Chromium and needs a display; run with --ignored"] async fn headed_browser_can_navigate_and_read_title() { // A data URL avoids any network dependency — we're testing the // browser launcher, not connectivity. const PAGE: &str = "data:text/html,Mangalord%20SmokeOK"; let handle = browser::launch(LaunchOptions::headed()) .await .expect("launch headed chromium"); let page = handle .browser() .new_page(PAGE) .await .expect("open new page"); page.wait_for_navigation() .await .expect("wait for navigation"); let title = page.get_title().await.expect("get title"); assert_eq!(title.as_deref(), Some("Mangalord Smoke")); handle.close().await.expect("close cleanly"); } #[tokio::test] #[ignore = "downloads Chromium; run with --ignored"] async fn headless_browser_can_navigate_and_read_title() { const PAGE: &str = "data:text/html,Headless%20OK"; let handle = browser::launch(LaunchOptions::headless()) .await .expect("launch headless chromium"); let page = handle.browser().new_page(PAGE).await.expect("open new page"); page.wait_for_navigation().await.expect("wait for navigation"); let title = page.get_title().await.expect("get title"); assert_eq!(title.as_deref(), Some("Headless OK")); handle.close().await.expect("close cleanly"); } /// Live end-to-end: navigate to a real page, get the rendered HTML, and /// parse it with `scraper`. 
ipify.org renders the visitor's public IP /// into the page DOM, so a successful run proves browser → render → /// `Html::parse_document` → selector → text extraction all work /// against a real site. This is the same path each future `Source` /// impl will take. #[tokio::test] #[ignore = "needs network; run with --ignored"] async fn fetches_public_ip_from_ipify() { use std::time::Duration; let handle = browser::launch(LaunchOptions::headless()) .await .expect("launch headless chromium"); let page = handle .browser() .new_page("https://www.ipify.org") .await .expect("open ipify"); page.wait_for_navigation().await.expect("wait for navigation"); // ipify injects the IP via JS after load, so the navigation event // alone isn't enough — give the script a beat to run. tokio::time::sleep(Duration::from_secs(2)).await; let html = page.content().await.expect("get rendered html"); let doc = scraper::Html::parse_document(&html); let body_sel = scraper::Selector::parse("body").unwrap(); let body_text: String = doc .select(&body_sel) .next() .map(|n| n.text().collect::>().join(" ")) .unwrap_or_default(); let ip = extract_ipv4(&body_text) .unwrap_or_else(|| panic!("no IPv4 found in ipify body: {body_text}")); eprintln!("ipify says our public IP is: {ip}"); handle.close().await.expect("close cleanly"); } /// Proves that `LaunchOptions::extra_args` actually reach Chromium and /// influence its runtime. `--user-agent=...` overrides `navigator.userAgent`, /// observable from JS — read it back via `page.evaluate`. 
#[tokio::test] #[ignore = "downloads Chromium; run with --ignored"] async fn extra_args_reach_chromium() { const UA: &str = "MangalordCrawlerTest/1.0"; let options = LaunchOptions { mode: browser::BrowserMode::Headless, extra_args: vec![format!("--user-agent={UA}")], }; let handle = browser::launch(options).await.expect("launch with extra args"); let page = handle .browser() .new_page("about:blank") .await .expect("open page"); page.wait_for_navigation().await.expect("wait"); let ua: String = page .evaluate("navigator.userAgent") .await .expect("evaluate navigator.userAgent") .into_value() .expect("string value"); assert_eq!( ua, UA, "extra --user-agent flag should override navigator.userAgent" ); handle.close().await.expect("close cleanly"); } /// Tiny dotted-quad finder — avoids pulling `regex` in just for one /// test. Scans the first valid IPv4 substring (four 0..=255 octets /// separated by dots). fn extract_ipv4(s: &str) -> Option { let bytes = s.as_bytes(); let mut i = 0; while i < bytes.len() { if !bytes[i].is_ascii_digit() { i += 1; continue; } let start = i; while i < bytes.len() && (bytes[i].is_ascii_digit() || bytes[i] == b'.') { i += 1; } let candidate = &s[start..i]; let parts: Vec<&str> = candidate.split('.').collect(); if parts.len() == 4 && parts.iter().all(|p| p.parse::().is_ok()) { return Some(candidate.to_string()); } } None }