//! Chromium launcher and lifecycle. //! //! Uses `chromiumoxide`'s `fetcher` feature so we don't depend on a //! system Chrome install — first call downloads a known-good revision //! into a cache dir and reuses it forever after. `BrowserMode` toggles //! headed vs headless; the headed path needs a display (real `$DISPLAY` //! or `xvfb-run`). //! //! Extra Chromium command-line flags can be supplied through //! [`LaunchOptions::extra_args`] in code, or via the //! `CRAWLER_BROWSER_ARGS` env var (whitespace-separated) when going //! through [`LaunchOptions::from_env`]. The launcher always also //! injects `--no-sandbox` and `--disable-dev-shm-usage` because they're //! near-mandatory for containerized Chromium; everything else is //! caller-provided. use std::path::PathBuf; use anyhow::Context; use chromiumoxide::browser::{Browser, BrowserConfig}; use chromiumoxide::fetcher::{BrowserFetcher, BrowserFetcherOptions}; use futures_util::StreamExt; use tokio::task::JoinHandle; #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum BrowserMode { /// Real window. Needs `$DISPLAY` (or `xvfb-run` wrapping the /// binary). This is the default the old Puppeteer crawler used and /// the assumed mode for the target site until we prove headless /// works against it. Headed, /// No window. Faster, lower resource use, but more likely to trip /// fingerprinting on hostile sites. Headless, } /// Configuration for a single browser launch. /// /// Public fields rather than a builder — there are only two of them /// and callers benefit from struct literal syntax for clarity. #[derive(Clone, Debug)] pub struct LaunchOptions { pub mode: BrowserMode, /// Extra Chromium flags, appended after the launcher's own /// defaults. Example: `vec!["--lang=de-DE".into(), /// "--window-size=1280,800".into()]`. pub extra_args: Vec, } impl LaunchOptions { pub fn headed() -> Self { Self { mode: BrowserMode::Headed, extra_args: Vec::new(), } } pub fn headless() -> Self { Self { mode: BrowserMode::Headless, extra_args: Vec::new(), } } /// Reads `CRAWLER_BROWSER_MODE` (`headless`|`headed`, default /// `headed`) and `CRAWLER_BROWSER_ARGS` (whitespace-separated /// Chromium flags). Flags containing whitespace aren't supported /// through the env var — use the programmatic API for those. pub fn from_env() -> Self { let mode = match std::env::var("CRAWLER_BROWSER_MODE").as_deref() { Ok("headless") => BrowserMode::Headless, _ => BrowserMode::Headed, }; let extra_args = std::env::var("CRAWLER_BROWSER_ARGS") .map(|s| parse_args(&s)) .unwrap_or_default(); Self { mode, extra_args } } } impl Default for LaunchOptions { fn default() -> Self { Self::headed() } } /// Whitespace-split a CRAWLER_BROWSER_ARGS-style string. Exposed /// separately from `from_env` so it can be unit-tested without /// touching process environment. pub(crate) fn parse_args(s: &str) -> Vec { s.split_whitespace().map(str::to_string).collect() } /// Owned browser plus the spawned task that drives its CDP event loop. /// Dropping `Handle` without calling `close` leaks the Chromium process /// — always call `close().await` in production paths. pub struct Handle { browser: Browser, driver: JoinHandle<()>, } impl Handle { pub fn browser(&self) -> &Browser { &self.browser } pub fn browser_mut(&mut self) -> &mut Browser { &mut self.browser } /// Closes the browser and awaits the driver task. Safe to call /// multiple times — subsequent calls are no-ops. pub async fn close(mut self) -> anyhow::Result<()> { let _ = self.browser.close().await; let _ = self.browser.wait().await; let _ = self.driver.await; Ok(()) } } /// Launches Chromium. Downloads it on first run via the `fetcher` /// feature; subsequent runs hit the cache. The cache dir is /// `$CRAWLER_CHROMIUM_DIR` if set, else `$HOME/.cache/mangalord/chromium`, /// else `./.chromium-cache` as a last-resort repo-local fallback. pub async fn launch(options: LaunchOptions) -> anyhow::Result { let cache = cache_dir()?; tokio::fs::create_dir_all(&cache) .await .with_context(|| format!("create cache dir {}", cache.display()))?; let fetcher = BrowserFetcher::new( BrowserFetcherOptions::builder() .with_path(&cache) .build() .map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?, ); tracing::info!(path = %cache.display(), "ensuring chromium revision is present"); let info = fetcher .fetch() .await .context("download chromium via fetcher")?; tracing::info!(executable = %info.executable_path.display(), "chromium ready"); let mut builder = BrowserConfig::builder() .chrome_executable(info.executable_path) // Linux containers / CI commonly lack the user namespaces // Chromium's sandbox wants. Disable it; the crawler runs in its // own container anyway. .arg("--no-sandbox") .arg("--disable-dev-shm-usage"); for arg in &options.extra_args { builder = builder.arg(arg); } if matches!(options.mode, BrowserMode::Headed) { builder = builder.with_head(); } tracing::info!( mode = ?options.mode, extra_args = ?options.extra_args, "building browser config" ); let config = builder .build() .map_err(|e| anyhow::anyhow!("browser config: {e}"))?; let (browser, mut handler) = Browser::launch(config) .await .context("launch chromium")?; let driver = tokio::spawn(async move { while let Some(event) = handler.next().await { if let Err(err) = event { tracing::warn!(?err, "chromium handler event error"); } } }); Ok(Handle { browser, driver }) } fn cache_dir() -> anyhow::Result { if let Ok(dir) = std::env::var("CRAWLER_CHROMIUM_DIR") { return Ok(PathBuf::from(dir)); } if let Ok(home) = std::env::var("HOME") { return Ok(PathBuf::from(home).join(".cache/mangalord/chromium")); } Ok(PathBuf::from("./.chromium-cache")) } #[cfg(test)] mod tests { use super::*; #[test] fn parse_args_splits_on_whitespace() { assert_eq!( parse_args("--lang=de-DE --window-size=1280,800"), vec!["--lang=de-DE", "--window-size=1280,800"] ); } #[test] fn parse_args_tolerates_irregular_whitespace() { // tabs, multiple spaces, leading/trailing — all collapsed. assert_eq!( parse_args(" --a\t--b --c=1\n"), vec!["--a", "--b", "--c=1"] ); } #[test] fn parse_args_empty_string_yields_empty_vec() { assert!(parse_args("").is_empty()); assert!(parse_args(" \t\n").is_empty()); } }