Files
Mangalord/backend/src/crawler/browser.rs
MechaCat02 51f42b03e9 feat: default crawler browser to headless (0.32.0)
LaunchOptions::from_env() and LaunchOptions::default() now return
BrowserMode::Headless. The in-process daemon (via CrawlerConfig::from_env)
and the standalone crawler binary both pick this up — no display
required for production runs, smaller resource footprint.

`Headed` stays as an explicit opt-in via CRAWLER_BROWSER_MODE=headed
for debugging or sites that fingerprint headless Chrome. New unit test
locks the default in place.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 20:27:05 +02:00

265 lines
9.2 KiB
Rust

//! Chromium launcher and lifecycle.
//!
//! Uses `chromiumoxide`'s `fetcher` feature so we don't depend on a
//! system Chrome install — first call downloads a known-good revision
//! into a cache dir and reuses it forever after. `BrowserMode` toggles
//! headed vs headless (headless is the default); the headed path needs
//! a display (real `$DISPLAY` or `xvfb-run`).
//!
//! Extra Chromium command-line flags can be supplied through
//! [`LaunchOptions::extra_args`] in code, or via the
//! `CRAWLER_BROWSER_ARGS` env var (whitespace-separated) when going
//! through [`LaunchOptions::from_env`]. The launcher always also
//! injects `--no-sandbox` and `--disable-dev-shm-usage` because they're
//! near-mandatory for containerized Chromium; everything else is
//! caller-provided.
use std::path::PathBuf;
use std::sync::Arc;
use anyhow::Context;
use chromiumoxide::browser::{Browser, BrowserConfig};
use chromiumoxide::error::CdpError;
use chromiumoxide::fetcher::{BrowserFetcher, BrowserFetcherOptions};
use futures_util::StreamExt;
use tokio::task::JoinHandle;
/// Whether Chromium is started with or without a visible window.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum BrowserMode {
    /// Visible browser window. Requires a display: either a real
    /// `$DISPLAY` or the binary wrapped in `xvfb-run`. Selected
    /// explicitly with `CRAWLER_BROWSER_MODE=headed`, e.g. to watch a
    /// flow while debugging or to crawl sites that fingerprint headless
    /// Chrome. Not used in production.
    Headed,
    /// Windowless browser. Faster, lighter on resources, and works
    /// without any display. Both `LaunchOptions::from_env()` and
    /// `LaunchOptions::default()` pick this mode.
    Headless,
}
/// Settings for one browser launch.
///
/// The fields are deliberately public instead of hidden behind a
/// builder: with only two of them, a plain struct literal is the
/// clearest way for callers to spell out what they want.
#[derive(Debug, Clone)]
pub struct LaunchOptions {
    /// Whether Chromium runs with a visible window or headless.
    pub mode: BrowserMode,
    /// Additional Chromium command-line flags. They are added after the
    /// flags the launcher injects on its own. Example:
    /// `vec!["--lang=de-DE".into(), "--window-size=1280,800".into()]`.
    pub extra_args: Vec<String>,
}
impl LaunchOptions {
pub fn headed() -> Self {
Self {
mode: BrowserMode::Headed,
extra_args: Vec::new(),
}
}
pub fn headless() -> Self {
Self {
mode: BrowserMode::Headless,
extra_args: Vec::new(),
}
}
/// Reads `CRAWLER_BROWSER_MODE` (`headless`|`headed`, default
/// `headless`) and `CRAWLER_BROWSER_ARGS` (whitespace-separated
/// Chromium flags). Flags containing whitespace aren't supported
/// through the env var — use the programmatic API for those.
pub fn from_env() -> Self {
let mode = match std::env::var("CRAWLER_BROWSER_MODE").as_deref() {
Ok("headed") => BrowserMode::Headed,
_ => BrowserMode::Headless,
};
let extra_args = std::env::var("CRAWLER_BROWSER_ARGS")
.map(|s| parse_args(&s))
.unwrap_or_default();
Self { mode, extra_args }
}
}
impl Default for LaunchOptions {
fn default() -> Self {
Self::headless()
}
}
/// Breaks a `CRAWLER_BROWSER_ARGS`-style string into individual flags.
///
/// Flags are separated by any run of whitespace; leading and trailing
/// whitespace is ignored, so an empty or blank input yields an empty
/// vector. Kept apart from `from_env` so it can be unit-tested without
/// touching the process environment.
pub(crate) fn parse_args(s: &str) -> Vec<String> {
    let mut flags = Vec::new();
    flags.extend(s.split_whitespace().map(String::from));
    flags
}
/// Owned browser plus the spawned task that drives its CDP event loop.
/// Dropping `Handle` without calling `close` leaks the Chromium process
/// — always call `close().await` in production paths.
/// NOTE(review): the docs on `close` describe kill-on-drop as a safety
/// net for the browser, which reads at odds with "leaks" above —
/// confirm which behavior actually applies.
///
/// The browser is stored behind an `Arc` so it can be shared across
/// worker tasks (via [`Handle::shared`]) without copying. `Browser::new_page`
/// only needs `&self`, so multiple workers can drive the same browser
/// concurrently as long as the manager keeps the `Arc` alive.
pub struct Handle {
/// Reference-counted browser; [`Handle::shared`] hands out clones of
/// this `Arc`, and `close` can only shut the browser down gracefully
/// when it holds the last one.
browser: Arc<Browser>,
/// Join handle of the task spawned by `launch` that polls the CDP
/// handler stream; `close` awaits it.
driver: JoinHandle<()>,
}
impl Handle {
/// Borrow the browser. Equivalent to `&*handle.shared()`.
pub fn browser(&self) -> &Browser {
&self.browser
}
/// Clone the shared handle. Workers hold these to call `new_page`
/// concurrently. The browser only exits when the last `Arc<Browser>`
/// is dropped (kill-on-drop), or when `close()` is called on the
/// originating `Handle` while it is the sole holder.
pub fn shared(&self) -> Arc<Browser> {
Arc::clone(&self.browser)
}
/// Closes the browser and awaits the driver task. If other Arcs to
/// the browser are still alive we fall back to drop-kills-Chromium
/// semantics and just join the driver — this is the rare case where
/// shutdown raced an outstanding worker; the OS-level kill is the
/// safety net.
pub async fn close(self) -> anyhow::Result<()> {
match Arc::try_unwrap(self.browser) {
Ok(mut owned) => {
let _ = owned.close().await;
let _ = owned.wait().await;
}
Err(shared) => {
tracing::warn!(
strong_count = Arc::strong_count(&shared),
"Handle::close while Arc<Browser> still shared — relying on kill-on-drop"
);
drop(shared);
}
}
let _ = self.driver.await;
Ok(())
}
}
/// Launches Chromium. Downloads it on first run via the `fetcher`
/// feature; subsequent runs hit the cache. The cache dir is
/// `$CRAWLER_CHROMIUM_DIR` if set, else `$HOME/.cache/mangalord/chromium`,
/// else `./.chromium-cache` as a last-resort repo-local fallback.
pub async fn launch(options: LaunchOptions) -> anyhow::Result<Handle> {
let cache = cache_dir()?;
tokio::fs::create_dir_all(&cache)
.await
.with_context(|| format!("create cache dir {}", cache.display()))?;
let fetcher = BrowserFetcher::new(
BrowserFetcherOptions::builder()
.with_path(&cache)
.build()
.map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?,
);
tracing::info!(path = %cache.display(), "ensuring chromium revision is present");
let info = fetcher
.fetch()
.await
.context("download chromium via fetcher")?;
tracing::info!(executable = %info.executable_path.display(), "chromium ready");
let mut builder = BrowserConfig::builder()
.chrome_executable(info.executable_path)
// Linux containers / CI commonly lack the user namespaces
// Chromium's sandbox wants. Disable it; the crawler runs in its
// own container anyway.
.arg("--no-sandbox")
.arg("--disable-dev-shm-usage");
for arg in &options.extra_args {
builder = builder.arg(arg);
}
if matches!(options.mode, BrowserMode::Headed) {
builder = builder.with_head();
}
tracing::info!(
mode = ?options.mode,
extra_args = ?options.extra_args,
"building browser config"
);
let config = builder
.build()
.map_err(|e| anyhow::anyhow!("browser config: {e}"))?;
let (browser, mut handler) = Browser::launch(config)
.await
.context("launch chromium")?;
let driver = tokio::spawn(async move {
while let Some(event) = handler.next().await {
match event {
Ok(_) => {}
// chromiumoxide 0.7 ships fixed CDP type bindings, so any
// CDP event Chrome added later fails to deserialize. The
// connection is unaffected — these are noise. Suppress
// them so real failures stay visible.
Err(CdpError::Serde(_)) => {
tracing::trace!("chromium emitted an unrecognized CDP event");
}
Err(err) => tracing::warn!(?err, "chromium handler event error"),
}
}
});
Ok(Handle {
browser: Arc::new(browser),
driver,
})
}
/// Picks the directory the Chromium download is cached in.
///
/// Resolution order:
/// 1. `$CRAWLER_CHROMIUM_DIR`, used as-is;
/// 2. `$HOME/.cache/mangalord/chromium`;
/// 3. `./.chromium-cache` as a last-resort repo-local fallback.
///
/// Variables are read as OS strings, so paths that are not valid
/// Unicode are honored instead of being skipped. An empty variable is
/// treated as unset, so it falls through to the next candidate rather
/// than producing an empty or accidentally relative path.
///
/// Currently never returns an error; the `Result` is kept so callers
/// can continue to use `?`.
fn cache_dir() -> anyhow::Result<PathBuf> {
    if let Some(dir) = non_empty_env_path("CRAWLER_CHROMIUM_DIR") {
        return Ok(dir);
    }
    if let Some(home) = non_empty_env_path("HOME") {
        return Ok(home.join(".cache/mangalord/chromium"));
    }
    Ok(PathBuf::from("./.chromium-cache"))
}

/// Reads environment variable `key` as a path, mapping both "unset" and
/// "set to the empty string" to `None`.
fn non_empty_env_path(key: &str) -> Option<PathBuf> {
    std::env::var_os(key)
        .filter(|value| !value.is_empty())
        .map(PathBuf::from)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two flags separated by a single space come back as two entries.
    #[test]
    fn parse_args_splits_on_whitespace() {
        let parsed = parse_args("--lang=de-DE --window-size=1280,800");
        let expected = vec!["--lang=de-DE", "--window-size=1280,800"];
        assert_eq!(parsed, expected);
    }

    /// Tabs, repeated spaces and leading/trailing whitespace are all
    /// collapsed into plain separators.
    #[test]
    fn parse_args_tolerates_irregular_whitespace() {
        let parsed = parse_args(" --a\t--b --c=1\n");
        let expected = vec!["--a", "--b", "--c=1"];
        assert_eq!(parsed, expected);
    }

    /// Empty and whitespace-only inputs produce no flags at all.
    #[test]
    fn parse_args_empty_string_yields_empty_vec() {
        for blank in ["", " \t\n"].iter() {
            assert!(parse_args(blank).is_empty());
        }
    }

    /// Headless is the production-safe default — no display required,
    /// smaller resource footprint. `Headed` stays available as an
    /// opt-in for debugging via CRAWLER_BROWSER_MODE=headed.
    #[test]
    fn default_launch_options_are_headless() {
        let by_default = LaunchOptions::default();
        assert_eq!(by_default.mode, BrowserMode::Headless);

        let headless = LaunchOptions::headless();
        assert_eq!(headless.mode, BrowserMode::Headless);

        let headed = LaunchOptions::headed();
        assert_eq!(headed.mode, BrowserMode::Headed);
    }
}