feat: crawler scaffold with chromium launcher (0.22.0)
- crawler module (browser, source trait, jobs, diff) + binary - chromiumoxide launcher with fetcher feature (auto-downloads Chromium on first run, caches under ~/.cache/mangalord/chromium) - LaunchOptions struct with extra_args, parseable from CRAWLER_BROWSER_MODE and CRAWLER_BROWSER_ARGS - migration 0012 introduces sources, manga_sources, chapter_sources, crawler_jobs - integration tests for headed + headless launch, ipify load+parse, and extra-args propagation (all #[ignore], opt-in)
This commit is contained in:
217
backend/src/crawler/browser.rs
Normal file
217
backend/src/crawler/browser.rs
Normal file
@@ -0,0 +1,217 @@
|
||||
//! Chromium launcher and lifecycle.
|
||||
//!
|
||||
//! Uses `chromiumoxide`'s `fetcher` feature so we don't depend on a
|
||||
//! system Chrome install — first call downloads a known-good revision
|
||||
//! into a cache dir and reuses it forever after. `BrowserMode` toggles
|
||||
//! headed vs headless; the headed path needs a display (real `$DISPLAY`
|
||||
//! or `xvfb-run`).
|
||||
//!
|
||||
//! Extra Chromium command-line flags can be supplied through
|
||||
//! [`LaunchOptions::extra_args`] in code, or via the
|
||||
//! `CRAWLER_BROWSER_ARGS` env var (whitespace-separated) when going
|
||||
//! through [`LaunchOptions::from_env`]. The launcher always also
|
||||
//! injects `--no-sandbox` and `--disable-dev-shm-usage` because they're
|
||||
//! near-mandatory for containerized Chromium; everything else is
|
||||
//! caller-provided.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::Context;
|
||||
use chromiumoxide::browser::{Browser, BrowserConfig};
|
||||
use chromiumoxide::fetcher::{BrowserFetcher, BrowserFetcherOptions};
|
||||
use futures_util::StreamExt;
|
||||
use tokio::task::JoinHandle;
|
||||
|
||||
/// How the Chromium window is presented when the crawler launches it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BrowserMode {
    /// Launch with a visible window, which requires `$DISPLAY` (or running
    /// the binary under `xvfb-run`). The old Puppeteer crawler defaulted to
    /// this, and it stays the assumed mode for the target site until
    /// headless is proven to work against it.
    Headed,
    /// Launch without a window: faster and lighter on resources, but more
    /// likely to trip fingerprinting on hostile sites.
    Headless,
}
|
||||
|
||||
/// Configuration for a single browser launch.
|
||||
///
|
||||
/// Public fields rather than a builder — there are only two of them
|
||||
/// and callers benefit from struct literal syntax for clarity.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LaunchOptions {
|
||||
pub mode: BrowserMode,
|
||||
/// Extra Chromium flags, appended after the launcher's own
|
||||
/// defaults. Example: `vec!["--lang=de-DE".into(),
|
||||
/// "--window-size=1280,800".into()]`.
|
||||
pub extra_args: Vec<String>,
|
||||
}
|
||||
|
||||
impl LaunchOptions {
|
||||
pub fn headed() -> Self {
|
||||
Self {
|
||||
mode: BrowserMode::Headed,
|
||||
extra_args: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn headless() -> Self {
|
||||
Self {
|
||||
mode: BrowserMode::Headless,
|
||||
extra_args: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads `CRAWLER_BROWSER_MODE` (`headless`|`headed`, default
|
||||
/// `headed`) and `CRAWLER_BROWSER_ARGS` (whitespace-separated
|
||||
/// Chromium flags). Flags containing whitespace aren't supported
|
||||
/// through the env var — use the programmatic API for those.
|
||||
pub fn from_env() -> Self {
|
||||
let mode = match std::env::var("CRAWLER_BROWSER_MODE").as_deref() {
|
||||
Ok("headless") => BrowserMode::Headless,
|
||||
_ => BrowserMode::Headed,
|
||||
};
|
||||
let extra_args = std::env::var("CRAWLER_BROWSER_ARGS")
|
||||
.map(|s| parse_args(&s))
|
||||
.unwrap_or_default();
|
||||
Self { mode, extra_args }
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LaunchOptions {
|
||||
fn default() -> Self {
|
||||
Self::headed()
|
||||
}
|
||||
}
|
||||
|
||||
/// Breaks a `CRAWLER_BROWSER_ARGS`-style string into individual flags,
/// splitting on any run of whitespace and discarding empty pieces.
///
/// Kept apart from `from_env` so the splitting can be unit-tested without
/// touching the process environment.
pub(crate) fn parse_args(s: &str) -> Vec<String> {
    let flags = s.split_whitespace().map(String::from);
    flags.collect()
}
|
||||
|
||||
/// Owned browser plus the spawned task that drives its CDP event loop.
|
||||
/// Dropping `Handle` without calling `close` leaks the Chromium process
|
||||
/// — always call `close().await` in production paths.
|
||||
pub struct Handle {
|
||||
browser: Browser,
|
||||
driver: JoinHandle<()>,
|
||||
}
|
||||
|
||||
impl Handle {
|
||||
pub fn browser(&self) -> &Browser {
|
||||
&self.browser
|
||||
}
|
||||
|
||||
pub fn browser_mut(&mut self) -> &mut Browser {
|
||||
&mut self.browser
|
||||
}
|
||||
|
||||
/// Closes the browser and awaits the driver task. Safe to call
|
||||
/// multiple times — subsequent calls are no-ops.
|
||||
pub async fn close(mut self) -> anyhow::Result<()> {
|
||||
let _ = self.browser.close().await;
|
||||
let _ = self.browser.wait().await;
|
||||
let _ = self.driver.await;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Launches Chromium. Downloads it on first run via the `fetcher`
|
||||
/// feature; subsequent runs hit the cache. The cache dir is
|
||||
/// `$CRAWLER_CHROMIUM_DIR` if set, else `$HOME/.cache/mangalord/chromium`,
|
||||
/// else `./.chromium-cache` as a last-resort repo-local fallback.
|
||||
pub async fn launch(options: LaunchOptions) -> anyhow::Result<Handle> {
|
||||
let cache = cache_dir()?;
|
||||
tokio::fs::create_dir_all(&cache)
|
||||
.await
|
||||
.with_context(|| format!("create cache dir {}", cache.display()))?;
|
||||
|
||||
let fetcher = BrowserFetcher::new(
|
||||
BrowserFetcherOptions::builder()
|
||||
.with_path(&cache)
|
||||
.build()
|
||||
.map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?,
|
||||
);
|
||||
tracing::info!(path = %cache.display(), "ensuring chromium revision is present");
|
||||
let info = fetcher
|
||||
.fetch()
|
||||
.await
|
||||
.context("download chromium via fetcher")?;
|
||||
tracing::info!(executable = %info.executable_path.display(), "chromium ready");
|
||||
|
||||
let mut builder = BrowserConfig::builder()
|
||||
.chrome_executable(info.executable_path)
|
||||
// Linux containers / CI commonly lack the user namespaces
|
||||
// Chromium's sandbox wants. Disable it; the crawler runs in its
|
||||
// own container anyway.
|
||||
.arg("--no-sandbox")
|
||||
.arg("--disable-dev-shm-usage");
|
||||
for arg in &options.extra_args {
|
||||
builder = builder.arg(arg);
|
||||
}
|
||||
if matches!(options.mode, BrowserMode::Headed) {
|
||||
builder = builder.with_head();
|
||||
}
|
||||
tracing::info!(
|
||||
mode = ?options.mode,
|
||||
extra_args = ?options.extra_args,
|
||||
"building browser config"
|
||||
);
|
||||
let config = builder
|
||||
.build()
|
||||
.map_err(|e| anyhow::anyhow!("browser config: {e}"))?;
|
||||
|
||||
let (browser, mut handler) = Browser::launch(config)
|
||||
.await
|
||||
.context("launch chromium")?;
|
||||
|
||||
let driver = tokio::spawn(async move {
|
||||
while let Some(event) = handler.next().await {
|
||||
if let Err(err) = event {
|
||||
tracing::warn!(?err, "chromium handler event error");
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Ok(Handle { browser, driver })
|
||||
}
|
||||
|
||||
fn cache_dir() -> anyhow::Result<PathBuf> {
|
||||
if let Ok(dir) = std::env::var("CRAWLER_CHROMIUM_DIR") {
|
||||
return Ok(PathBuf::from(dir));
|
||||
}
|
||||
if let Ok(home) = std::env::var("HOME") {
|
||||
return Ok(PathBuf::from(home).join(".cache/mangalord/chromium"));
|
||||
}
|
||||
Ok(PathBuf::from("./.chromium-cache"))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Two flags separated by a single space come back as two entries.
    #[test]
    fn parse_args_splits_on_whitespace() {
        assert_eq!(
            parse_args("--lang=de-DE --window-size=1280,800"),
            vec!["--lang=de-DE", "--window-size=1280,800"]
        );
    }

    /// Mixed and repeated separators are collapsed.
    #[test]
    fn parse_args_tolerates_irregular_whitespace() {
        // Leading spaces, a tab, a run of several spaces and a trailing
        // newline must all act as plain separators.
        assert_eq!(
            parse_args("  --a\t--b   --c=1\n"),
            vec!["--a", "--b", "--c=1"]
        );
    }

    /// Empty and whitespace-only inputs produce no flags.
    #[test]
    fn parse_args_empty_string_yields_empty_vec() {
        assert!(parse_args("").is_empty());
        assert!(parse_args("  \t\n").is_empty());
    }
}
|
||||
Reference in New Issue
Block a user