diff --git a/.env.example b/.env.example index 7be35eb..274ac73 100644 --- a/.env.example +++ b/.env.example @@ -74,6 +74,14 @@ CRAWLER_DOWNLOAD_ALLOWLIST= CRAWLER_ALLOW_ANY_HOST=false # Hard cap on a single image body. Default 32 MiB. CRAWLER_MAX_IMAGE_BYTES=33554432 +# Path to a system Chromium binary. When set, the crawler skips the +# bundled-fetcher download. Required on platforms without a usable +# upstream Chromium build (notably Linux_arm64 / Raspberry Pi). On +# Debian: /usr/bin/chromium-headless-shell or /usr/bin/chromium. On +# Ubuntu the package is chromium-browser (different path). Pair with +# `docker compose build --build-arg INSTALL_CHROMIUM=true backend` so +# the image actually contains the binary. +CRAWLER_CHROMIUM_BINARY= # ----- Frontend ----- # The frontend container runs SvelteKit's Node adapter on :3000 and diff --git a/backend/Cargo.lock b/backend/Cargo.lock index d41e0a2..920feaa 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.44.0" +version = "0.45.0" dependencies = [ "anyhow", "argon2", diff --git a/backend/Cargo.toml b/backend/Cargo.toml index 33a8ddb..b826569 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.44.0" +version = "0.45.0" edition = "2021" default-run = "mangalord" diff --git a/backend/Dockerfile b/backend/Dockerfile index fd4f87c..065269e 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -25,8 +25,23 @@ FROM debian:trixie-slim # binary ("GLIBC_2.39 not found"). Keep these two in lockstep on bumps. # `curl` is for the container HEALTHCHECK; `ca-certificates` is for # outbound HTTPS (crawler covers/pages). 
+# +# INSTALL_CHROMIUM is an opt-in for deployments that can't use the +# chromiumoxide fetcher path (notably Linux_arm64 / Raspberry Pi, where +# the upstream snapshot bucket has no usable build). When `true`, adds +# Debian's apt-packaged headless chromium plus a baseline font set — +# pair with `CRAWLER_CHROMIUM_BINARY=/usr/bin/chromium-headless-shell` +# at runtime so the launcher uses it. Default `false` keeps cloud/x86 +# images slim. +# +# Build the Pi image with: +# docker compose build --build-arg INSTALL_CHROMIUM=true backend +ARG INSTALL_CHROMIUM=false RUN apt-get update \ && apt-get install -y --no-install-recommends ca-certificates curl \ + && if [ "$INSTALL_CHROMIUM" = "true" ]; then \ + apt-get install -y --no-install-recommends chromium-headless-shell fonts-liberation; \ + fi \ && rm -rf /var/lib/apt/lists/* # Non-root runtime user. The API binary doesn't need any root diff --git a/backend/src/crawler/browser.rs b/backend/src/crawler/browser.rs index 08ea598..c9981ea 100644 --- a/backend/src/crawler/browser.rs +++ b/backend/src/crawler/browser.rs @@ -1,10 +1,17 @@ //! Chromium launcher and lifecycle. //! -//! Uses `chromiumoxide`'s `fetcher` feature so we don't depend on a -//! system Chrome install — first call downloads a known-good revision -//! into a cache dir and reuses it forever after. `BrowserMode` toggles -//! headed vs headless; the headed path needs a display (real `$DISPLAY` -//! or `xvfb-run`). +//! By default uses `chromiumoxide`'s `fetcher` feature — first call +//! downloads a known-good revision into a cache dir and reuses it +//! forever after. Set `CRAWLER_CHROMIUM_BINARY` to skip the fetcher +//! and use a system-installed Chromium instead; required on platforms +//! where the upstream snapshot bucket has no usable build (notably +//! `Linux_arm64` / Raspberry Pi). Debian's package is at +//! `/usr/bin/chromium` or `/usr/bin/chromium-headless-shell`; Ubuntu +//! 
ships it as `chromium-browser` at a different path — don't paste +//! the wrong one. +//! +//! `BrowserMode` toggles headed vs headless; the headed path needs a +//! display (real `$DISPLAY` or `xvfb-run`). //! //! Extra Chromium command-line flags can be supplied through //! [`LaunchOptions::extra_args`] in code, or via the @@ -165,31 +172,41 @@ where } } -/// Launches Chromium. Downloads it on first run via the `fetcher` -/// feature; subsequent runs hit the cache. The cache dir is +/// Launches Chromium. If `CRAWLER_CHROMIUM_BINARY` is set, uses that +/// path directly. Otherwise downloads via the `fetcher` feature on +/// first run and hits the cache after that. The fetcher cache dir is /// `$CRAWLER_CHROMIUM_DIR` if set, else `$HOME/.cache/mangalord/chromium`, /// else `./.chromium-cache` as a last-resort repo-local fallback. pub async fn launch(options: LaunchOptions) -> anyhow::Result { - let cache = cache_dir()?; - tokio::fs::create_dir_all(&cache) - .await - .with_context(|| format!("create cache dir {}", cache.display()))?; + let executable = match system_chromium_path_from_env() { + Some(path) => { + tracing::info!(path = %path.display(), "using system chromium (CRAWLER_CHROMIUM_BINARY)"); + path + } + None => { + let cache = cache_dir()?; + tokio::fs::create_dir_all(&cache) + .await + .with_context(|| format!("create cache dir {}", cache.display()))?; - let fetcher = BrowserFetcher::new( - BrowserFetcherOptions::builder() - .with_path(&cache) - .build() - .map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?, - ); - tracing::info!(path = %cache.display(), "ensuring chromium revision is present"); - let info = fetcher - .fetch() - .await - .context("download chromium via fetcher")?; - tracing::info!(executable = %info.executable_path.display(), "chromium ready"); + let fetcher = BrowserFetcher::new( + BrowserFetcherOptions::builder() + .with_path(&cache) + .build() + .map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?, + ); + tracing::info!(path = 
%cache.display(), "ensuring chromium revision is present"); + let info = fetcher + .fetch() + .await + .context("download chromium via fetcher")?; + tracing::info!(executable = %info.executable_path.display(), "chromium ready"); + info.executable_path + } + }; let mut builder = BrowserConfig::builder() - .chrome_executable(info.executable_path) + .chrome_executable(executable) // Linux containers / CI commonly lack the user namespaces // Chromium's sandbox wants. Disable it; the crawler runs in its // own container anyway. @@ -246,6 +263,24 @@ fn cache_dir() -> anyhow::Result<PathBuf> { Ok(PathBuf::from("./.chromium-cache")) } +/// Reads `CRAWLER_CHROMIUM_BINARY` and delegates to the pure helper. +/// Thin wrapper kept separate so the decision logic can be unit-tested +/// without mutating the process environment. +fn system_chromium_path_from_env() -> Option<PathBuf> { + system_chromium_path_from_value(std::env::var_os("CRAWLER_CHROMIUM_BINARY").as_deref()) +} + +/// Returns `Some(path)` only when the value is set and non-empty. An +/// exported-but-blank var (common in compose `${VAR:-}` patterns when +/// the operator didn't fill it in) must behave like "unset" — otherwise +/// we'd hand chromiumoxide an empty path and fail launch in a confusing +/// way. 
+pub(crate) fn system_chromium_path_from_value( + raw: Option<&std::ffi::OsStr>, +) -> Option<PathBuf> { + raw.filter(|v| !v.is_empty()).map(PathBuf::from) +} + #[cfg(test)] mod tests { use super::*; @@ -273,6 +308,33 @@ mod tests { assert!(parse_args(" \t\n").is_empty()); } + #[test] + fn system_chromium_path_returns_some_when_value_set() { + let raw = std::ffi::OsString::from("/usr/bin/chromium-headless-shell"); + assert_eq!( + system_chromium_path_from_value(Some(raw.as_os_str())), + Some(PathBuf::from("/usr/bin/chromium-headless-shell")) + ); + } + + #[test] + fn system_chromium_path_returns_none_when_unset() { + assert_eq!(system_chromium_path_from_value(None), None); + } + + #[test] + fn system_chromium_path_treats_empty_as_unset() { + // Compose's `${VAR:-}` substitution produces an exported-but-empty + // env var when the operator left it blank. Treat it as unset so + // the launcher falls back to the fetcher path instead of handing + // chromiumoxide an empty path. + let raw = std::ffi::OsString::from(""); + assert_eq!( + system_chromium_path_from_value(Some(raw.as_os_str())), + None + ); + } + #[test] fn default_launch_options_are_headless() { // Headless is the production-safe default — no display required, diff --git a/backend/tests/crawler_browser_smoke.rs b/backend/tests/crawler_browser_smoke.rs index 1323618..b2df9cf 100644 --- a/backend/tests/crawler_browser_smoke.rs +++ b/backend/tests/crawler_browser_smoke.rs @@ -10,6 +10,11 @@ //! //! Override the cache location with `CRAWLER_CHROMIUM_DIR=/some/path` if //! `$HOME/.cache/mangalord/chromium` isn't writable. +//! +//! Set `CRAWLER_CHROMIUM_BINARY=/usr/bin/chromium-headless-shell` (or +//! another system chromium path) to exercise the system-chromium +//! launch path instead of the fetcher download — this is the path the +//! Raspberry Pi deployment takes. 
use mangalord::crawler::browser::{self, LaunchOptions}; diff --git a/docker-compose.yml b/docker-compose.yml index f9257e0..a02a992 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -39,6 +39,11 @@ services: # Upload limits. MAX_REQUEST_BYTES: ${MAX_REQUEST_BYTES:-209715200} MAX_FILE_BYTES: ${MAX_FILE_BYTES:-20971520} + # System-chromium override for the crawler. Leave blank to use the + # bundled fetcher; set to e.g. /usr/bin/chromium-headless-shell on + # arm64 deployments. Pair with `--build-arg INSTALL_CHROMIUM=true` + # so the image actually contains the binary. + CRAWLER_CHROMIUM_BINARY: ${CRAWLER_CHROMIUM_BINARY:-} volumes: - storage-data:/var/lib/mangalord/storage # No host port mapping in the default setup — the frontend proxies diff --git a/frontend/package.json b/frontend/package.json index 66eb934..52180a0 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "mangalord-frontend", - "version": "0.44.0", + "version": "0.45.0", "private": true, "type": "module", "scripts": {