Compare commits
2 Commits
main...feat/crawl
| Author | SHA1 | Date | |
|---|---|---|---|
| | 2f9037e210 | | |
| | 0b5f5d1692 | | |
@@ -74,6 +74,14 @@ CRAWLER_DOWNLOAD_ALLOWLIST=
|
||||
CRAWLER_ALLOW_ANY_HOST=false
|
||||
# Hard cap on a single image body. Default 32 MiB.
|
||||
CRAWLER_MAX_IMAGE_BYTES=33554432
|
||||
# Path to a system Chromium binary. When set, the crawler skips the
|
||||
# bundled-fetcher download. Required on platforms without a usable
|
||||
# upstream Chromium build (notably Linux_arm64 / Raspberry Pi). On
|
||||
# Debian: /usr/bin/chromium-headless-shell or /usr/bin/chromium. On
|
||||
# Ubuntu the package is chromium-browser (different path). Pair with
|
||||
# `docker compose build --build-arg INSTALL_CHROMIUM=true backend` so
|
||||
# the image actually contains the binary.
|
||||
CRAWLER_CHROMIUM_BINARY=
|
||||
|
||||
# ----- Frontend -----
|
||||
# The frontend container runs SvelteKit's Node adapter on :3000 and
|
||||
|
||||
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||
|
||||
[[package]]
|
||||
name = "mangalord"
|
||||
version = "0.44.0"
|
||||
version = "0.45.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"argon2",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "mangalord"
|
||||
version = "0.44.0"
|
||||
version = "0.45.0"
|
||||
edition = "2021"
|
||||
default-run = "mangalord"
|
||||
|
||||
|
||||
@@ -25,8 +25,23 @@ FROM debian:trixie-slim
|
||||
# binary ("GLIBC_2.39 not found"). Keep these two in lockstep on bumps.
|
||||
# `curl` is for the container HEALTHCHECK; `ca-certificates` is for
|
||||
# outbound HTTPS (crawler covers/pages).
|
||||
#
|
||||
# INSTALL_CHROMIUM is an opt-in for deployments that can't use the
|
||||
# chromiumoxide fetcher path (notably Linux_arm64 / Raspberry Pi, where
|
||||
# the upstream snapshot bucket has no usable build). When `true`, adds
|
||||
# Debian's apt-packaged headless chromium plus a baseline font set —
|
||||
# pair with `CRAWLER_CHROMIUM_BINARY=/usr/bin/chromium-headless-shell`
|
||||
# at runtime so the launcher uses it. Default `false` keeps cloud/x86
|
||||
# images slim.
|
||||
#
|
||||
# Build the Pi image with:
|
||||
# docker compose build --build-arg INSTALL_CHROMIUM=true backend
|
||||
ARG INSTALL_CHROMIUM=false
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends ca-certificates curl \
|
||||
&& if [ "$INSTALL_CHROMIUM" = "true" ]; then \
|
||||
apt-get install -y --no-install-recommends chromium-headless-shell fonts-liberation; \
|
||||
fi \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Non-root runtime user. The API binary doesn't need any root
|
||||
|
||||
@@ -1,10 +1,17 @@
|
||||
//! Chromium launcher and lifecycle.
|
||||
//!
|
||||
//! Uses `chromiumoxide`'s `fetcher` feature so we don't depend on a
|
||||
//! system Chrome install — first call downloads a known-good revision
|
||||
//! into a cache dir and reuses it forever after. `BrowserMode` toggles
|
||||
//! headed vs headless; the headed path needs a display (real `$DISPLAY`
|
||||
//! or `xvfb-run`).
|
||||
//! By default uses `chromiumoxide`'s `fetcher` feature — first call
|
||||
//! downloads a known-good revision into a cache dir and reuses it
|
||||
//! forever after. Set `CRAWLER_CHROMIUM_BINARY` to skip the fetcher
|
||||
//! and use a system-installed Chromium instead; required on platforms
|
||||
//! where the upstream snapshot bucket has no usable build (notably
|
||||
//! `Linux_arm64` / Raspberry Pi). Debian's package is at
|
||||
//! `/usr/bin/chromium` or `/usr/bin/chromium-headless-shell`; Ubuntu
|
||||
//! ships it as `chromium-browser` at a different path — don't paste
|
||||
//! the wrong one.
|
||||
//!
|
||||
//! `BrowserMode` toggles headed vs headless; the headed path needs a
|
||||
//! display (real `$DISPLAY` or `xvfb-run`).
|
||||
//!
|
||||
//! Extra Chromium command-line flags can be supplied through
|
||||
//! [`LaunchOptions::extra_args`] in code, or via the
|
||||
@@ -165,31 +172,41 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/// Launches Chromium. Downloads it on first run via the `fetcher`
|
||||
/// feature; subsequent runs hit the cache. The cache dir is
|
||||
/// Launches Chromium. If `CRAWLER_CHROMIUM_BINARY` is set, uses that
|
||||
/// path directly. Otherwise downloads via the `fetcher` feature on
|
||||
/// first run and hits the cache after that. The fetcher cache dir is
|
||||
/// `$CRAWLER_CHROMIUM_DIR` if set, else `$HOME/.cache/mangalord/chromium`,
|
||||
/// else `./.chromium-cache` as a last-resort repo-local fallback.
|
||||
pub async fn launch(options: LaunchOptions) -> anyhow::Result<Handle> {
|
||||
let cache = cache_dir()?;
|
||||
tokio::fs::create_dir_all(&cache)
|
||||
.await
|
||||
.with_context(|| format!("create cache dir {}", cache.display()))?;
|
||||
let executable = match system_chromium_path_from_env() {
|
||||
Some(path) => {
|
||||
tracing::info!(path = %path.display(), "using system chromium (CRAWLER_CHROMIUM_BINARY)");
|
||||
path
|
||||
}
|
||||
None => {
|
||||
let cache = cache_dir()?;
|
||||
tokio::fs::create_dir_all(&cache)
|
||||
.await
|
||||
.with_context(|| format!("create cache dir {}", cache.display()))?;
|
||||
|
||||
let fetcher = BrowserFetcher::new(
|
||||
BrowserFetcherOptions::builder()
|
||||
.with_path(&cache)
|
||||
.build()
|
||||
.map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?,
|
||||
);
|
||||
tracing::info!(path = %cache.display(), "ensuring chromium revision is present");
|
||||
let info = fetcher
|
||||
.fetch()
|
||||
.await
|
||||
.context("download chromium via fetcher")?;
|
||||
tracing::info!(executable = %info.executable_path.display(), "chromium ready");
|
||||
let fetcher = BrowserFetcher::new(
|
||||
BrowserFetcherOptions::builder()
|
||||
.with_path(&cache)
|
||||
.build()
|
||||
.map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?,
|
||||
);
|
||||
tracing::info!(path = %cache.display(), "ensuring chromium revision is present");
|
||||
let info = fetcher
|
||||
.fetch()
|
||||
.await
|
||||
.context("download chromium via fetcher")?;
|
||||
tracing::info!(executable = %info.executable_path.display(), "chromium ready");
|
||||
info.executable_path
|
||||
}
|
||||
};
|
||||
|
||||
let mut builder = BrowserConfig::builder()
|
||||
.chrome_executable(info.executable_path)
|
||||
.chrome_executable(executable)
|
||||
// Linux containers / CI commonly lack the user namespaces
|
||||
// Chromium's sandbox wants. Disable it; the crawler runs in its
|
||||
// own container anyway.
|
||||
@@ -246,6 +263,24 @@ fn cache_dir() -> anyhow::Result<PathBuf> {
|
||||
Ok(PathBuf::from("./.chromium-cache"))
|
||||
}
|
||||
|
||||
/// Reads `CRAWLER_CHROMIUM_BINARY` and delegates to the pure helper.
|
||||
/// Thin wrapper kept separate so the decision logic can be unit-tested
|
||||
/// without mutating the process environment.
|
||||
fn system_chromium_path_from_env() -> Option<PathBuf> {
|
||||
system_chromium_path_from_value(std::env::var_os("CRAWLER_CHROMIUM_BINARY").as_deref())
|
||||
}
|
||||
|
||||
/// Returns `Some(path)` only when the value is set and non-empty. An
|
||||
/// exported-but-blank var (common in compose `${VAR:-}` patterns when
|
||||
/// the operator didn't fill it in) must behave like "unset" — otherwise
|
||||
/// we'd hand chromiumoxide an empty path and fail launch in a confusing
|
||||
/// way.
|
||||
pub(crate) fn system_chromium_path_from_value(
|
||||
raw: Option<&std::ffi::OsStr>,
|
||||
) -> Option<PathBuf> {
|
||||
raw.filter(|v| !v.is_empty()).map(PathBuf::from)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -273,6 +308,33 @@ mod tests {
|
||||
assert!(parse_args(" \t\n").is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn system_chromium_path_returns_some_when_value_set() {
|
||||
let raw = std::ffi::OsString::from("/usr/bin/chromium-headless-shell");
|
||||
assert_eq!(
|
||||
system_chromium_path_from_value(Some(raw.as_os_str())),
|
||||
Some(PathBuf::from("/usr/bin/chromium-headless-shell"))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn system_chromium_path_returns_none_when_unset() {
|
||||
assert_eq!(system_chromium_path_from_value(None), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn system_chromium_path_treats_empty_as_unset() {
|
||||
// Compose's `${VAR:-}` substitution produces an exported-but-empty
|
||||
// env var when the operator left it blank. Treat it as unset so
|
||||
// the launcher falls back to the fetcher path instead of handing
|
||||
// chromiumoxide an empty path.
|
||||
let raw = std::ffi::OsString::from("");
|
||||
assert_eq!(
|
||||
system_chromium_path_from_value(Some(raw.as_os_str())),
|
||||
None
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn default_launch_options_are_headless() {
|
||||
// Headless is the production-safe default — no display required,
|
||||
|
||||
@@ -10,6 +10,11 @@
|
||||
//!
|
||||
//! Override the cache location with `CRAWLER_CHROMIUM_DIR=/some/path` if
|
||||
//! `$HOME/.cache/mangalord/chromium` isn't writable.
|
||||
//!
|
||||
//! Set `CRAWLER_CHROMIUM_BINARY=/usr/bin/chromium-headless-shell` (or
|
||||
//! another system chromium path) to exercise the system-chromium
|
||||
//! launch path instead of the fetcher download — this is the path the
|
||||
//! Raspberry Pi deployment takes.
|
||||
|
||||
use mangalord::crawler::browser::{self, LaunchOptions};
|
||||
|
||||
|
||||
@@ -39,6 +39,11 @@ services:
|
||||
# Upload limits.
|
||||
MAX_REQUEST_BYTES: ${MAX_REQUEST_BYTES:-209715200}
|
||||
MAX_FILE_BYTES: ${MAX_FILE_BYTES:-20971520}
|
||||
# System-chromium override for the crawler. Leave blank to use the
|
||||
# bundled fetcher; set to e.g. /usr/bin/chromium-headless-shell on
|
||||
# arm64 deployments. Pair with `--build-arg INSTALL_CHROMIUM=true`
|
||||
# so the image actually contains the binary.
|
||||
CRAWLER_CHROMIUM_BINARY: ${CRAWLER_CHROMIUM_BINARY:-}
|
||||
volumes:
|
||||
- storage-data:/var/lib/mangalord/storage
|
||||
# No host port mapping in the default setup — the frontend proxies
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "mangalord-frontend",
|
||||
"version": "0.44.0",
|
||||
"version": "0.45.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
|
||||
Reference in New Issue
Block a user