From 72756cfef201339d48e325584aa3914649be575e Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Mon, 1 Jun 2026 20:07:01 +0200 Subject: [PATCH] feat(crawler): honour CRAWLER_LIMIT in the in-process daemon (0.47.0) The CLI binary already capped runs at CRAWLER_LIMIT mangas, but the daemon's RealMetadataPass passed a hardcoded `0` (no cap) to `pipeline::run_metadata_pass`, so the env var was silently ignored once the daemon took over the metadata pass. Adds `manga_limit` to `CrawlerConfig`, reads it from `CRAWLER_LIMIT` (default 0 = no cap), and threads it through `RealMetadataPass::run` so a daemon-driven sweep stops at the same boundary as a CLI run. Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.example | 4 ++++ backend/Cargo.lock | 2 +- backend/Cargo.toml | 2 +- backend/src/app.rs | 4 +++- backend/src/config.rs | 35 +++++++++++++++++++++++++++++++++++ frontend/package.json | 2 +- 6 files changed, 45 insertions(+), 4 deletions(-) diff --git a/.env.example b/.env.example index 406f54d..b3f6da3 100644 --- a/.env.example +++ b/.env.example @@ -74,6 +74,10 @@ CRAWLER_DOWNLOAD_ALLOWLIST= CRAWLER_ALLOW_ANY_HOST=false # Hard cap on a single image body. Default 32 MiB. CRAWLER_MAX_IMAGE_BYTES=33554432 +# Max manga detail fetches per metadata pass (both the in-process daemon +# and the `bin/crawler` CLI). 0 means no cap — let the source walker run +# to completion. Useful for capped test runs against a new source. +CRAWLER_LIMIT=0 # Path to a system Chromium binary. When set, the crawler skips the # bundled-fetcher download. Required on platforms without a usable # upstream Chromium build (notably Linux_arm64 / Raspberry Pi). On diff --git a/backend/Cargo.lock b/backend/Cargo.lock index ed7a599..86c58d5 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.46.0" +version = "0.47.0" dependencies = [ "anyhow", "argon2", diff --git a/backend/Cargo.toml b/backend/Cargo.toml index 5b13f53..4ffc57d 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.46.0" +version = "0.47.0" edition = "2021" default-run = "mangalord" diff --git a/backend/src/app.rs b/backend/src/app.rs index 8288ca5..331d95d 100644 --- a/backend/src/app.rs +++ b/backend/src/app.rs @@ -185,6 +185,7 @@ async fn spawn_crawler_daemon( http: http.clone(), rate: Arc::clone(&rate), start_url: url.clone(), + manga_limit: cfg.manga_limit, download_allowlist: cfg.download_allowlist.clone(), max_image_bytes: cfg.max_image_bytes, tor: tor.as_ref().map(Arc::clone), @@ -252,6 +253,7 @@ struct RealMetadataPass { http: reqwest::Client, rate: Arc, start_url: String, + manga_limit: usize, download_allowlist: DownloadAllowlist, max_image_bytes: usize, tor: Option>, @@ -267,7 +269,7 @@ impl MetadataPass for RealMetadataPass { &self.http, &self.rate, &self.start_url, - 0, + self.manga_limit, false, &self.download_allowlist, self.max_image_bytes, diff --git a/backend/src/config.rs b/backend/src/config.rs index 9f68a83..fb0821d 100644 --- a/backend/src/config.rs +++ b/backend/src/config.rs @@ -119,6 +119,10 @@ pub struct CrawlerConfig { pub download_allowlist: DownloadAllowlist, /// Hard upper bound on a single image download. Defaults to 32 MiB. pub max_image_bytes: usize, + /// Max manga detail fetches per metadata pass. `0` means no cap + /// (full sweep up to the source's own bound). Sourced from + /// `CRAWLER_LIMIT`, mirroring the CLI binary. + pub manga_limit: usize, } impl Default for CrawlerConfig { @@ -145,6 +149,7 @@ impl Default for CrawlerConfig { browser: LaunchOptions::headless(), download_allowlist: DownloadAllowlist::new(), max_image_bytes: DEFAULT_MAX_IMAGE_BYTES, + manga_limit: 0, } } } @@ -267,6 +272,7 @@ impl CrawlerConfig { browser: LaunchOptions::from_env(), download_allowlist, max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES), + manga_limit: env_usize("CRAWLER_LIMIT", 0), }) } } @@ -340,3 +346,32 @@ fn env_usize(name: &str, default: usize) -> usize { .unwrap_or(default) } +#[cfg(test)] +mod tests { + use super::*; + use std::sync::Mutex; + + // Serialise env-touching tests so concurrent cargo-test threads don't + // race on the process-global env. Re-acquire on poison since a + // panicking test still leaves the env in a consistent state for us + // (we set/unset within each guard region). + static ENV_GUARD: Mutex<()> = Mutex::new(()); + + #[test] + fn crawler_limit_env_populates_manga_limit() { + let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner()); + std::env::set_var("CRAWLER_LIMIT", "96"); + let cfg = CrawlerConfig::from_env().expect("from_env"); + std::env::remove_var("CRAWLER_LIMIT"); + assert_eq!(cfg.manga_limit, 96); + } + + #[test] + fn crawler_limit_unset_defaults_to_zero() { + let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner()); + std::env::remove_var("CRAWLER_LIMIT"); + let cfg = CrawlerConfig::from_env().expect("from_env"); + assert_eq!(cfg.manga_limit, 0); + } +} + diff --git a/frontend/package.json b/frontend/package.json index 5493c34..c25b253 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "mangalord-frontend", - "version": "0.46.0", + "version": "0.47.0", "private": true, "type": "module", "scripts": {