feat(crawler): honour CRAWLER_LIMIT in the in-process daemon (0.47.0)

The CLI binary already capped runs at CRAWLER_LIMIT mangas, but the
daemon's RealMetadataPass passed a hardcoded `0` (no cap) to
`pipeline::run_metadata_pass`, so the env var was silently ignored once
the daemon took over the metadata pass.

Adds `manga_limit` to `CrawlerConfig`, reads it from `CRAWLER_LIMIT`
(default 0 = no cap), and threads it through `RealMetadataPass::run`
so a daemon-driven sweep stops at the same boundary as a CLI run.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-06-01 20:07:01 +02:00
parent 4e20350645
commit 72756cfef2
6 changed files with 45 additions and 4 deletions

View File

@@ -74,6 +74,10 @@ CRAWLER_DOWNLOAD_ALLOWLIST=
CRAWLER_ALLOW_ANY_HOST=false
# Hard cap on a single image body. Default 32 MiB.
CRAWLER_MAX_IMAGE_BYTES=33554432
# Max manga detail fetches per metadata pass (applies to both the
# in-process daemon and the `bin/crawler` CLI). 0 means no cap — the source
# walker runs to completion. Set a non-zero value for capped test runs
# against a new source.
CRAWLER_LIMIT=0
# Path to a system Chromium binary. When set, the crawler skips the
# bundled-fetcher download. Required on platforms without a usable
# upstream Chromium build (notably Linux_arm64 / Raspberry Pi). On

2
backend/Cargo.lock generated
View File

@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "mangalord"
version = "0.46.0"
version = "0.47.0"
dependencies = [
"anyhow",
"argon2",

View File

@@ -1,6 +1,6 @@
[package]
name = "mangalord"
version = "0.46.0"
version = "0.47.0"
edition = "2021"
default-run = "mangalord"

View File

@@ -185,6 +185,7 @@ async fn spawn_crawler_daemon(
http: http.clone(),
rate: Arc::clone(&rate),
start_url: url.clone(),
manga_limit: cfg.manga_limit,
download_allowlist: cfg.download_allowlist.clone(),
max_image_bytes: cfg.max_image_bytes,
tor: tor.as_ref().map(Arc::clone),
@@ -252,6 +253,7 @@ struct RealMetadataPass {
http: reqwest::Client,
rate: Arc<HostRateLimiters>,
start_url: String,
manga_limit: usize,
download_allowlist: DownloadAllowlist,
max_image_bytes: usize,
tor: Option<Arc<crate::crawler::tor::TorController>>,
@@ -267,7 +269,7 @@ impl MetadataPass for RealMetadataPass {
&self.http,
&self.rate,
&self.start_url,
0,
self.manga_limit,
false,
&self.download_allowlist,
self.max_image_bytes,

View File

@@ -119,6 +119,10 @@ pub struct CrawlerConfig {
pub download_allowlist: DownloadAllowlist,
/// Hard upper bound on a single image download. Defaults to 32 MiB.
pub max_image_bytes: usize,
/// Max manga detail fetches per metadata pass. `0` means no cap
/// (full sweep up to the source's own bound). Sourced from
/// `CRAWLER_LIMIT`, mirroring the CLI binary.
pub manga_limit: usize,
}
impl Default for CrawlerConfig {
@@ -145,6 +149,7 @@ impl Default for CrawlerConfig {
browser: LaunchOptions::headless(),
download_allowlist: DownloadAllowlist::new(),
max_image_bytes: DEFAULT_MAX_IMAGE_BYTES,
manga_limit: 0,
}
}
}
@@ -267,6 +272,7 @@ impl CrawlerConfig {
browser: LaunchOptions::from_env(),
download_allowlist,
max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES),
manga_limit: env_usize("CRAWLER_LIMIT", 0),
})
}
}
@@ -340,3 +346,32 @@ fn env_usize(name: &str, default: usize) -> usize {
.unwrap_or(default)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::OsString;
    use std::sync::{Mutex, MutexGuard};

    /// Environment variable exercised by the tests in this module.
    const LIMIT_VAR: &str = "CRAWLER_LIMIT";

    // Serialise env-touching tests so concurrent cargo-test threads don't
    // race on the process-global env. The lock is re-acquired on poison:
    // `ScopedLimit` restores the variable in `Drop`, which also runs while
    // a failed test unwinds, so the env is back in its prior state by the
    // time the next test gets the lock.
    static ENV_GUARD: Mutex<()> = Mutex::new(());

    /// Scope guard that holds `ENV_GUARD`, overrides `CRAWLER_LIMIT`, and
    /// puts the previous value back when dropped.
    ///
    /// `Drop::drop` runs before the struct's fields are dropped, so the
    /// variable is restored while the lock is still held.
    struct ScopedLimit {
        /// Value of `CRAWLER_LIMIT` before the override (`None` = unset).
        previous: Option<OsString>,
        /// Keeps other env-touching tests out for the guard's lifetime.
        _lock: MutexGuard<'static, ()>,
    }

    impl ScopedLimit {
        /// Takes the lock, then sets `CRAWLER_LIMIT` to `value`, or unsets
        /// it when `value` is `None`.
        fn apply(value: Option<&str>) -> Self {
            let lock = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner());
            let previous = std::env::var_os(LIMIT_VAR);
            match value {
                Some(v) => std::env::set_var(LIMIT_VAR, v),
                None => std::env::remove_var(LIMIT_VAR),
            }
            Self {
                previous,
                _lock: lock,
            }
        }
    }

    impl Drop for ScopedLimit {
        fn drop(&mut self) {
            // Runs on normal exit and on unwind, so a failing `expect` or
            // assertion cannot leak the override into later tests.
            match self.previous.take() {
                Some(v) => std::env::set_var(LIMIT_VAR, v),
                None => std::env::remove_var(LIMIT_VAR),
            }
        }
    }

    /// `CRAWLER_LIMIT=96` must surface as `manga_limit == 96`.
    #[test]
    fn crawler_limit_env_populates_manga_limit() {
        let _env = ScopedLimit::apply(Some("96"));
        let cfg = CrawlerConfig::from_env().expect("from_env");
        assert_eq!(cfg.manga_limit, 96);
    }

    /// With `CRAWLER_LIMIT` unset, `manga_limit` must be 0.
    #[test]
    fn crawler_limit_unset_defaults_to_zero() {
        let _env = ScopedLimit::apply(None);
        let cfg = CrawlerConfig::from_env().expect("from_env");
        assert_eq!(cfg.manga_limit, 0);
    }
}

View File

@@ -1,6 +1,6 @@
{
"name": "mangalord-frontend",
"version": "0.46.0",
"version": "0.47.0",
"private": true,
"type": "module",
"scripts": {