feat(crawler): honour CRAWLER_LIMIT in the in-process daemon (0.47.0)
The CLI binary already capped runs at CRAWLER_LIMIT mangas, but the daemon's RealMetadataPass passed a hardcoded `0` (no cap) to `pipeline::run_metadata_pass`, so the env var was silently ignored once the daemon took over the metadata pass. Adds `manga_limit` to `CrawlerConfig`, reads it from `CRAWLER_LIMIT` (default 0 = no cap), and threads it through `RealMetadataPass::run` so a daemon-driven sweep stops at the same boundary as a CLI run.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -74,6 +74,10 @@ CRAWLER_DOWNLOAD_ALLOWLIST=
|
|||||||
CRAWLER_ALLOW_ANY_HOST=false
|
CRAWLER_ALLOW_ANY_HOST=false
|
||||||
# Hard cap on a single image body. Default 32 MiB.
|
# Hard cap on a single image body. Default 32 MiB.
|
||||||
CRAWLER_MAX_IMAGE_BYTES=33554432
|
CRAWLER_MAX_IMAGE_BYTES=33554432
|
||||||
|
# Max manga detail fetches per metadata pass (both the in-process daemon
|
||||||
|
# and the `bin/crawler` CLI). 0 means no cap — let the source walker run
|
||||||
|
# to completion. Set a non-zero value for capped test runs against a new source.
|
||||||
|
CRAWLER_LIMIT=0
|
||||||
# Path to a system Chromium binary. When set, the crawler skips the
|
# Path to a system Chromium binary. When set, the crawler skips the
|
||||||
# bundled-fetcher download. Required on platforms without a usable
|
# bundled-fetcher download. Required on platforms without a usable
|
||||||
# upstream Chromium build (notably Linux_arm64 / Raspberry Pi). On
|
# upstream Chromium build (notably Linux_arm64 / Raspberry Pi). On
|
||||||
|
|||||||
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.46.0"
|
version = "0.47.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"argon2",
|
"argon2",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.46.0"
|
version = "0.47.0"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
default-run = "mangalord"
|
default-run = "mangalord"
|
||||||
|
|
||||||
|
|||||||
@@ -185,6 +185,7 @@ async fn spawn_crawler_daemon(
|
|||||||
http: http.clone(),
|
http: http.clone(),
|
||||||
rate: Arc::clone(&rate),
|
rate: Arc::clone(&rate),
|
||||||
start_url: url.clone(),
|
start_url: url.clone(),
|
||||||
|
manga_limit: cfg.manga_limit,
|
||||||
download_allowlist: cfg.download_allowlist.clone(),
|
download_allowlist: cfg.download_allowlist.clone(),
|
||||||
max_image_bytes: cfg.max_image_bytes,
|
max_image_bytes: cfg.max_image_bytes,
|
||||||
tor: tor.as_ref().map(Arc::clone),
|
tor: tor.as_ref().map(Arc::clone),
|
||||||
@@ -252,6 +253,7 @@ struct RealMetadataPass {
|
|||||||
http: reqwest::Client,
|
http: reqwest::Client,
|
||||||
rate: Arc<HostRateLimiters>,
|
rate: Arc<HostRateLimiters>,
|
||||||
start_url: String,
|
start_url: String,
|
||||||
|
manga_limit: usize,
|
||||||
download_allowlist: DownloadAllowlist,
|
download_allowlist: DownloadAllowlist,
|
||||||
max_image_bytes: usize,
|
max_image_bytes: usize,
|
||||||
tor: Option<Arc<crate::crawler::tor::TorController>>,
|
tor: Option<Arc<crate::crawler::tor::TorController>>,
|
||||||
@@ -267,7 +269,7 @@ impl MetadataPass for RealMetadataPass {
|
|||||||
&self.http,
|
&self.http,
|
||||||
&self.rate,
|
&self.rate,
|
||||||
&self.start_url,
|
&self.start_url,
|
||||||
0,
|
self.manga_limit,
|
||||||
false,
|
false,
|
||||||
&self.download_allowlist,
|
&self.download_allowlist,
|
||||||
self.max_image_bytes,
|
self.max_image_bytes,
|
||||||
|
|||||||
@@ -119,6 +119,10 @@ pub struct CrawlerConfig {
|
|||||||
pub download_allowlist: DownloadAllowlist,
|
pub download_allowlist: DownloadAllowlist,
|
||||||
/// Hard upper bound on a single image download. Defaults to 32 MiB.
|
/// Hard upper bound on a single image download. Defaults to 32 MiB.
|
||||||
pub max_image_bytes: usize,
|
pub max_image_bytes: usize,
|
||||||
|
/// Max manga detail fetches per metadata pass. `0` means no cap
|
||||||
|
/// (full sweep up to the source's own bound). Sourced from
|
||||||
|
/// `CRAWLER_LIMIT`, mirroring the CLI binary.
|
||||||
|
pub manga_limit: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for CrawlerConfig {
|
impl Default for CrawlerConfig {
|
||||||
@@ -145,6 +149,7 @@ impl Default for CrawlerConfig {
|
|||||||
browser: LaunchOptions::headless(),
|
browser: LaunchOptions::headless(),
|
||||||
download_allowlist: DownloadAllowlist::new(),
|
download_allowlist: DownloadAllowlist::new(),
|
||||||
max_image_bytes: DEFAULT_MAX_IMAGE_BYTES,
|
max_image_bytes: DEFAULT_MAX_IMAGE_BYTES,
|
||||||
|
manga_limit: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -267,6 +272,7 @@ impl CrawlerConfig {
|
|||||||
browser: LaunchOptions::from_env(),
|
browser: LaunchOptions::from_env(),
|
||||||
download_allowlist,
|
download_allowlist,
|
||||||
max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES),
|
max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES),
|
||||||
|
manga_limit: env_usize("CRAWLER_LIMIT", 0),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -340,3 +346,32 @@ fn env_usize(name: &str, default: usize) -> usize {
|
|||||||
.unwrap_or(default)
|
.unwrap_or(default)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use std::sync::Mutex;
|
||||||
|
|
||||||
|
// Serialise env-touching tests so concurrent cargo-test threads don't
|
||||||
|
// race on the process-global env. Re-acquire on poison since a
|
||||||
|
// panicking test still leaves the env in a consistent state for us
|
||||||
|
// (we set/unset within each guard region).
|
||||||
|
static ENV_GUARD: Mutex<()> = Mutex::new(());
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn crawler_limit_env_populates_manga_limit() {
|
||||||
|
let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner());
|
||||||
|
std::env::set_var("CRAWLER_LIMIT", "96");
|
||||||
|
let cfg = CrawlerConfig::from_env().expect("from_env");
|
||||||
|
std::env::remove_var("CRAWLER_LIMIT");
|
||||||
|
assert_eq!(cfg.manga_limit, 96);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn crawler_limit_unset_defaults_to_zero() {
|
||||||
|
let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner());
|
||||||
|
std::env::remove_var("CRAWLER_LIMIT");
|
||||||
|
let cfg = CrawlerConfig::from_env().expect("from_env");
|
||||||
|
assert_eq!(cfg.manga_limit, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "mangalord-frontend",
|
"name": "mangalord-frontend",
|
||||||
"version": "0.46.0",
|
"version": "0.47.0",
|
||||||
"private": true,
|
"private": true,
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
|||||||
Reference in New Issue
Block a user