feat(crawler): honour CRAWLER_LIMIT in the in-process daemon (0.47.0)
The CLI binary already capped runs at CRAWLER_LIMIT mangas, but the daemon's RealMetadataPass passed a hardcoded `0` (no cap) to `pipeline::run_metadata_pass`, so the env var was silently ignored once the daemon took over the metadata pass. This change adds `manga_limit` to `CrawlerConfig`, reads it from `CRAWLER_LIMIT` (default 0 = no cap), and threads it through `RealMetadataPass::run` so that a daemon-driven sweep stops at the same boundary as a CLI run. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -185,6 +185,7 @@ async fn spawn_crawler_daemon(
|
||||
http: http.clone(),
|
||||
rate: Arc::clone(&rate),
|
||||
start_url: url.clone(),
|
||||
manga_limit: cfg.manga_limit,
|
||||
download_allowlist: cfg.download_allowlist.clone(),
|
||||
max_image_bytes: cfg.max_image_bytes,
|
||||
tor: tor.as_ref().map(Arc::clone),
|
||||
@@ -252,6 +253,7 @@ struct RealMetadataPass {
|
||||
http: reqwest::Client,
|
||||
rate: Arc<HostRateLimiters>,
|
||||
start_url: String,
|
||||
manga_limit: usize,
|
||||
download_allowlist: DownloadAllowlist,
|
||||
max_image_bytes: usize,
|
||||
tor: Option<Arc<crate::crawler::tor::TorController>>,
|
||||
@@ -267,7 +269,7 @@ impl MetadataPass for RealMetadataPass {
|
||||
&self.http,
|
||||
&self.rate,
|
||||
&self.start_url,
|
||||
0,
|
||||
self.manga_limit,
|
||||
false,
|
||||
&self.download_allowlist,
|
||||
self.max_image_bytes,
|
||||
|
||||
@@ -119,6 +119,10 @@ pub struct CrawlerConfig {
|
||||
pub download_allowlist: DownloadAllowlist,
|
||||
/// Hard upper bound on a single image download. Defaults to 32 MiB.
|
||||
pub max_image_bytes: usize,
|
||||
/// Max manga detail fetches per metadata pass. `0` means no cap
|
||||
/// (full sweep up to the source's own bound). Sourced from
|
||||
/// `CRAWLER_LIMIT`, mirroring the CLI binary.
|
||||
pub manga_limit: usize,
|
||||
}
|
||||
|
||||
impl Default for CrawlerConfig {
|
||||
@@ -145,6 +149,7 @@ impl Default for CrawlerConfig {
|
||||
browser: LaunchOptions::headless(),
|
||||
download_allowlist: DownloadAllowlist::new(),
|
||||
max_image_bytes: DEFAULT_MAX_IMAGE_BYTES,
|
||||
manga_limit: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -267,6 +272,7 @@ impl CrawlerConfig {
|
||||
browser: LaunchOptions::from_env(),
|
||||
download_allowlist,
|
||||
max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES),
|
||||
manga_limit: env_usize("CRAWLER_LIMIT", 0),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -340,3 +346,32 @@ fn env_usize(name: &str, default: usize) -> usize {
|
||||
.unwrap_or(default)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::OsString;
    use std::sync::{Mutex, MutexGuard};

    /// Environment variable exercised by the tests in this module.
    const LIMIT_VAR: &str = "CRAWLER_LIMIT";

    // Serialise env-touching tests so concurrent cargo-test threads don't
    // race on the process-global env.
    static ENV_GUARD: Mutex<()> = Mutex::new(());

    /// Acquires [`ENV_GUARD`], recovering the guard if an earlier test
    /// panicked while holding it.
    ///
    /// Recovering from poison is sound here because every mutation of the
    /// environment goes through [`ScopedEnvVar`], whose `Drop` restores the
    /// previous value during unwinding as well as on normal exit.
    fn lock_env() -> MutexGuard<'static, ()> {
        ENV_GUARD
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Overrides (or clears) one environment variable for the lifetime of the
    /// value and puts the prior state back when dropped.
    ///
    /// Declare it *after* the lock guard in a test: locals drop in reverse
    /// declaration order, so the variable is restored while the lock is still
    /// held.
    struct ScopedEnvVar {
        name: &'static str,
        previous: Option<OsString>,
    }

    impl ScopedEnvVar {
        /// Sets `name` to `value`, remembering whatever was there before.
        fn set(name: &'static str, value: &str) -> Self {
            let previous = std::env::var_os(name);
            std::env::set_var(name, value);
            Self { name, previous }
        }

        /// Removes `name` from the environment, remembering whatever was
        /// there before.
        fn unset(name: &'static str) -> Self {
            let previous = std::env::var_os(name);
            std::env::remove_var(name);
            Self { name, previous }
        }
    }

    impl Drop for ScopedEnvVar {
        fn drop(&mut self) {
            // Runs on panic too, so a failing assertion or `expect` cannot
            // leak the override into other tests in the same process.
            match self.previous.take() {
                Some(value) => std::env::set_var(self.name, value),
                None => std::env::remove_var(self.name),
            }
        }
    }

    /// `CRAWLER_LIMIT=96` must surface as `manga_limit == 96`.
    #[test]
    fn crawler_limit_env_populates_manga_limit() {
        let _lock = lock_env();
        let _env = ScopedEnvVar::set(LIMIT_VAR, "96");

        let cfg = CrawlerConfig::from_env().expect("from_env");

        assert_eq!(cfg.manga_limit, 96);
    }

    /// With `CRAWLER_LIMIT` absent, `manga_limit` falls back to `0`.
    #[test]
    fn crawler_limit_unset_defaults_to_zero() {
        let _lock = lock_env();
        let _env = ScopedEnvVar::unset(LIMIT_VAR);

        let cfg = CrawlerConfig::from_env().expect("from_env");

        assert_eq!(cfg.manga_limit, 0);
    }
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user