From a2826d64672cf2b0cd04d8aa89d742ecf9011ff2 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sun, 31 May 2026 14:52:49 +0200 Subject: [PATCH] feat(crawler): CRAWLER_ALLOW_ANY_HOST bypasses the host allowlist (0.44.0) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Operators whose sources shard images across numbered CDN subdomains can't pre-enumerate every host in CRAWLER_DOWNLOAD_ALLOWLIST. The new flag short-circuits the host check in DownloadAllowlist::contains while leaving scheme, localhost, and private-IP defenses in is_safe_url untouched — scraped URLs pointing at 10.x / 169.254.169.254 / file:// stay refused. Default is false; fail-closed posture is preserved unless the operator opts in. Wired into both the server (config::build_download_allowlist) and the bin/crawler.rs one-shot. Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.example | 6 +++ backend/Cargo.lock | 2 +- backend/Cargo.toml | 2 +- backend/src/bin/crawler.rs | 38 ++++++++++-------- backend/src/config.rs | 7 ++++ backend/src/crawler/safety.rs | 74 ++++++++++++++++++++++++++++++++++- frontend/package.json | 2 +- 7 files changed, 111 insertions(+), 20 deletions(-) diff --git a/.env.example b/.env.example index b7da316..7be35eb 100644 --- a/.env.example +++ b/.env.example @@ -66,6 +66,12 @@ MAX_FILE_BYTES=20971520 # to CRAWLER_START_URL's host and CRAWLER_CDN_HOST. Comma-separated. # Defends against SSRF via scraped image URLs. CRAWLER_DOWNLOAD_ALLOWLIST= +# Bypass the host allowlist entirely. Intended for sources that shard +# images across numbered CDN subdomains (cdn1/cdn2/…) where enumerating +# every host upfront is impractical. The private-IP / localhost / non- +# http(s) scheme defenses STAY ON — a scraped +# private-IP URL (e.g. http://10.0.0.1/) is still refused with this flag set. +CRAWLER_ALLOW_ANY_HOST=false # Hard cap on a single image body. Default 32 MiB. 
CRAWLER_MAX_IMAGE_BYTES=33554432 diff --git a/backend/Cargo.lock b/backend/Cargo.lock index caf45c0..d41e0a2 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.43.1" +version = "0.44.0" dependencies = [ "anyhow", "argon2", diff --git a/backend/Cargo.toml b/backend/Cargo.toml index 152dd80..33a8ddb 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.43.1" +version = "0.44.0" edition = "2021" default-run = "mangalord" diff --git a/backend/src/bin/crawler.rs b/backend/src/bin/crawler.rs index 702e218..453a1d8 100644 --- a/backend/src/bin/crawler.rs +++ b/backend/src/bin/crawler.rs @@ -226,24 +226,30 @@ async fn run( // SSRF defence: only download from the catalog host + CDN host // (plus optional CRAWLER_DOWNLOAD_ALLOWLIST extras), and cap // single-image downloads at CRAWLER_MAX_IMAGE_BYTES bytes. - let mut allowlist = - mangalord::crawler::safety::DownloadAllowlist::new(); - if let Ok(parsed) = reqwest::Url::parse(start_url) { - if let Some(h) = parsed.host_str() { - allowlist = allowlist.allow(h); - } - } - if let Some(host) = cdn_host { - allowlist = allowlist.allow(host); - } - if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") { - for piece in extras.split(',') { - let trimmed = piece.trim(); - if !trimmed.is_empty() { - allowlist = allowlist.allow(trimmed); + // CRAWLER_ALLOW_ANY_HOST=true short-circuits the host check for + // sharded-CDN sources; private-IP and scheme guards still apply. 
+ let allowlist = if env_bool("CRAWLER_ALLOW_ANY_HOST", false) { + mangalord::crawler::safety::DownloadAllowlist::allow_any() + } else { + let mut allow = mangalord::crawler::safety::DownloadAllowlist::new(); + if let Ok(parsed) = reqwest::Url::parse(start_url) { + if let Some(h) = parsed.host_str() { + allow = allow.allow(h); } } - } + if let Some(host) = cdn_host { + allow = allow.allow(host); + } + if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") { + for piece in extras.split(',') { + let trimmed = piece.trim(); + if !trimmed.is_empty() { + allow = allow.allow(trimmed); + } + } + } + allow + }; let max_image_bytes: usize = std::env::var("CRAWLER_MAX_IMAGE_BYTES") .ok() .and_then(|s| s.parse().ok()) diff --git a/backend/src/config.rs b/backend/src/config.rs index 183145f..e983d03 100644 --- a/backend/src/config.rs +++ b/backend/src/config.rs @@ -248,10 +248,17 @@ impl CrawlerConfig { /// separated). Empty by default — meaning the crawler refuses to /// download anything when no source is configured, which is the safe /// fail-closed posture. +/// +/// `CRAWLER_ALLOW_ANY_HOST=true` short-circuits the host enumeration +/// for operators whose sources shard across numbered CDN subdomains. +/// Scheme + private-IP defenses still apply. fn build_download_allowlist( start_url: Option<&str>, cdn_host: Option<&str>, ) -> DownloadAllowlist { + if env_bool("CRAWLER_ALLOW_ANY_HOST", false) { + return DownloadAllowlist::allow_any(); + } let mut allow = DownloadAllowlist::new(); if let Some(url) = start_url { if let Ok(parsed) = reqwest::Url::parse(url) { diff --git a/backend/src/crawler/safety.rs b/backend/src/crawler/safety.rs index d9c62ad..deb7f61 100644 --- a/backend/src/crawler/safety.rs +++ b/backend/src/crawler/safety.rs @@ -47,14 +47,33 @@ pub const DEFAULT_MAX_IMAGE_BYTES: usize = 32 * 1024 * 1024; /// configured allowlist. 
None by default — keeping the surface area /// minimal so the only way a URL gets through is if it matches an /// explicit catalog/CDN entry. +/// +/// `allow_any` flips the host check off entirely (private-IP and +/// scheme checks still apply). It exists for operators whose sources +/// shard images across numbered CDN subdomains (`cdn1`, `cdn2`, …) +/// where enumerating each host upfront is impractical. Off by default. #[derive(Clone, Debug, Default)] pub struct DownloadAllowlist { hosts: Vec, + allow_any: bool, } impl DownloadAllowlist { pub fn new() -> Self { - Self { hosts: Vec::new() } + Self { + hosts: Vec::new(), + allow_any: false, + } + } + + /// Bypass the host allowlist. Scheme, localhost, and private-IP + /// checks in [`is_safe_url`] continue to apply — this only opens + /// up public hosts that weren't pre-enumerated. + pub fn allow_any() -> Self { + Self { + hosts: Vec::new(), + allow_any: true, + } } /// Add a host (case-insensitive match). Sub-domains are *not* @@ -73,6 +92,9 @@ impl DownloadAllowlist { } pub fn contains(&self, host: &str) -> bool { + if self.allow_any { + return true; + } let lower = host.to_ascii_lowercase(); self.hosts.iter().any(|h| h == &lower) } @@ -245,6 +267,56 @@ mod tests { DownloadAllowlist::new().allow(host) } + #[test] + fn allow_any_admits_arbitrary_public_host() { + // Operators who can't pre-enumerate a numbered-CDN fleet + // (cdn1, cdn2, …) opt into allow_any. Any public host passes. + let allow = DownloadAllowlist::allow_any(); + assert!(is_safe_url("https://cdn7.random.tld/x.jpg", &allow).is_ok()); + assert!(is_safe_url("https://anything-goes.example/", &allow).is_ok()); + } + + #[test] + fn allow_any_still_blocks_private_ips() { + // The point of the bypass is the host-allowlist check, not the + // SSRF defense. Private/loopback IPs stay refused. 
+ let allow = DownloadAllowlist::allow_any(); + for url in [ + "http://10.0.0.1/", + "http://192.168.1.1/", + "http://169.254.169.254/", + "http://127.0.0.1/", + "http://[::1]/", + "http://[::ffff:127.0.0.1]/", + ] { + assert!( + matches!( + is_safe_url(url, &allow).unwrap_err(), + UrlSafetyError::PrivateIp(_) + ), + "allow_any must still reject {url}" + ); + } + } + + #[test] + fn allow_any_still_blocks_localhost() { + let allow = DownloadAllowlist::allow_any(); + assert!(matches!( + is_safe_url("http://localhost:8080/", &allow).unwrap_err(), + UrlSafetyError::Loopback + )); + } + + #[test] + fn allow_any_still_blocks_non_http_schemes() { + let allow = DownloadAllowlist::allow_any(); + assert!(matches!( + is_safe_url("file:///etc/passwd", &allow).unwrap_err(), + UrlSafetyError::BadScheme(_) + )); + } + #[test] fn safe_url_allows_listed_host() { let allow = allow_just("cdn.example.com"); diff --git a/frontend/package.json b/frontend/package.json index 9019161..66eb934 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "mangalord-frontend", - "version": "0.43.1", + "version": "0.44.0", "private": true, "type": "module", "scripts": {