diff --git a/.env.example b/.env.example
index b7da316..7be35eb 100644
--- a/.env.example
+++ b/.env.example
@@ -66,6 +66,12 @@ MAX_FILE_BYTES=20971520
# to CRAWLER_START_URL's host and CRAWLER_CDN_HOST. Comma-separated.
# Defends against SSRF via scraped image URLs that point at internal
# hosts (for example an img tag whose src is http://10.0.0.1/).
CRAWLER_DOWNLOAD_ALLOWLIST=
+# Bypass the host allowlist entirely. Intended for sources that shard
+# images across numbered CDN subdomains (cdn1/cdn2/…) where enumerating
+# every host upfront is impractical. The private-IP / localhost / non-
+# http(s) scheme defenses STAY ON — a scraped image URL such as
+# http://10.0.0.1/ is still refused with this flag set.
+CRAWLER_ALLOW_ANY_HOST=false
# Hard cap on a single image body. Default 32 MiB.
CRAWLER_MAX_IMAGE_BYTES=33554432
diff --git a/backend/Cargo.lock b/backend/Cargo.lock
index caf45c0..d41e0a2 100644
--- a/backend/Cargo.lock
+++ b/backend/Cargo.lock
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "mangalord"
-version = "0.43.1"
+version = "0.44.0"
dependencies = [
"anyhow",
"argon2",
diff --git a/backend/Cargo.toml b/backend/Cargo.toml
index 152dd80..33a8ddb 100644
--- a/backend/Cargo.toml
+++ b/backend/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "mangalord"
-version = "0.43.1"
+version = "0.44.0"
edition = "2021"
default-run = "mangalord"
diff --git a/backend/src/bin/crawler.rs b/backend/src/bin/crawler.rs
index 702e218..453a1d8 100644
--- a/backend/src/bin/crawler.rs
+++ b/backend/src/bin/crawler.rs
@@ -226,24 +226,30 @@ async fn run(
// SSRF defence: only download from the catalog host + CDN host
// (plus optional CRAWLER_DOWNLOAD_ALLOWLIST extras), and cap
// single-image downloads at CRAWLER_MAX_IMAGE_BYTES bytes.
- let mut allowlist =
- mangalord::crawler::safety::DownloadAllowlist::new();
- if let Ok(parsed) = reqwest::Url::parse(start_url) {
- if let Some(h) = parsed.host_str() {
- allowlist = allowlist.allow(h);
- }
- }
- if let Some(host) = cdn_host {
- allowlist = allowlist.allow(host);
- }
- if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") {
- for piece in extras.split(',') {
- let trimmed = piece.trim();
- if !trimmed.is_empty() {
- allowlist = allowlist.allow(trimmed);
+ // CRAWLER_ALLOW_ANY_HOST=true short-circuits the host check for
+ // sharded-CDN sources; private-IP and scheme guards still apply.
+ let allowlist = if env_bool("CRAWLER_ALLOW_ANY_HOST", false) {
+ mangalord::crawler::safety::DownloadAllowlist::allow_any()
+ } else {
+ let mut allow = mangalord::crawler::safety::DownloadAllowlist::new();
+ if let Ok(parsed) = reqwest::Url::parse(start_url) {
+ if let Some(h) = parsed.host_str() {
+ allow = allow.allow(h);
}
}
- }
+ if let Some(host) = cdn_host {
+ allow = allow.allow(host);
+ }
+ if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") {
+ for piece in extras.split(',') {
+ let trimmed = piece.trim();
+ if !trimmed.is_empty() {
+ allow = allow.allow(trimmed);
+ }
+ }
+ }
+ allow
+ };
let max_image_bytes: usize = std::env::var("CRAWLER_MAX_IMAGE_BYTES")
.ok()
.and_then(|s| s.parse().ok())
diff --git a/backend/src/config.rs b/backend/src/config.rs
index 183145f..e983d03 100644
--- a/backend/src/config.rs
+++ b/backend/src/config.rs
@@ -248,10 +248,17 @@ impl CrawlerConfig {
/// separated). Empty by default — meaning the crawler refuses to
/// download anything when no source is configured, which is the safe
/// fail-closed posture.
+///
+/// `CRAWLER_ALLOW_ANY_HOST=true` short-circuits the host enumeration
+/// for operators whose sources shard across numbered CDN subdomains.
+/// Scheme + private-IP defenses still apply.
fn build_download_allowlist(
start_url: Option<&str>,
cdn_host: Option<&str>,
) -> DownloadAllowlist {
+ if env_bool("CRAWLER_ALLOW_ANY_HOST", false) {
+ return DownloadAllowlist::allow_any();
+ }
let mut allow = DownloadAllowlist::new();
if let Some(url) = start_url {
if let Ok(parsed) = reqwest::Url::parse(url) {
diff --git a/backend/src/crawler/safety.rs b/backend/src/crawler/safety.rs
index d9c62ad..deb7f61 100644
--- a/backend/src/crawler/safety.rs
+++ b/backend/src/crawler/safety.rs
@@ -47,14 +47,33 @@ pub const DEFAULT_MAX_IMAGE_BYTES: usize = 32 * 1024 * 1024;
/// configured allowlist. None by default — keeping the surface area
/// minimal so the only way a URL gets through is if it matches an
/// explicit catalog/CDN entry.
+///
+/// `allow_any` flips the host check off entirely (private-IP and
+/// scheme checks still apply). It exists for operators whose sources
+/// shard images across numbered CDN subdomains (`cdn1`, `cdn2`, …)
+/// where enumerating each host upfront is impractical. Off by default.
#[derive(Clone, Debug, Default)]
pub struct DownloadAllowlist {
hosts: Vec,
+ allow_any: bool,
}
impl DownloadAllowlist {
pub fn new() -> Self {
-        Self { hosts: Vec::new() }
+        Self {
+            hosts: Vec::new(), // no hosts listed yet
+            allow_any: false, // host check stays enforced in `contains`
+        }
+    }
+
+    /// Bypass the host allowlist: [`contains`](Self::contains) accepts every host.
+    /// Scheme, localhost, and private-IP checks in [`is_safe_url`] continue to
+    /// apply — this only opens up public hosts that weren't pre-enumerated.
+    pub fn allow_any() -> Self {
+        Self {
+            hosts: Vec::new(), // left empty: `contains` returns before reading it in this mode
+            allow_any: true, // NOTE(review): verify is_safe_url also covers names resolving to private IPs
+        }
}
/// Add a host (case-insensitive match). Sub-domains are *not*
@@ -73,6 +92,9 @@ impl DownloadAllowlist {
}
pub fn contains(&self, host: &str) -> bool {
+        if self.allow_any { // bypass mode, set only by `allow_any()`
+            return true; // every host passes; the stored list is not consulted
+        }
let lower = host.to_ascii_lowercase();
self.hosts.iter().any(|h| h == &lower)
}
@@ -245,6 +267,56 @@ mod tests {
DownloadAllowlist::new().allow(host)
}
+    #[test]
+    fn allow_any_admits_arbitrary_public_host() {
+        // Operators who can't pre-enumerate a numbered-CDN fleet
+        // (cdn1, cdn2, …) opt into allow_any. Any public host passes.
+        let allow = DownloadAllowlist::allow_any();
+        assert!(is_safe_url("https://cdn7.random.tld/x.jpg", &allow).is_ok()); // host never added via allow()
+        assert!(is_safe_url("https://anything-goes.example/", &allow).is_ok()); // root path, no file name
+    }
+
+    #[test]
+    fn allow_any_still_blocks_private_ips() {
+        // The point of the bypass is the host-allowlist check, not the
+        // SSRF defense. Private/loopback IPs stay refused.
+        let allow = DownloadAllowlist::allow_any();
+        for url in [
+            "http://10.0.0.1/", // RFC 1918 private range 10.0.0.0/8
+            "http://192.168.1.1/", // RFC 1918 private range 192.168.0.0/16
+            "http://169.254.169.254/", // IPv4 link-local range 169.254.0.0/16
+            "http://127.0.0.1/", // IPv4 loopback literal
+            "http://[::1]/", // IPv6 loopback literal
+            "http://[::ffff:127.0.0.1]/", // IPv4-mapped IPv6 form of 127.0.0.1
+        ] {
+            assert!(
+                matches!(
+                    is_safe_url(url, &allow).unwrap_err(),
+                    UrlSafetyError::PrivateIp(_) // loopback *literals* are expected here too, not `Loopback`
+                ),
+                "allow_any must still reject {url}"
+            );
+        }
+    }
+
+    #[test]
+    fn allow_any_still_blocks_localhost() {
+        let allow = DownloadAllowlist::allow_any(); // host allowlist check disabled
+        assert!(matches!(
+            is_safe_url("http://localhost:8080/", &allow).unwrap_err(), // host name, not an IP literal
+            UrlSafetyError::Loopback // the name-based rejection must survive the bypass
+        ));
+    }
+
+    #[test]
+    fn allow_any_still_blocks_non_http_schemes() {
+        let allow = DownloadAllowlist::allow_any(); // host allowlist check disabled
+        assert!(matches!(
+            is_safe_url("file:///etc/passwd", &allow).unwrap_err(), // scheme is neither http nor https
+            UrlSafetyError::BadScheme(_) // the scheme guard must survive the bypass
+        ));
+    }
+
#[test]
fn safe_url_allows_listed_host() {
let allow = allow_just("cdn.example.com");
diff --git a/frontend/package.json b/frontend/package.json
index 9019161..66eb934 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,6 +1,6 @@
{
"name": "mangalord-frontend",
- "version": "0.43.1",
+ "version": "0.44.0",
"private": true,
"type": "module",
"scripts": {