feat(crawler): CRAWLER_ALLOW_ANY_HOST bypasses the host allowlist (0.44.0)
Some checks failed
deploy / test-backend (push) Failing after 11s
deploy / test-frontend (push) Failing after 36s
deploy / build-and-push (push) Has been skipped
deploy / deploy (push) Has been skipped

Operators whose sources shard images across numbered CDN subdomains
can't pre-enumerate every host in CRAWLER_DOWNLOAD_ALLOWLIST. The new
flag short-circuits the host check in DownloadAllowlist::contains
while leaving scheme, localhost, and private-IP defenses in
is_safe_url untouched — scraped URLs pointing at 10.x /
169.254.169.254 / file:// stay refused. Default is false; fail-closed
posture is preserved unless the operator opts in. Wired into both the
server (config::build_download_allowlist) and the bin/crawler.rs
one-shot.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-31 14:52:49 +02:00
parent 1eebb90e25
commit a2826d6467
7 changed files with 111 additions and 20 deletions

View File

@@ -66,6 +66,12 @@ MAX_FILE_BYTES=20971520
# to CRAWLER_START_URL's host and CRAWLER_CDN_HOST. Comma-separated.
# Defends against SSRF via scraped <img src="http://10.0.0.1/...">.
CRAWLER_DOWNLOAD_ALLOWLIST=
# Bypass the host allowlist entirely. Intended for sources that shard
# images across numbered CDN subdomains (cdn1/cdn2/…) where enumerating
# every host upfront is impractical. The private-IP, localhost, and
# non-http(s)-scheme defenses STAY ON — a scraped
# <img src="http://10.0.0.1/"> is still refused with this flag set.
CRAWLER_ALLOW_ANY_HOST=false
# Hard cap on a single image body. Default 32 MiB.
CRAWLER_MAX_IMAGE_BYTES=33554432

2
backend/Cargo.lock generated
View File

@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "mangalord"
version = "0.43.1"
version = "0.44.0"
dependencies = [
"anyhow",
"argon2",

View File

@@ -1,6 +1,6 @@
[package]
name = "mangalord"
version = "0.43.1"
version = "0.44.0"
edition = "2021"
default-run = "mangalord"

View File

@@ -226,24 +226,30 @@ async fn run(
// SSRF defence: only download from the catalog host + CDN host
// (plus optional CRAWLER_DOWNLOAD_ALLOWLIST extras), and cap
// single-image downloads at CRAWLER_MAX_IMAGE_BYTES bytes.
let mut allowlist =
mangalord::crawler::safety::DownloadAllowlist::new();
// CRAWLER_ALLOW_ANY_HOST=true short-circuits the host check for
// sharded-CDN sources; private-IP and scheme guards still apply.
let allowlist = if env_bool("CRAWLER_ALLOW_ANY_HOST", false) {
mangalord::crawler::safety::DownloadAllowlist::allow_any()
} else {
let mut allow = mangalord::crawler::safety::DownloadAllowlist::new();
if let Ok(parsed) = reqwest::Url::parse(start_url) {
if let Some(h) = parsed.host_str() {
allowlist = allowlist.allow(h);
allow = allow.allow(h);
}
}
if let Some(host) = cdn_host {
allowlist = allowlist.allow(host);
allow = allow.allow(host);
}
if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") {
for piece in extras.split(',') {
let trimmed = piece.trim();
if !trimmed.is_empty() {
allowlist = allowlist.allow(trimmed);
allow = allow.allow(trimmed);
}
}
}
allow
};
let max_image_bytes: usize = std::env::var("CRAWLER_MAX_IMAGE_BYTES")
.ok()
.and_then(|s| s.parse().ok())

View File

@@ -248,10 +248,17 @@ impl CrawlerConfig {
/// separated). Empty by default — meaning the crawler refuses to
/// download anything when no source is configured, which is the safe
/// fail-closed posture.
///
/// `CRAWLER_ALLOW_ANY_HOST=true` short-circuits the host enumeration
/// for operators whose sources shard across numbered CDN subdomains.
/// Scheme + private-IP defenses still apply.
fn build_download_allowlist(
start_url: Option<&str>,
cdn_host: Option<&str>,
) -> DownloadAllowlist {
if env_bool("CRAWLER_ALLOW_ANY_HOST", false) {
return DownloadAllowlist::allow_any();
}
let mut allow = DownloadAllowlist::new();
if let Some(url) = start_url {
if let Ok(parsed) = reqwest::Url::parse(url) {

View File

@@ -47,14 +47,33 @@ pub const DEFAULT_MAX_IMAGE_BYTES: usize = 32 * 1024 * 1024;
/// configured allowlist. None by default — keeping the surface area
/// minimal so the only way a URL gets through is if it matches an
/// explicit catalog/CDN entry.
///
/// `allow_any` flips the host check off entirely (scheme, localhost,
/// and private-IP checks still apply). It exists for operators whose
/// sources shard images across numbered CDN subdomains (`cdn1`,
/// `cdn2`, …) where enumerating each host upfront is impractical.
/// Off by default.
#[derive(Clone, Debug, Default)]
pub struct DownloadAllowlist {
/// Host names admitted by `contains`, which compares each entry for
/// exact equality against the ASCII-lowercased candidate host.
hosts: Vec<String>,
/// When `true`, `contains` returns `true` for every host without
/// consulting `hosts`. The derived `Default` leaves this `false`.
allow_any: bool,
}
impl DownloadAllowlist {
pub fn new() -> Self {
Self { hosts: Vec::new() }
Self {
hosts: Vec::new(),
allow_any: false,
}
}
/// Bypass the host allowlist. Scheme, localhost, and private-IP
/// checks in [`is_safe_url`] continue to apply — this only opens
/// up public hosts that weren't pre-enumerated.
pub fn allow_any() -> Self {
Self {
hosts: Vec::new(),
allow_any: true,
}
}
/// Add a host (case-insensitive match). Sub-domains are *not*
@@ -73,6 +92,9 @@ impl DownloadAllowlist {
}
/// Report whether `host` passes the host check.
///
/// Always `true` when the bypass flag is set. Otherwise `host` is
/// ASCII-lowercased and must equal one of the stored entries exactly.
pub fn contains(&self, host: &str) -> bool {
    self.allow_any || {
        let needle = host.to_ascii_lowercase();
        self.hosts.contains(&needle)
    }
}
@@ -245,6 +267,56 @@ mod tests {
DownloadAllowlist::new().allow(host)
}
#[test]
fn allow_any_admits_arbitrary_public_host() {
    // With the bypass enabled, public hosts that were never enumerated
    // (e.g. one shard of a numbered CDN fleet) must be accepted.
    let open = DownloadAllowlist::allow_any();

    let sharded_cdn = is_safe_url("https://cdn7.random.tld/x.jpg", &open);
    assert!(sharded_cdn.is_ok());

    let unrelated_host = is_safe_url("https://anything-goes.example/", &open);
    assert!(unrelated_host.is_ok());
}
#[test]
fn allow_any_still_blocks_private_ips() {
    // The bypass waives the host-allowlist lookup only; it is not meant
    // to weaken the SSRF defense, so private and loopback IP literals
    // must still be refused.
    let open = DownloadAllowlist::allow_any();
    let refused = [
        "http://10.0.0.1/",
        "http://192.168.1.1/",
        "http://169.254.169.254/",
        "http://127.0.0.1/",
        "http://[::1]/",
        "http://[::ffff:127.0.0.1]/",
    ];
    for url in refused {
        let err = is_safe_url(url, &open).unwrap_err();
        assert!(
            matches!(err, UrlSafetyError::PrivateIp(_)),
            "allow_any must still reject {url}"
        );
    }
}
#[test]
fn allow_any_still_blocks_localhost() {
    // The `localhost` name must keep failing with the loopback error
    // even when the host allowlist is bypassed.
    let open = DownloadAllowlist::allow_any();
    let err = is_safe_url("http://localhost:8080/", &open).unwrap_err();
    assert!(matches!(err, UrlSafetyError::Loopback));
}
#[test]
fn allow_any_still_blocks_non_http_schemes() {
    // A `file://` URL must keep failing the scheme check even when the
    // host allowlist is bypassed.
    let open = DownloadAllowlist::allow_any();
    let err = is_safe_url("file:///etc/passwd", &open).unwrap_err();
    assert!(matches!(err, UrlSafetyError::BadScheme(_)));
}
#[test]
fn safe_url_allows_listed_host() {
let allow = allow_just("cdn.example.com");

View File

@@ -1,6 +1,6 @@
{
"name": "mangalord-frontend",
"version": "0.43.1",
"version": "0.44.0",
"private": true,
"type": "module",
"scripts": {