feat(crawler): CRAWLER_ALLOW_ANY_HOST bypasses the host allowlist (0.44.0)
Some checks failed
deploy / test-backend (push) Failing after 11s
deploy / test-frontend (push) Failing after 36s
deploy / build-and-push (push) Has been skipped
deploy / deploy (push) Has been skipped

Operators whose sources shard images across numbered CDN subdomains
can't pre-enumerate every host in CRAWLER_DOWNLOAD_ALLOWLIST. The new
flag short-circuits the host check in DownloadAllowlist::contains
while leaving scheme, localhost, and private-IP defenses in
is_safe_url untouched — scraped URLs pointing at 10.x /
169.254.169.254 / file:// stay refused. Default is false; fail-closed
posture is preserved unless the operator opts in. Wired into both the
server (config::build_download_allowlist) and the bin/crawler.rs
one-shot.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-31 14:52:49 +02:00
parent 1eebb90e25
commit a2826d6467
7 changed files with 111 additions and 20 deletions

View File

@@ -47,14 +47,33 @@ pub const DEFAULT_MAX_IMAGE_BYTES: usize = 32 * 1024 * 1024;
/// configured allowlist. None by default — keeping the surface area
/// minimal so the only way a URL gets through is if it matches an
/// explicit catalog/CDN entry.
///
/// `allow_any` flips the host check off entirely (private-IP and
/// scheme checks still apply). It exists for operators whose sources
/// shard images across numbered CDN subdomains (`cdn1`, `cdn2`, …)
/// where enumerating each host upfront is impractical. Off by default.
#[derive(Clone, Debug, Default)]
pub struct DownloadAllowlist {
/// Host entries consulted by `contains`. The candidate host is
/// ASCII-lowercased before an exact comparison, so an entry has to be
/// lowercase to ever match.
hosts: Vec<String>,
/// When `true`, `contains` answers `true` for every host without
/// looking at `hosts`.
allow_any: bool,
}
impl DownloadAllowlist {
/// Create an allowlist with no host entries and the any-host bypass
/// turned off.
pub fn new() -> Self {
// NOTE(review): the one-line initializer directly below looks like the
// line this diff removes, and the multi-line form after it like its
// replacement — confirm against the committed file.
Self { hosts: Vec::new() }
Self {
hosts: Vec::new(),
allow_any: false,
}
}
/// Bypass the host allowlist. Scheme, localhost, and private-IP
/// checks in [`is_safe_url`] continue to apply — this only opens
/// up public hosts that weren't pre-enumerated.
pub fn allow_any() -> Self {
Self {
hosts: Vec::new(),
allow_any: true,
}
}
/// Add a host (case-insensitive match). Sub-domains are *not*
@@ -73,6 +92,9 @@ impl DownloadAllowlist {
}
/// Report whether `host` passes the host check.
///
/// With the `allow_any` bypass set, every host passes. Otherwise
/// `host` is ASCII-lowercased and must equal one of the stored
/// entries exactly.
pub fn contains(&self, host: &str) -> bool {
    // `||` short-circuits, so the lowercase copy is only built when
    // the bypass is off.
    self.allow_any || self.hosts.contains(&host.to_ascii_lowercase())
}
@@ -245,6 +267,56 @@ mod tests {
DownloadAllowlist::new().allow(host)
}
#[test]
fn allow_any_admits_arbitrary_public_host() {
    // With the bypass enabled, public hosts that were never enumerated
    // (for example a numbered CDN fleet: cdn1, cdn2, …) must be accepted.
    let bypass = DownloadAllowlist::allow_any();
    let public_urls = [
        "https://cdn7.random.tld/x.jpg",
        "https://anything-goes.example/",
    ];
    for url in public_urls {
        assert!(is_safe_url(url, &bypass).is_ok());
    }
}
#[test]
fn allow_any_still_blocks_private_ips() {
    // The bypass only switches off the host-allowlist lookup. Every
    // address below must still be rejected as a private IP.
    let bypass = DownloadAllowlist::allow_any();
    let refused = [
        "http://10.0.0.1/",
        "http://192.168.1.1/",
        "http://169.254.169.254/",
        "http://127.0.0.1/",
        "http://[::1]/",
        "http://[::ffff:127.0.0.1]/",
    ];
    for url in refused {
        let err = is_safe_url(url, &bypass).unwrap_err();
        assert!(
            matches!(err, UrlSafetyError::PrivateIp(_)),
            "allow_any must still reject {url}"
        );
    }
}
#[test]
fn allow_any_still_blocks_localhost() {
    // The `localhost` name must stay refused even with the bypass on.
    let bypass = DownloadAllowlist::allow_any();
    let err = is_safe_url("http://localhost:8080/", &bypass).unwrap_err();
    assert!(matches!(err, UrlSafetyError::Loopback));
}
#[test]
fn allow_any_still_blocks_non_http_schemes() {
    // A `file://` URL must fail the scheme check regardless of the bypass.
    let bypass = DownloadAllowlist::allow_any();
    let err = is_safe_url("file:///etc/passwd", &bypass).unwrap_err();
    assert!(matches!(err, UrlSafetyError::BadScheme(_)));
}
#[test]
fn safe_url_allows_listed_host() {
let allow = allow_just("cdn.example.com");