feat(crawler): CRAWLER_ALLOW_ANY_HOST bypasses the host allowlist (0.44.0)
Operators whose sources shard images across numbered CDN subdomains can't pre-enumerate every host in CRAWLER_DOWNLOAD_ALLOWLIST. The new flag short-circuits the host check in DownloadAllowlist::contains while leaving scheme, localhost, and private-IP defenses in is_safe_url untouched — scraped URLs pointing at 10.x / 169.254.169.254 / file:// stay refused. Default is false; fail-closed posture is preserved unless the operator opts in. Wired into both the server (config::build_download_allowlist) and the bin/crawler.rs one-shot. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -66,6 +66,12 @@ MAX_FILE_BYTES=20971520
|
||||
# to CRAWLER_START_URL's host and CRAWLER_CDN_HOST. Comma-separated.
|
||||
# Defends against SSRF via scraped <img src="http://10.0.0.1/...">.
|
||||
CRAWLER_DOWNLOAD_ALLOWLIST=
|
||||
# Bypass the host allowlist entirely. Intended for sources that shard
|
||||
# images across numbered CDN subdomains (cdn1/cdn2/…) where enumerating
|
||||
# every host upfront is impractical. The private-IP / localhost / non-
|
||||
# http(s) scheme defenses STAY ON — a scraped <img src="http://10.0.0.1/">
|
||||
# is still refused with this flag set.
|
||||
CRAWLER_ALLOW_ANY_HOST=false
|
||||
# Hard cap on a single image body. Default 32 MiB.
|
||||
CRAWLER_MAX_IMAGE_BYTES=33554432
|
||||
|
||||
|
||||
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||
|
||||
[[package]]
|
||||
name = "mangalord"
|
||||
version = "0.43.1"
|
||||
version = "0.44.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"argon2",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "mangalord"
|
||||
version = "0.43.1"
|
||||
version = "0.44.0"
|
||||
edition = "2021"
|
||||
default-run = "mangalord"
|
||||
|
||||
|
||||
@@ -226,24 +226,30 @@ async fn run(
|
||||
// SSRF defence: only download from the catalog host + CDN host
|
||||
// (plus optional CRAWLER_DOWNLOAD_ALLOWLIST extras), and cap
|
||||
// single-image downloads at CRAWLER_MAX_IMAGE_BYTES bytes.
|
||||
let mut allowlist =
|
||||
mangalord::crawler::safety::DownloadAllowlist::new();
|
||||
if let Ok(parsed) = reqwest::Url::parse(start_url) {
|
||||
if let Some(h) = parsed.host_str() {
|
||||
allowlist = allowlist.allow(h);
|
||||
}
|
||||
}
|
||||
if let Some(host) = cdn_host {
|
||||
allowlist = allowlist.allow(host);
|
||||
}
|
||||
if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") {
|
||||
for piece in extras.split(',') {
|
||||
let trimmed = piece.trim();
|
||||
if !trimmed.is_empty() {
|
||||
allowlist = allowlist.allow(trimmed);
|
||||
// CRAWLER_ALLOW_ANY_HOST=true short-circuits the host check for
|
||||
// sharded-CDN sources; private-IP and scheme guards still apply.
|
||||
let allowlist = if env_bool("CRAWLER_ALLOW_ANY_HOST", false) {
|
||||
mangalord::crawler::safety::DownloadAllowlist::allow_any()
|
||||
} else {
|
||||
let mut allow = mangalord::crawler::safety::DownloadAllowlist::new();
|
||||
if let Ok(parsed) = reqwest::Url::parse(start_url) {
|
||||
if let Some(h) = parsed.host_str() {
|
||||
allow = allow.allow(h);
|
||||
}
|
||||
}
|
||||
}
|
||||
if let Some(host) = cdn_host {
|
||||
allow = allow.allow(host);
|
||||
}
|
||||
if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") {
|
||||
for piece in extras.split(',') {
|
||||
let trimmed = piece.trim();
|
||||
if !trimmed.is_empty() {
|
||||
allow = allow.allow(trimmed);
|
||||
}
|
||||
}
|
||||
}
|
||||
allow
|
||||
};
|
||||
let max_image_bytes: usize = std::env::var("CRAWLER_MAX_IMAGE_BYTES")
|
||||
.ok()
|
||||
.and_then(|s| s.parse().ok())
|
||||
|
||||
@@ -248,10 +248,17 @@ impl CrawlerConfig {
|
||||
/// separated). Empty by default — meaning the crawler refuses to
|
||||
/// download anything when no source is configured, which is the safe
|
||||
/// fail-closed posture.
|
||||
///
|
||||
/// `CRAWLER_ALLOW_ANY_HOST=true` short-circuits the host enumeration
|
||||
/// for operators whose sources shard across numbered CDN subdomains.
|
||||
/// Scheme + private-IP defenses still apply.
|
||||
fn build_download_allowlist(
|
||||
start_url: Option<&str>,
|
||||
cdn_host: Option<&str>,
|
||||
) -> DownloadAllowlist {
|
||||
if env_bool("CRAWLER_ALLOW_ANY_HOST", false) {
|
||||
return DownloadAllowlist::allow_any();
|
||||
}
|
||||
let mut allow = DownloadAllowlist::new();
|
||||
if let Some(url) = start_url {
|
||||
if let Ok(parsed) = reqwest::Url::parse(url) {
|
||||
|
||||
@@ -47,14 +47,33 @@ pub const DEFAULT_MAX_IMAGE_BYTES: usize = 32 * 1024 * 1024;
|
||||
/// configured allowlist. None by default — keeping the surface area
|
||||
/// minimal so the only way a URL gets through is if it matches an
|
||||
/// explicit catalog/CDN entry.
|
||||
///
|
||||
/// `allow_any` flips the host check off entirely (private-IP and
|
||||
/// scheme checks still apply). It exists for operators whose sources
|
||||
/// shard images across numbered CDN subdomains (`cdn1`, `cdn2`, …)
|
||||
/// where enumerating each host upfront is impractical. Off by default.
|
||||
#[derive(Clone, Debug, Default)]
pub struct DownloadAllowlist {
    /// Hosts that pass the allowlist check. `contains` lowercases its
    /// argument and compares for exact equality, so entries are presumably
    /// stored lowercased — confirm against `allow`.
    hosts: Vec<String>,
    /// When `true`, `contains` accepts every host without consulting
    /// `hosts`. `false` in both `new()` and the derived `Default`.
    allow_any: bool,
}
|
||||
|
||||
impl DownloadAllowlist {
|
||||
pub fn new() -> Self {
|
||||
Self { hosts: Vec::new() }
|
||||
Self {
|
||||
hosts: Vec::new(),
|
||||
allow_any: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Bypass the host allowlist. Scheme, localhost, and private-IP
|
||||
/// checks in [`is_safe_url`] continue to apply — this only opens
|
||||
/// up public hosts that weren't pre-enumerated.
|
||||
pub fn allow_any() -> Self {
|
||||
Self {
|
||||
hosts: Vec::new(),
|
||||
allow_any: true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a host (case-insensitive match). Sub-domains are *not*
|
||||
@@ -73,6 +92,9 @@ impl DownloadAllowlist {
|
||||
}
|
||||
|
||||
pub fn contains(&self, host: &str) -> bool {
|
||||
if self.allow_any {
|
||||
return true;
|
||||
}
|
||||
let lower = host.to_ascii_lowercase();
|
||||
self.hosts.iter().any(|h| h == &lower)
|
||||
}
|
||||
@@ -245,6 +267,56 @@ mod tests {
|
||||
DownloadAllowlist::new().allow(host)
|
||||
}
|
||||
|
||||
#[test]
fn allow_any_admits_arbitrary_public_host() {
    // The bypass exists for numbered-CDN fleets (cdn1, cdn2, …) whose
    // hosts cannot be listed in advance, so any public host must pass.
    let bypass = DownloadAllowlist::allow_any();

    let sharded_cdn = is_safe_url("https://cdn7.random.tld/x.jpg", &bypass);
    assert!(sharded_cdn.is_ok());

    let unrelated_host = is_safe_url("https://anything-goes.example/", &bypass);
    assert!(unrelated_host.is_ok());
}
|
||||
|
||||
#[test]
fn allow_any_still_blocks_private_ips() {
    // The bypass covers only the host-allowlist check, not the SSRF
    // defence: private and loopback IP literals must stay refused.
    let allow = DownloadAllowlist::allow_any();
    for url in [
        "http://10.0.0.1/",
        "http://192.168.1.1/",
        "http://169.254.169.254/",
        "http://127.0.0.1/",
        "http://[::1]/",
        "http://[::ffff:127.0.0.1]/",
    ] {
        // Match on the `Result` itself rather than calling `unwrap_err()`:
        // if a URL is wrongly accepted, `unwrap_err()` would panic first and
        // the message naming the offending URL would never be printed.
        let outcome = is_safe_url(url, &allow);
        assert!(
            matches!(outcome, Err(UrlSafetyError::PrivateIp(_))),
            "allow_any must still reject {url}"
        );
    }
}
|
||||
|
||||
#[test]
fn allow_any_still_blocks_localhost() {
    // The `localhost` name must be refused as loopback even when the host
    // allowlist is bypassed.
    let bypass = DownloadAllowlist::allow_any();
    let rejection = is_safe_url("http://localhost:8080/", &bypass).unwrap_err();
    assert!(matches!(rejection, UrlSafetyError::Loopback));
}
|
||||
|
||||
#[test]
fn allow_any_still_blocks_non_http_schemes() {
    // A `file://` URL must be refused on scheme grounds even when the host
    // allowlist is bypassed.
    let bypass = DownloadAllowlist::allow_any();
    let rejection = is_safe_url("file:///etc/passwd", &bypass).unwrap_err();
    assert!(matches!(rejection, UrlSafetyError::BadScheme(_)));
}
|
||||
|
||||
#[test]
|
||||
fn safe_url_allows_listed_host() {
|
||||
let allow = allow_just("cdn.example.com");
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "mangalord-frontend",
|
||||
"version": "0.43.1",
|
||||
"version": "0.44.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
|
||||
Reference in New Issue
Block a user