feat(crawler): CRAWLER_ALLOW_ANY_HOST bypasses the host allowlist (0.44.0)
Operators whose sources shard images across numbered CDN subdomains can't pre-enumerate every host in CRAWLER_DOWNLOAD_ALLOWLIST. The new flag short-circuits the host check in DownloadAllowlist::contains while leaving the scheme, localhost, and private-IP defenses in is_safe_url untouched — scraped URLs pointing at 10.x / 169.254.169.254 / file:// stay refused. The default is false, so the fail-closed posture is preserved unless the operator opts in. Wired into both the server (config::build_download_allowlist) and the bin/crawler.rs one-shot.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -66,6 +66,12 @@ MAX_FILE_BYTES=20971520
|
|||||||
# to CRAWLER_START_URL's host and CRAWLER_CDN_HOST. Comma-separated.
|
# to CRAWLER_START_URL's host and CRAWLER_CDN_HOST. Comma-separated.
|
||||||
# Defends against SSRF via scraped <img src="http://10.0.0.1/...">.
|
# Defends against SSRF via scraped <img src="http://10.0.0.1/...">.
|
||||||
CRAWLER_DOWNLOAD_ALLOWLIST=
|
CRAWLER_DOWNLOAD_ALLOWLIST=
|
||||||
|
# Bypass the host allowlist entirely. Intended for sources that shard
|
||||||
|
# images across numbered CDN subdomains (cdn1/cdn2/…) where enumerating
|
||||||
|
# every host upfront is impractical. The private-IP / localhost / non-
|
||||||
|
# http(s) scheme defenses STAY ON — a scraped <img src="http://10.0.0.1/">
|
||||||
|
# is still refused with this flag set.
|
||||||
|
CRAWLER_ALLOW_ANY_HOST=false
|
||||||
# Hard cap on a single image body. Default 32 MiB.
|
# Hard cap on a single image body. Default 32 MiB.
|
||||||
CRAWLER_MAX_IMAGE_BYTES=33554432
|
CRAWLER_MAX_IMAGE_BYTES=33554432
|
||||||
|
|
||||||
|
|||||||
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.43.1"
|
version = "0.44.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"argon2",
|
"argon2",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.43.1"
|
version = "0.44.0"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
default-run = "mangalord"
|
default-run = "mangalord"
|
||||||
|
|
||||||
|
|||||||
@@ -226,24 +226,30 @@ async fn run(
|
|||||||
// SSRF defence: only download from the catalog host + CDN host
|
// SSRF defence: only download from the catalog host + CDN host
|
||||||
// (plus optional CRAWLER_DOWNLOAD_ALLOWLIST extras), and cap
|
// (plus optional CRAWLER_DOWNLOAD_ALLOWLIST extras), and cap
|
||||||
// single-image downloads at CRAWLER_MAX_IMAGE_BYTES bytes.
|
// single-image downloads at CRAWLER_MAX_IMAGE_BYTES bytes.
|
||||||
let mut allowlist =
|
// CRAWLER_ALLOW_ANY_HOST=true short-circuits the host check for
|
||||||
mangalord::crawler::safety::DownloadAllowlist::new();
|
// sharded-CDN sources; private-IP and scheme guards still apply.
|
||||||
|
let allowlist = if env_bool("CRAWLER_ALLOW_ANY_HOST", false) {
|
||||||
|
mangalord::crawler::safety::DownloadAllowlist::allow_any()
|
||||||
|
} else {
|
||||||
|
let mut allow = mangalord::crawler::safety::DownloadAllowlist::new();
|
||||||
if let Ok(parsed) = reqwest::Url::parse(start_url) {
|
if let Ok(parsed) = reqwest::Url::parse(start_url) {
|
||||||
if let Some(h) = parsed.host_str() {
|
if let Some(h) = parsed.host_str() {
|
||||||
allowlist = allowlist.allow(h);
|
allow = allow.allow(h);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if let Some(host) = cdn_host {
|
if let Some(host) = cdn_host {
|
||||||
allowlist = allowlist.allow(host);
|
allow = allow.allow(host);
|
||||||
}
|
}
|
||||||
if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") {
|
if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") {
|
||||||
for piece in extras.split(',') {
|
for piece in extras.split(',') {
|
||||||
let trimmed = piece.trim();
|
let trimmed = piece.trim();
|
||||||
if !trimmed.is_empty() {
|
if !trimmed.is_empty() {
|
||||||
allowlist = allowlist.allow(trimmed);
|
allow = allow.allow(trimmed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
allow
|
||||||
|
};
|
||||||
let max_image_bytes: usize = std::env::var("CRAWLER_MAX_IMAGE_BYTES")
|
let max_image_bytes: usize = std::env::var("CRAWLER_MAX_IMAGE_BYTES")
|
||||||
.ok()
|
.ok()
|
||||||
.and_then(|s| s.parse().ok())
|
.and_then(|s| s.parse().ok())
|
||||||
|
|||||||
@@ -248,10 +248,17 @@ impl CrawlerConfig {
|
|||||||
/// separated). Empty by default — meaning the crawler refuses to
|
/// separated). Empty by default — meaning the crawler refuses to
|
||||||
/// download anything when no source is configured, which is the safe
|
/// download anything when no source is configured, which is the safe
|
||||||
/// fail-closed posture.
|
/// fail-closed posture.
|
||||||
|
///
|
||||||
|
/// `CRAWLER_ALLOW_ANY_HOST=true` short-circuits the host enumeration
|
||||||
|
/// for operators whose sources shard across numbered CDN subdomains.
|
||||||
|
/// Scheme + private-IP defenses still apply.
|
||||||
fn build_download_allowlist(
|
fn build_download_allowlist(
|
||||||
start_url: Option<&str>,
|
start_url: Option<&str>,
|
||||||
cdn_host: Option<&str>,
|
cdn_host: Option<&str>,
|
||||||
) -> DownloadAllowlist {
|
) -> DownloadAllowlist {
|
||||||
|
if env_bool("CRAWLER_ALLOW_ANY_HOST", false) {
|
||||||
|
return DownloadAllowlist::allow_any();
|
||||||
|
}
|
||||||
let mut allow = DownloadAllowlist::new();
|
let mut allow = DownloadAllowlist::new();
|
||||||
if let Some(url) = start_url {
|
if let Some(url) = start_url {
|
||||||
if let Ok(parsed) = reqwest::Url::parse(url) {
|
if let Ok(parsed) = reqwest::Url::parse(url) {
|
||||||
|
|||||||
@@ -47,14 +47,33 @@ pub const DEFAULT_MAX_IMAGE_BYTES: usize = 32 * 1024 * 1024;
|
|||||||
/// configured allowlist. None by default — keeping the surface area
|
/// configured allowlist. None by default — keeping the surface area
|
||||||
/// minimal so the only way a URL gets through is if it matches an
|
/// minimal so the only way a URL gets through is if it matches an
|
||||||
/// explicit catalog/CDN entry.
|
/// explicit catalog/CDN entry.
|
||||||
|
///
|
||||||
|
/// `allow_any` flips the host check off entirely (private-IP and
|
||||||
|
/// scheme checks still apply). It exists for operators whose sources
|
||||||
|
/// shard images across numbered CDN subdomains (`cdn1`, `cdn2`, …)
|
||||||
|
/// where enumerating each host upfront is impractical. Off by default.
|
||||||
#[derive(Clone, Debug, Default)]
|
#[derive(Clone, Debug, Default)]
|
||||||
pub struct DownloadAllowlist {
|
pub struct DownloadAllowlist {
|
||||||
hosts: Vec<String>,
|
hosts: Vec<String>,
|
||||||
|
allow_any: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DownloadAllowlist {
|
impl DownloadAllowlist {
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self { hosts: Vec::new() }
|
Self {
|
||||||
|
hosts: Vec::new(),
|
||||||
|
allow_any: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Bypass the host allowlist. Scheme, localhost, and private-IP
|
||||||
|
/// checks in [`is_safe_url`] continue to apply — this only opens
|
||||||
|
/// up public hosts that weren't pre-enumerated.
|
||||||
|
pub fn allow_any() -> Self {
|
||||||
|
Self {
|
||||||
|
hosts: Vec::new(),
|
||||||
|
allow_any: true,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Add a host (case-insensitive match). Sub-domains are *not*
|
/// Add a host (case-insensitive match). Sub-domains are *not*
|
||||||
@@ -73,6 +92,9 @@ impl DownloadAllowlist {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn contains(&self, host: &str) -> bool {
|
pub fn contains(&self, host: &str) -> bool {
|
||||||
|
if self.allow_any {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
let lower = host.to_ascii_lowercase();
|
let lower = host.to_ascii_lowercase();
|
||||||
self.hosts.iter().any(|h| h == &lower)
|
self.hosts.iter().any(|h| h == &lower)
|
||||||
}
|
}
|
||||||
@@ -245,6 +267,56 @@ mod tests {
|
|||||||
DownloadAllowlist::new().allow(host)
|
DownloadAllowlist::new().allow(host)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn allow_any_admits_arbitrary_public_host() {
|
||||||
|
// Operators who can't pre-enumerate a numbered-CDN fleet
|
||||||
|
// (cdn1, cdn2, …) opt into allow_any. Any public host passes.
|
||||||
|
let allow = DownloadAllowlist::allow_any();
|
||||||
|
assert!(is_safe_url("https://cdn7.random.tld/x.jpg", &allow).is_ok());
|
||||||
|
assert!(is_safe_url("https://anything-goes.example/", &allow).is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn allow_any_still_blocks_private_ips() {
|
||||||
|
// The point of the bypass is the host-allowlist check, not the
|
||||||
|
// SSRF defense. Private/loopback IPs stay refused.
|
||||||
|
let allow = DownloadAllowlist::allow_any();
|
||||||
|
for url in [
|
||||||
|
"http://10.0.0.1/",
|
||||||
|
"http://192.168.1.1/",
|
||||||
|
"http://169.254.169.254/",
|
||||||
|
"http://127.0.0.1/",
|
||||||
|
"http://[::1]/",
|
||||||
|
"http://[::ffff:127.0.0.1]/",
|
||||||
|
] {
|
||||||
|
assert!(
|
||||||
|
matches!(
|
||||||
|
is_safe_url(url, &allow).unwrap_err(),
|
||||||
|
UrlSafetyError::PrivateIp(_)
|
||||||
|
),
|
||||||
|
"allow_any must still reject {url}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn allow_any_still_blocks_localhost() {
|
||||||
|
let allow = DownloadAllowlist::allow_any();
|
||||||
|
assert!(matches!(
|
||||||
|
is_safe_url("http://localhost:8080/", &allow).unwrap_err(),
|
||||||
|
UrlSafetyError::Loopback
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn allow_any_still_blocks_non_http_schemes() {
|
||||||
|
let allow = DownloadAllowlist::allow_any();
|
||||||
|
assert!(matches!(
|
||||||
|
is_safe_url("file:///etc/passwd", &allow).unwrap_err(),
|
||||||
|
UrlSafetyError::BadScheme(_)
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn safe_url_allows_listed_host() {
|
fn safe_url_allows_listed_host() {
|
||||||
let allow = allow_just("cdn.example.com");
|
let allow = allow_just("cdn.example.com");
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "mangalord-frontend",
|
"name": "mangalord-frontend",
|
||||||
"version": "0.43.1",
|
"version": "0.44.0",
|
||||||
"private": true,
|
"private": true,
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
|||||||
Reference in New Issue
Block a user