feat(crawler): CRAWLER_ALLOW_ANY_HOST bypasses the host allowlist (0.44.0)

Operators whose sources shard images across numbered CDN subdomains can't pre-enumerate every host in CRAWLER_DOWNLOAD_ALLOWLIST. The new flag short-circuits the host check in DownloadAllowlist::contains while leaving scheme, localhost, and private-IP defenses in is_safe_url untouched — scraped URLs pointing at 10.x / 169.254.169.254 / file:// stay refused. Default is false; fail-closed posture is preserved unless the operator opts in. Wired into both the server (config::build_download_allowlist) and the bin/crawler.rs one-shot. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-31 14:52:49 +02:00
parent 1eebb90e25
commit a2826d6467
7 changed files with 111 additions and 20 deletions
--- a/backend/src/crawler/safety.rs
+++ b/backend/src/crawler/safety.rs
@@ -47,14 +47,33 @@ pub const DEFAULT_MAX_IMAGE_BYTES: usize = 32 * 1024 * 1024;
 /// configured allowlist. None by default — keeping the surface area
 /// minimal so the only way a URL gets through is if it matches an
 /// explicit catalog/CDN entry.
+///
+/// `allow_any` turns off only the host-allowlist check; the scheme,
+/// localhost, and private-IP checks still apply. It exists for operators
+/// whose sources shard images across numbered CDN subdomains (`cdn1`,
+/// `cdn2`, …) where enumerating each host upfront is impractical. Off by default.
 #[derive(Clone, Debug, Default)]
 pub struct DownloadAllowlist {
    hosts: Vec<String>,
+    allow_any: bool,
 }

 impl DownloadAllowlist {
    pub fn new() -> Self {
-        Self { hosts: Vec::new() }
+        Self {
+            hosts: Vec::new(),
+            allow_any: false,
+        }
+    }
+
+    /// Bypass the host allowlist. Scheme, localhost, and private-IP
+    /// checks in [`is_safe_url`] continue to apply — this only opens
+    /// up public hosts that weren't pre-enumerated.
+    pub fn allow_any() -> Self {
+        Self {
+            hosts: Vec::new(),
+            allow_any: true,
+        }
    }

    /// Add a host (case-insensitive match). Sub-domains are *not*
@@ -73,6 +92,9 @@ impl DownloadAllowlist {
    }

    pub fn contains(&self, host: &str) -> bool {
+        if self.allow_any {
+            return true;
+        }
        let lower = host.to_ascii_lowercase();
        self.hosts.iter().any(|h| h == &lower)
    }
@@ -245,6 +267,56 @@ mod tests {
        DownloadAllowlist::new().allow(host)
    }

+    #[test]
+    fn allow_any_admits_arbitrary_public_host() {
+        // Operators who can't pre-enumerate a numbered-CDN fleet
+        // (cdn1, cdn2, …) opt into allow_any. Any public host passes.
+        let allow = DownloadAllowlist::allow_any();
+        assert!(is_safe_url("https://cdn7.random.tld/x.jpg", &allow).is_ok());
+        assert!(is_safe_url("https://anything-goes.example/", &allow).is_ok());
+    }
+
+    #[test]
+    fn allow_any_still_blocks_private_ips() {
+        // The bypass disables only the host-allowlist check, not the
+        // SSRF defense. Private/loopback IPs stay refused.
+        let allow = DownloadAllowlist::allow_any();
+        for url in [
+            "http://10.0.0.1/",
+            "http://192.168.1.1/",
+            "http://169.254.169.254/",
+            "http://127.0.0.1/",
+            "http://[::1]/",
+            "http://[::ffff:127.0.0.1]/",
+        ] {
+            assert!(
+                matches!(
+                    is_safe_url(url, &allow).unwrap_err(),
+                    UrlSafetyError::PrivateIp(_)
+                ),
+                "allow_any must still reject {url}"
+            );
+        }
+    }
+
+    #[test]
+    fn allow_any_still_blocks_localhost() {
+        let allow = DownloadAllowlist::allow_any();
+        assert!(matches!(
+            is_safe_url("http://localhost:8080/", &allow).unwrap_err(),
+            UrlSafetyError::Loopback
+        ));
+    }
+
+    #[test]
+    fn allow_any_still_blocks_non_http_schemes() {
+        let allow = DownloadAllowlist::allow_any();
+        assert!(matches!(
+            is_safe_url("file:///etc/passwd", &allow).unwrap_err(),
+            UrlSafetyError::BadScheme(_)
+        ));
+    }
+
    #[test]
    fn safe_url_allows_listed_host() {
        let allow = allow_just("cdn.example.com");