//! Defensive helpers for the image-download paths.
//!
//! Two threats this module addresses:
//!
//! - **SSRF**: a scraped chapter or manga page can embed an absolute
//!   image URL such as `<img src="http://postgres:5432/">`. The crawler
//!   runs inside the backend container with intra-compose access to
//!   `postgres:5432` and possibly other internal services; without a
//!   host check the crawler would happily probe them. [`is_safe_url`]
//!   rejects anything whose host isn't on the operator-configured
//!   allowlist, plus any IP literal in RFC1918 / loopback / link-local /
//!   unique-local space (including IPv4-mapped IPv6 like
//!   `::ffff:127.0.0.1`) as a second defence that holds even when the
//!   literal itself was allowlisted or the allowlist is bypassed with
//!   `allow_any`.
//!
//!   **DNS rebinding is not covered.** A hostname like `cdn.allowed.com`
//!   that *resolves* to `127.0.0.1` via hostile DNS bypasses the IP
//!   check entirely — `is_safe_url` only inspects URL strings, not
//!   resolved IPs. Mitigating that requires a custom reqwest resolver
//!   that filters IPs after DNS, which would mean rebuilding reqwest's
//!   connector. The allowlist + good operator DNS hygiene is the
//!   realistic mitigation today.
//!
//! - **Unbounded download**: `Response::bytes().await` reads the full
//!   body before returning. A malicious source serving a 10 GiB image
//!   would fill memory and then disk. [`accumulate_capped`] streams
//!   the body chunk-by-chunk into a [`bytes::BytesMut`] and bails as
//!   soon as the running total exceeds the cap.
//!
//! Both helpers are pure-data: the SSRF check is keyed off a parsed
//! URL string, and the byte accumulator is keyed off a generic stream.
//! Easy to unit-test without a live network or browser.

use std::net::IpAddr;

use anyhow::{bail, Context};
use bytes::BytesMut;
use futures_util::StreamExt;
use reqwest::Url;

/// Default per-image download cap. A page image is generally <2 MiB;
/// 32 MiB leaves headroom for high-resolution covers while still
/// stopping a misbehaving CDN dead.
Override via `CRAWLER_MAX_IMAGE_BYTES`. pub const DEFAULT_MAX_IMAGE_BYTES: usize = 32 * 1024 * 1024; /// Hosts that are always allowed in addition to the operator's /// configured allowlist. None by default — keeping the surface area /// minimal so the only way a URL gets through is if it matches an /// explicit catalog/CDN entry. /// /// `allow_any` flips the host check off entirely (private-IP and /// scheme checks still apply). It exists for operators whose sources /// shard images across numbered CDN subdomains (`cdn1`, `cdn2`, …) /// where enumerating each host upfront is impractical. Off by default. #[derive(Clone, Debug, Default)] pub struct DownloadAllowlist { hosts: Vec, allow_any: bool, } impl DownloadAllowlist { pub fn new() -> Self { Self { hosts: Vec::new(), allow_any: false, } } /// Bypass the host allowlist. Scheme, localhost, and private-IP /// checks in [`is_safe_url`] continue to apply — this only opens /// up public hosts that weren't pre-enumerated. pub fn allow_any() -> Self { Self { hosts: Vec::new(), allow_any: true, } } /// Add a host (case-insensitive match). Sub-domains are *not* /// implied: pass `cdn.example.com` and `example.com` separately /// if both should be reachable. pub fn allow(mut self, host: impl Into) -> Self { let h = host.into().to_ascii_lowercase(); if !h.is_empty() && !self.hosts.iter().any(|existing| existing == &h) { self.hosts.push(h); } self } pub fn is_empty(&self) -> bool { self.hosts.is_empty() } pub fn contains(&self, host: &str) -> bool { if self.allow_any { return true; } let lower = host.to_ascii_lowercase(); self.hosts.iter().any(|h| h == &lower) } } /// Verify a URL is safe for the crawler to fetch. 
/// /// Rejects: /// - non-http(s) schemes (file://, gopher://, …), /// - any IP literal in private / loopback / link-local / unique-local /// space (defense in depth — a DNS allowlist alone wouldn't cover an /// attacker that places an entry like `cdn.evil` pointing at /// `192.168.1.1`), /// - the literal hostname `localhost`, /// - hosts that aren't on the supplied allowlist. /// /// An empty allowlist rejects everything (the conservative default — /// callers must explicitly allow the catalog and CDN hosts). pub fn is_safe_url(raw_url: &str, allow: &DownloadAllowlist) -> Result<(), UrlSafetyError> { let url = Url::parse(raw_url).map_err(|_| UrlSafetyError::Unparseable)?; let scheme = url.scheme(); if scheme != "http" && scheme != "https" { return Err(UrlSafetyError::BadScheme(scheme.to_string())); } let host = url.host_str().ok_or(UrlSafetyError::NoHost)?; let lower_host = host.to_ascii_lowercase(); if lower_host == "localhost" { return Err(UrlSafetyError::Loopback); } // Reject IP literals in private/loopback ranges regardless of the // allowlist — if someone puts an IP literal on the allowlist they // almost certainly didn't mean a private range. // reqwest::Url normalises IPv6 literals as `[::1]` (brackets // included) in `host_str()`. Strip the brackets before parsing. 
let ip_candidate = lower_host .strip_prefix('[') .and_then(|s| s.strip_suffix(']')) .unwrap_or(&lower_host); if let Ok(ip) = ip_candidate.parse::() { if is_private_ip(&ip) { return Err(UrlSafetyError::PrivateIp(ip)); } } if !allow.contains(&lower_host) { return Err(UrlSafetyError::HostNotAllowed(lower_host)); } Ok(()) } fn is_private_ip(ip: &IpAddr) -> bool { match ip { IpAddr::V4(v4) => { v4.is_loopback() || v4.is_private() || v4.is_link_local() || v4.is_unspecified() || v4.is_broadcast() // CGNAT 100.64.0.0/10 || (v4.octets()[0] == 100 && (v4.octets()[1] & 0xC0) == 64) // 169.254/16 link-local already covered, but 0.0.0.0/8 is special-use || v4.octets()[0] == 0 } IpAddr::V6(v6) => { // IPv4-mapped IPv6 (::ffff:0:0/96): unwrap to the embedded // IPv4 and recurse so `::ffff:127.0.0.1` is caught by the // IPv4 loopback check rather than passing through. // `Ipv6Addr::is_loopback()` only matches `::1` exactly. if let Some(v4) = v6.to_ipv4_mapped() { return is_private_ip(&IpAddr::V4(v4)); } v6.is_loopback() || v6.is_unspecified() // fc00::/7 unique-local || (v6.segments()[0] & 0xfe00) == 0xfc00 // fe80::/10 link-local || (v6.segments()[0] & 0xffc0) == 0xfe80 } } } #[derive(Debug, thiserror::Error, PartialEq, Eq)] pub enum UrlSafetyError { #[error("URL is not parseable")] Unparseable, #[error("scheme {0:?} is not http or https")] BadScheme(String), #[error("URL is missing a host")] NoHost, #[error("host points at the loopback interface")] Loopback, #[error("host is a private/internal IP: {0}")] PrivateIp(IpAddr), #[error("host {0:?} is not on the crawler download allowlist")] HostNotAllowed(String), } /// Drain a byte stream into a single buffer, bailing out as soon as /// the running total exceeds `max_bytes`. Generic over the stream so /// it's testable without a live HTTP response. 
pub async fn accumulate_capped(stream: S, max_bytes: usize) -> anyhow::Result where S: futures_core::Stream>, E: std::error::Error + Send + Sync + 'static, { let mut buf = BytesMut::new(); let mut stream = std::pin::pin!(stream); while let Some(chunk) = stream.next().await { let chunk = chunk.map_err(|e| anyhow::anyhow!("stream chunk: {e}"))?; if buf.len().saturating_add(chunk.len()) > max_bytes { bail!( "response exceeds {max_bytes}-byte cap (received >{}+{})", buf.len(), chunk.len() ); } buf.extend_from_slice(&chunk); } Ok(buf.freeze()) } /// Send `req` and stream the response into a length-limited buffer. /// Combines [`is_safe_url`] check + [`accumulate_capped`] so each /// call-site is one line. pub async fn fetch_bytes_capped( http: &reqwest::Client, url: &str, referer: Option<&str>, allow: &DownloadAllowlist, max_bytes: usize, ) -> anyhow::Result { is_safe_url(url, allow).with_context(|| format!("reject unsafe URL {url}"))?; let mut req = http.get(url); if let Some(r) = referer { req = req.header(reqwest::header::REFERER, r); } let resp = req .send() .await .with_context(|| format!("GET {url}"))? .error_for_status() .with_context(|| format!("non-2xx for {url}"))?; accumulate_capped(resp.bytes_stream(), max_bytes) .await .with_context(|| format!("download body for {url}")) } /// True when `bytes` sniffs as one of the *renderable* image formats /// the `/files/*key` endpoint can serve with a correct Content-Type: /// JPEG, PNG, WebP, GIF, AVIF. Matches the upload pipeline's /// whitelist in `upload::parse_image`. /// /// `infer::MatcherType::Image` is intentionally NOT used — it also /// matches BMP, TIFF, HEIF, ICO, PSD, and JP2. Those would sniff as /// "image" here but [`api::files::content_type_for`] would fall back /// to `application/octet-stream`, prompting browsers to download /// instead of render. Keep the two layers aligned. 
pub fn looks_like_image(bytes: &[u8]) -> bool { matches!( infer::get(bytes).map(|k| k.mime_type()), Some("image/jpeg" | "image/png" | "image/webp" | "image/gif" | "image/avif") ) } #[cfg(test)] mod tests { use super::*; use futures_util::stream; fn allow_just(host: &str) -> DownloadAllowlist { DownloadAllowlist::new().allow(host) } #[test] fn allow_any_admits_arbitrary_public_host() { // Operators who can't pre-enumerate a numbered-CDN fleet // (cdn1, cdn2, …) opt into allow_any. Any public host passes. let allow = DownloadAllowlist::allow_any(); assert!(is_safe_url("https://cdn7.random.tld/x.jpg", &allow).is_ok()); assert!(is_safe_url("https://anything-goes.example/", &allow).is_ok()); } #[test] fn allow_any_still_blocks_private_ips() { // The point of the bypass is the host-allowlist check, not the // SSRF defense. Private/loopback IPs stay refused. let allow = DownloadAllowlist::allow_any(); for url in [ "http://10.0.0.1/", "http://192.168.1.1/", "http://169.254.169.254/", "http://127.0.0.1/", "http://[::1]/", "http://[::ffff:127.0.0.1]/", ] { assert!( matches!( is_safe_url(url, &allow).unwrap_err(), UrlSafetyError::PrivateIp(_) ), "allow_any must still reject {url}" ); } } #[test] fn allow_any_still_blocks_localhost() { let allow = DownloadAllowlist::allow_any(); assert!(matches!( is_safe_url("http://localhost:8080/", &allow).unwrap_err(), UrlSafetyError::Loopback )); } #[test] fn allow_any_still_blocks_non_http_schemes() { let allow = DownloadAllowlist::allow_any(); assert!(matches!( is_safe_url("file:///etc/passwd", &allow).unwrap_err(), UrlSafetyError::BadScheme(_) )); } #[test] fn safe_url_allows_listed_host() { let allow = allow_just("cdn.example.com"); assert!(is_safe_url("https://cdn.example.com/img.jpg", &allow).is_ok()); } #[test] fn safe_url_blocks_unlisted_host() { let allow = allow_just("cdn.example.com"); let err = is_safe_url("https://evil.example.org/img.jpg", &allow).unwrap_err(); assert!(matches!(err, UrlSafetyError::HostNotAllowed(h) if h == 
"evil.example.org")); } #[test] fn safe_url_blocks_localhost_even_if_allowlisted() { let allow = allow_just("localhost"); assert!(matches!( is_safe_url("http://localhost:8080/", &allow).unwrap_err(), UrlSafetyError::Loopback )); } #[test] fn safe_url_blocks_loopback_ipv4() { let allow = allow_just("127.0.0.1"); assert!(matches!( is_safe_url("http://127.0.0.1/", &allow).unwrap_err(), UrlSafetyError::PrivateIp(_) )); } #[test] fn safe_url_blocks_rfc1918() { let allow = allow_just("10.0.0.1"); for url in [ "http://10.0.0.1/", "http://192.168.1.1/", "http://172.16.0.5/", "http://172.31.255.255/", ] { assert!( matches!( is_safe_url(url, &allow).unwrap_err(), UrlSafetyError::PrivateIp(_) ), "should reject {url}" ); } } #[test] fn safe_url_blocks_link_local() { let allow = allow_just("169.254.169.254"); // 169.254.169.254 is the AWS/GCP metadata service — the most // dangerous SSRF target on a default cloud VM. assert!(matches!( is_safe_url("http://169.254.169.254/", &allow).unwrap_err(), UrlSafetyError::PrivateIp(_) )); } #[test] fn safe_url_blocks_ipv6_loopback_and_ula() { // Debug what host_str returns first — reqwest::Url normalises // IPv6 literals as `[::1]` with brackets, which doesn't parse // as `IpAddr` directly. The implementation strips them. let allow = allow_just("[::1]"); let err = is_safe_url("http://[::1]/", &allow).unwrap_err(); assert!( matches!(err, UrlSafetyError::PrivateIp(_)), "expected PrivateIp, got {err:?}" ); let allow = allow_just("[fd00::1]"); let err = is_safe_url("http://[fd00::1]/", &allow).unwrap_err(); assert!( matches!(err, UrlSafetyError::PrivateIp(_)), "expected PrivateIp, got {err:?}" ); } #[test] fn safe_url_blocks_ipv4_mapped_ipv6_loopback() { // `Ipv6Addr::is_loopback()` only matches `::1` exactly, so // `::ffff:127.0.0.1` would slip through without the // to_ipv4_mapped() unwrap in is_private_ip. 
let allow = allow_just("[::ffff:127.0.0.1]"); let err = is_safe_url("http://[::ffff:127.0.0.1]/", &allow).unwrap_err(); assert!( matches!(err, UrlSafetyError::PrivateIp(_)), "expected PrivateIp, got {err:?}" ); } #[test] fn safe_url_blocks_ipv4_mapped_ipv6_rfc1918() { let allow = allow_just("[::ffff:10.0.0.1]"); let err = is_safe_url("http://[::ffff:10.0.0.1]/", &allow).unwrap_err(); assert!(matches!(err, UrlSafetyError::PrivateIp(_))); } #[test] fn safe_url_blocks_non_http_schemes() { let allow = allow_just("anywhere"); assert!(matches!( is_safe_url("file:///etc/passwd", &allow).unwrap_err(), UrlSafetyError::BadScheme(_) )); assert!(matches!( is_safe_url("gopher://anywhere:70/", &allow).unwrap_err(), UrlSafetyError::BadScheme(_) )); } #[test] fn safe_url_rejects_unparseable() { let allow = allow_just("anywhere"); assert!(matches!( is_safe_url("not a url", &allow).unwrap_err(), UrlSafetyError::Unparseable )); } #[test] fn safe_url_empty_allowlist_rejects_everything() { let allow = DownloadAllowlist::new(); let err = is_safe_url("https://cdn.example.com/img.jpg", &allow).unwrap_err(); assert!(matches!(err, UrlSafetyError::HostNotAllowed(_))); } #[test] fn allowlist_matches_case_insensitively() { let allow = DownloadAllowlist::new().allow("CDN.Example.COM"); assert!(is_safe_url("https://cdn.example.com/x.jpg", &allow).is_ok()); assert!(is_safe_url("https://CDN.EXAMPLE.com/x.jpg", &allow).is_ok()); } #[tokio::test] async fn accumulate_capped_returns_full_body_under_cap() { let chunks: Vec> = vec![ Ok(bytes::Bytes::from_static(b"hello ")), Ok(bytes::Bytes::from_static(b"world")), ]; let s = stream::iter(chunks); let out = accumulate_capped(s, 100).await.unwrap(); assert_eq!(out.as_ref(), b"hello world"); } #[tokio::test] async fn accumulate_capped_bails_past_cap() { let chunks: Vec> = vec![ Ok(bytes::Bytes::from(vec![0u8; 50])), Ok(bytes::Bytes::from(vec![0u8; 60])), ]; let s = stream::iter(chunks); let err = accumulate_capped(s, 100).await.unwrap_err(); 
assert!(err.to_string().contains("100-byte cap")); } #[tokio::test] async fn accumulate_capped_surfaces_stream_errors() { let chunks: Vec> = vec![ Ok(bytes::Bytes::from_static(b"ok")), Err(std::io::Error::other("network blip")), ]; let s = stream::iter(chunks); let err = accumulate_capped(s, 100).await.unwrap_err(); assert!(err.to_string().contains("network blip")); } #[test] fn looks_like_image_accepts_jpeg() { // JPEG SOI + APP0 segment. let jpeg = [0xff, 0xd8, 0xff, 0xe0, 0, 0x10, b'J', b'F', b'I', b'F']; assert!(looks_like_image(&jpeg)); } #[test] fn looks_like_image_accepts_png() { let png = [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0, 0, 0, 0]; assert!(looks_like_image(&png)); } #[test] fn looks_like_image_rejects_html_disguised_as_image() { let html = b"not an image"; assert!(!looks_like_image(html)); } #[test] fn looks_like_image_rejects_empty() { assert!(!looks_like_image(&[])); } #[test] fn looks_like_image_rejects_renderable_but_unsupported_formats() { // BMP, TIFF, ICO, PSD are `infer::MatcherType::Image` but the // /files/*key handler doesn't have Content-Type mappings for // them, so they'd be served as application/octet-stream and // download instead of render. Reject at the crawler so we // never land them in storage. // BMP magic: "BM" + 4-byte size. let bmp = [b'B', b'M', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; assert!(!looks_like_image(&bmp), "BMP must be rejected (not renderable by /files)"); // TIFF little-endian magic: "II" + 42. let tiff = [0x49, 0x49, 0x2a, 0x00, 0, 0, 0, 0]; assert!(!looks_like_image(&tiff), "TIFF must be rejected"); // ICO magic: 0x00,0x00,0x01,0x00. let ico = [0x00, 0x00, 0x01, 0x00, 1, 0, 16, 16, 0, 0, 1, 0, 0x18, 0, 0x40, 0, 0, 0, 0x16, 0, 0, 0]; assert!(!looks_like_image(&ico), "ICO must be rejected"); } #[test] fn looks_like_image_accepts_webp_gif_avif() { // Cover the three remaining whitelisted formats so a future // tightening that drops one would fail noisily. 
let webp = [ b'R', b'I', b'F', b'F', 0, 0, 0, 0, b'W', b'E', b'B', b'P', b'V', b'P', b'8', b' ', ]; assert!(looks_like_image(&webp)); let gif = [b'G', b'I', b'F', b'8', b'7', b'a', 0, 0, 0, 0]; assert!(looks_like_image(&gif)); let avif = [ 0x00, 0x00, 0x00, 0x18, b'f', b't', b'y', b'p', b'a', b'v', b'i', b'f', 0x00, 0x00, 0x00, 0x00, b'm', b'i', b'f', b'1', b'a', b'v', b'i', b'f', ]; assert!(looks_like_image(&avif)); } }