diff --git a/.env.example b/.env.example index 6cfa300..db42701 100644 --- a/.env.example +++ b/.env.example @@ -44,6 +44,14 @@ MAX_REQUEST_BYTES=209715200 # Default 20 MiB. MAX_FILE_BYTES=20971520 +# ----- Crawler download safety ----- +# Hosts the crawler is allowed to fetch images/covers from, in addition +# to CRAWLER_START_URL's host and CRAWLER_CDN_HOST. Comma-separated. +# Defends against SSRF via scraped <img src> URLs. +CRAWLER_DOWNLOAD_ALLOWLIST= +# Hard cap on a single image body. Default 32 MiB. +CRAWLER_MAX_IMAGE_BYTES=33554432 + # ----- Frontend ----- # The frontend container runs SvelteKit's Node adapter on :3000 and # proxies /api/* to BACKEND_URL via src/hooks.server.ts. In compose the diff --git a/backend/Cargo.lock b/backend/Cargo.lock index 5642fcc..7bbdd81 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.34.0" +version = "0.34.1" dependencies = [ "anyhow", "argon2", @@ -2324,6 +2324,7 @@ dependencies = [ "cookie", "cookie_store", "futures-core", + "futures-util", "http", "http-body", "http-body-util", @@ -2343,12 +2344,14 @@ dependencies = [ "sync_wrapper", "tokio", "tokio-rustls", + "tokio-util", "tower", "tower-http", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", + "wasm-streams", "web-sys", "webpki-roots", ] @@ -3527,6 +3530,19 @@ dependencies = [ "wasmparser", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasmparser" version = "0.244.0" diff --git a/backend/Cargo.toml b/backend/Cargo.toml index c091570..c6640b8 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = 
"mangalord" -version = "0.34.0" +version = "0.34.1" edition = "2021" default-run = "mangalord" @@ -46,7 +46,7 @@ futures-util = "0.3" bytes = "1" chromiumoxide = { version = "0.7", features = ["tokio-runtime", "_fetcher-rusttls-tokio"], default-features = false } scraper = "0.20" -reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "socks", "cookies"] } +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "socks", "cookies", "stream"] } [dev-dependencies] tempfile = "3" diff --git a/backend/src/app.rs b/backend/src/app.rs index 27734bb..e22fb16 100644 --- a/backend/src/app.rs +++ b/backend/src/app.rs @@ -19,6 +19,7 @@ use crate::crawler::daemon::{self, ChapterDispatcher, DaemonConfig, MetadataPass use crate::crawler::jobs::JobPayload; use crate::crawler::pipeline::{self, MetadataStats}; use crate::crawler::rate_limit::HostRateLimiters; +use crate::crawler::safety::DownloadAllowlist; use crate::crawler::session; use crate::crawler::source::{target as target_source, DiscoverMode}; use crate::repo; @@ -153,6 +154,8 @@ async fn spawn_crawler_daemon( start_url: url.clone(), mode_pref: cfg.mode, incremental_stop_after: cfg.incremental_stop_after, + download_allowlist: cfg.download_allowlist.clone(), + max_image_bytes: cfg.max_image_bytes, }); m }); @@ -163,6 +166,8 @@ async fn spawn_crawler_daemon( storage: Arc::clone(&storage), http, rate: Arc::clone(&rate), + download_allowlist: cfg.download_allowlist.clone(), + max_image_bytes: cfg.max_image_bytes, }); // Shared cancellation: daemon shutdown cancels the BrowserManager's @@ -216,6 +221,8 @@ struct RealMetadataPass { start_url: String, mode_pref: CrawlerModePref, incremental_stop_after: usize, + download_allowlist: DownloadAllowlist, + max_image_bytes: usize, } #[async_trait] @@ -238,6 +245,8 @@ impl MetadataPass for RealMetadataPass { 0, false, mode, + &self.download_allowlist, + self.max_image_bytes, ) .await } @@ -293,6 +302,8 @@ struct RealChapterDispatcher 
{ storage: Arc, http: reqwest::Client, rate: Arc, + download_allowlist: DownloadAllowlist, + max_image_bytes: usize, } #[async_trait] @@ -331,6 +342,8 @@ impl ChapterDispatcher for RealChapterDispatcher { manga_id, &source_url, false, + &self.download_allowlist, + self.max_image_bytes, ) .await?; drop(lease); diff --git a/backend/src/bin/crawler.rs b/backend/src/bin/crawler.rs index 58c80e7..346e838 100644 --- a/backend/src/bin/crawler.rs +++ b/backend/src/bin/crawler.rs @@ -229,6 +229,33 @@ async fn run( } let rate = Arc::new(rate); + // SSRF defence: only download from the catalog host + CDN host + // (plus optional CRAWLER_DOWNLOAD_ALLOWLIST extras), and cap + // single-image downloads at CRAWLER_MAX_IMAGE_BYTES bytes. + let mut allowlist = + mangalord::crawler::safety::DownloadAllowlist::new(); + if let Ok(parsed) = reqwest::Url::parse(start_url) { + if let Some(h) = parsed.host_str() { + allowlist = allowlist.allow(h); + } + } + if let Some(host) = cdn_host { + allowlist = allowlist.allow(host); + } + if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") { + for piece in extras.split(',') { + let trimmed = piece.trim(); + if !trimmed.is_empty() { + allowlist = allowlist.allow(trimmed); + } + } + } + let max_image_bytes: usize = std::env::var("CRAWLER_MAX_IMAGE_BYTES") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(mangalord::crawler::safety::DEFAULT_MAX_IMAGE_BYTES); + let allowlist = Arc::new(allowlist); + let stats = pipeline::run_metadata_pass( manager.as_ref(), db, @@ -239,6 +266,8 @@ async fn run( limit, skip_chapters, mode, + allowlist.as_ref(), + max_image_bytes, ) .await?; tracing::info!(?stats, "metadata pass complete"); @@ -253,6 +282,8 @@ async fn run( "target", chapter_workers, force_refetch_chapters, + Arc::clone(&allowlist), + max_image_bytes, ) .await?; } @@ -276,6 +307,8 @@ async fn sync_bookmarked_chapter_content( source_id: &str, workers: usize, force_refetch: bool, + allowlist: Arc, + max_image_bytes: usize, ) -> 
anyhow::Result<()> { let pending: Vec<(Uuid, Uuid, String)> = sqlx::query_as( r#" @@ -312,6 +345,7 @@ async fn sync_bookmarked_chapter_content( let storage = Arc::clone(&storage); let rate = Arc::clone(&rate); let manager = Arc::clone(&manager); + let allowlist = Arc::clone(&allowlist); let stats = &stats; async move { if session_expired.load(std::sync::atomic::Ordering::Relaxed) { @@ -336,6 +370,8 @@ async fn sync_bookmarked_chapter_content( manga_id, &source_url, force_refetch, + allowlist.as_ref(), + max_image_bytes, ) .await; drop(lease); diff --git a/backend/src/config.rs b/backend/src/config.rs index 7f0180a..dc31006 100644 --- a/backend/src/config.rs +++ b/backend/src/config.rs @@ -5,6 +5,7 @@ use chrono::NaiveTime; use chrono_tz::Tz; use crate::crawler::browser::LaunchOptions; +use crate::crawler::safety::{DownloadAllowlist, DEFAULT_MAX_IMAGE_BYTES}; use crate::crawler::source::DiscoverMode; /// What `CRAWLER_MODE` was set to. `Auto` is the daemon's default — @@ -93,6 +94,13 @@ pub struct CrawlerConfig { /// `stop_after_unchanged` threshold supplied to Incremental in both /// `Auto` (post-seed) and `Explicit(Incremental)` modes. pub incremental_stop_after: usize, + /// Hosts the crawler is allowed to download images / covers from. + /// Always seeded with the host of `start_url` and (when set) the + /// configured `cdn_host`. Additional hosts can be added via + /// `CRAWLER_DOWNLOAD_ALLOWLIST` (comma-separated). + pub download_allowlist: DownloadAllowlist, + /// Hard upper bound on a single image download. Defaults to 32 MiB. 
+ pub max_image_bytes: usize, } impl Default for CrawlerConfig { @@ -115,6 +123,8 @@ impl Default for CrawlerConfig { browser: LaunchOptions::headless(), mode: CrawlerModePref::Auto, incremental_stop_after: 20, + download_allowlist: DownloadAllowlist::new(), + max_image_bytes: DEFAULT_MAX_IMAGE_BYTES, } } } @@ -172,6 +182,14 @@ impl CrawlerConfig { let incremental_stop_after = env_u64("CRAWLER_INCREMENTAL_STOP_AFTER", 20).max(1) as usize; let mode = parse_mode_env(incremental_stop_after)?; + let start_url = std::env::var("CRAWLER_START_URL") + .ok() + .filter(|s| !s.trim().is_empty()); + let cdn_host = std::env::var("CRAWLER_CDN_HOST") + .ok() + .filter(|s| !s.trim().is_empty()); + let download_allowlist = + build_download_allowlist(start_url.as_deref(), cdn_host.as_deref()); Ok(Self { daemon_enabled: env_bool("CRAWLER_DAEMON", true), daily_at, @@ -179,13 +197,9 @@ impl CrawlerConfig { idle_timeout: Duration::from_secs(env_u64("CRAWLER_IDLE_TIMEOUT_S", 600)), chapter_workers: env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize, retention_days: env_u64("CRAWLER_JOB_RETENTION_DAYS", 7) as u32, - start_url: std::env::var("CRAWLER_START_URL") - .ok() - .filter(|s| !s.trim().is_empty()), + start_url, rate_ms: env_u64("CRAWLER_RATE_MS", 1000), - cdn_host: std::env::var("CRAWLER_CDN_HOST") - .ok() - .filter(|s| !s.trim().is_empty()), + cdn_host, cdn_rate_ms: env_u64("CRAWLER_CDN_RATE_MS", env_u64("CRAWLER_RATE_MS", 1000)), phpsessid: std::env::var("CRAWLER_PHPSESSID") .ok() @@ -202,10 +216,45 @@ impl CrawlerConfig { browser: LaunchOptions::from_env(), mode, incremental_stop_after, + download_allowlist, + max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES), }) } } +/// Build the download allowlist from env. Always includes +/// `CRAWLER_START_URL`'s host (so the crawler can fetch covers from +/// the catalog itself) and `CRAWLER_CDN_HOST` when set. Additional +/// hosts can be supplied via `CRAWLER_DOWNLOAD_ALLOWLIST` (comma- +/// separated). 
Empty by default — meaning the crawler refuses to +/// download anything when no source is configured, which is the safe +/// fail-closed posture. +fn build_download_allowlist( + start_url: Option<&str>, + cdn_host: Option<&str>, +) -> DownloadAllowlist { + let mut allow = DownloadAllowlist::new(); + if let Some(url) = start_url { + if let Ok(parsed) = reqwest::Url::parse(url) { + if let Some(h) = parsed.host_str() { + allow = allow.allow(h); + } + } + } + if let Some(host) = cdn_host { + allow = allow.allow(host); + } + if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") { + for piece in extras.split(',') { + let trimmed = piece.trim(); + if !trimmed.is_empty() { + allow = allow.allow(trimmed); + } + } + } + allow +} + /// Parse `CRAWLER_MODE`. Empty/unset → `Auto`. Recognized values are /// `auto`, `backfill`, and `incremental` (case-insensitive). Anything /// else is a hard error so a typo can't silently fall through to the diff --git a/backend/src/crawler/content.rs b/backend/src/crawler/content.rs index 71fbfe3..1d5706a 100644 --- a/backend/src/crawler/content.rs +++ b/backend/src/crawler/content.rs @@ -18,7 +18,8 @@ use uuid::Uuid; use crate::crawler::detect::PageError; use crate::crawler::rate_limit::HostRateLimiters; -use crate::crawler::session; +use crate::crawler::safety::{fetch_bytes_capped, looks_like_image, DownloadAllowlist}; +use crate::crawler::session::{self, ChapterProbe}; use crate::storage::Storage; /// Parse the chapter page DOM and return the page images in `pageN` @@ -88,6 +89,8 @@ pub async fn sync_chapter_content( manga_id: Uuid, source_url: &str, force_refetch: bool, + allowlist: &DownloadAllowlist, + max_image_bytes: usize, ) -> anyhow::Result { // Skip if already fetched, unless caller explicitly forces. 
if !force_refetch { @@ -110,16 +113,28 @@ pub async fn sync_chapter_content( .with_context(|| format!("open chapter page {source_url}"))?; page.wait_for_navigation().await.context("wait for chapter nav")?; - // Session probe: avatar present == still logged in. Missing means - // PHPSESSID expired; bail the entire crawler run. - if page.find_element("#avatar_menu").await.is_err() { - page.close().await.ok(); - return Ok(SyncOutcome::SessionExpired); - } - let html = page.content().await.context("read chapter html")?; page.close().await.ok(); + // Three-way session classification: distinguishes a transient + // hiccup (broken-page body or logged-in-but-no-reader) from a + // genuine PHPSESSID expiry (no reader and no avatar widget). The + // earlier binary `#avatar_menu` check conflated both and froze + // every worker on a layout shift. + match session::classify_chapter_probe(&html) { + ChapterProbe::Unauthenticated => return Ok(SyncOutcome::SessionExpired), + ChapterProbe::Transient => { + // Surface as a typed Err so the dispatcher path runs + // ack_failed with exponential backoff (rather than the + // session-expired sticky flag). + anyhow::bail!( + "chapter page at {source_url} returned a transient response \ + (broken-page body or reader didn't render); will retry" + ); + } + ChapterProbe::Ok => {} + } + let images = parse_chapter_pages(&html) .with_context(|| format!("parse chapter pages at {source_url}"))?; if images.is_empty() { @@ -138,18 +153,29 @@ pub async fn sync_chapter_content( format!("join image URL {} onto {source_url}", img.url) })?; rate.wait_for(url.as_str()).await?; - let resp = http - .get(url.clone()) - // Source CDNs commonly check Referer. Set it to the - // chapter page — matches what the browser would send. - .header(reqwest::header::REFERER, source_url) - .send() - .await - .with_context(|| format!("GET {url}"))? 
- .error_for_status() - .with_context(|| format!("non-2xx for {url}"))?; - let bytes = resp.bytes().await.context("read image body")?.to_vec(); - let ext = infer::get(&bytes).map(|k| k.extension()).unwrap_or("bin"); + let bytes = fetch_bytes_capped( + http, + url.as_str(), + Some(source_url), + allowlist, + max_image_bytes, + ) + .await? + .to_vec(); + // Reject any non-image response: the only valid output of an + // image URL is an image. `infer` returns None on truncated + // bytes too, which also wants to be a failure not a silent + // `.bin` extension. + if !looks_like_image(&bytes) { + anyhow::bail!( + "image URL {url} returned non-image bytes \ + (first 16: {:?}); refusing to store as binary blob", + &bytes.get(..16.min(bytes.len())) + ); + } + let ext = infer::get(&bytes) + .map(|k| k.extension()) + .expect("looks_like_image asserted infer succeeded"); fetched.push((img.page_number, bytes, ext)); } @@ -194,8 +220,9 @@ pub async fn sync_chapter_content( Ok(SyncOutcome::Fetched { pages: fetched.len() }) } -// Suppress unused-import warning for `session` until the bin/crawler -// wiring lands in this branch and uses it through this module. +// Suppress unused-import warning for `session::registrable_domain` +// until the bin/crawler wiring lands in this branch and uses it +// through this module. 
#[allow(dead_code)] fn _keep_session_in_scope() { let _ = session::registrable_domain; diff --git a/backend/src/crawler/mod.rs b/backend/src/crawler/mod.rs index be3dcaa..9d54cff 100644 --- a/backend/src/crawler/mod.rs +++ b/backend/src/crawler/mod.rs @@ -22,5 +22,6 @@ pub mod diff; pub mod jobs; pub mod pipeline; pub mod rate_limit; +pub mod safety; pub mod session; pub mod source; diff --git a/backend/src/crawler/pipeline.rs b/backend/src/crawler/pipeline.rs index e1050e9..4c72ad5 100644 --- a/backend/src/crawler/pipeline.rs +++ b/backend/src/crawler/pipeline.rs @@ -9,6 +9,7 @@ use uuid::Uuid; use crate::crawler::browser_manager::BrowserManager; use crate::crawler::jobs::{self, EnqueueResult, JobPayload}; use crate::crawler::rate_limit::HostRateLimiters; +use crate::crawler::safety::{fetch_bytes_capped, looks_like_image, DownloadAllowlist}; use crate::crawler::source::target::TargetSource; use crate::crawler::source::{DiscoverMode, FetchContext, Source}; use crate::repo; @@ -62,6 +63,8 @@ pub async fn run_metadata_pass( limit: usize, skip_chapters: bool, mode: DiscoverMode, + allowlist: &DownloadAllowlist, + max_image_bytes: usize, ) -> anyhow::Result { let lease = browser_manager .acquire() @@ -181,6 +184,8 @@ pub async fn run_metadata_pass( &r.url, upsert.manga_id, cover_url, + allowlist, + max_image_bytes, ) .await { @@ -382,6 +387,7 @@ pub struct EnqueueSummary { /// pipeline because the CLI still calls it from its inline chapter-content /// loop; once the worker pool fully replaces that path we can fold this /// into `pipeline` proper. +#[allow(clippy::too_many_arguments)] async fn download_and_store_cover( db: &PgPool, storage: &dyn Storage, @@ -390,6 +396,8 @@ async fn download_and_store_cover( manga_url: &str, manga_id: Uuid, cover_url: &str, + allowlist: &DownloadAllowlist, + max_image_bytes: usize, ) -> anyhow::Result<()> { let absolute = reqwest::Url::parse(manga_url) .context("parse manga URL")? 
@@ -397,17 +405,22 @@ async fn download_and_store_cover( .context("join cover URL onto manga URL")?; rate.wait_for(absolute.as_str()).await?; - let resp = http - .get(absolute.clone()) - .header(reqwest::header::REFERER, manga_url) - .send() - .await - .with_context(|| format!("GET {absolute}"))? - .error_for_status() - .with_context(|| format!("non-2xx for {absolute}"))?; - let bytes = resp.bytes().await.context("read cover body")?; - let kind = infer::get(&bytes); - let ext = kind.map(|k| k.extension()).unwrap_or("bin"); + let bytes = fetch_bytes_capped( + http, + absolute.as_str(), + Some(manga_url), + allowlist, + max_image_bytes, + ) + .await?; + if !looks_like_image(&bytes) { + anyhow::bail!( + "cover URL {absolute} returned non-image bytes; refusing to store as binary blob" + ); + } + let ext = infer::get(&bytes) + .map(|k| k.extension()) + .expect("looks_like_image asserted infer succeeded"); let key = format!("mangas/{manga_id}/cover.{ext}"); storage diff --git a/backend/src/crawler/safety.rs b/backend/src/crawler/safety.rs new file mode 100644 index 0000000..d9c62ad --- /dev/null +++ b/backend/src/crawler/safety.rs @@ -0,0 +1,486 @@ +//! Defensive helpers for the image-download paths. +//! +//! Two threats this module addresses: +//! +//! - **SSRF**: a scraped chapter or manga page can embed an absolute +//! `<img src="http://internal-host/...">`. The crawler runs inside the +//! backend container with intra-compose access to `postgres:5432` +//! and possibly other internal services; without a host check the +//! crawler would happily probe them. [`is_safe_url`] rejects +//! anything whose host isn't on the operator-configured allowlist, +//! plus any IP literal in RFC1918 / loopback / link-local / unique- +//! local space (including IPv4-mapped IPv6 like `::ffff:127.0.0.1`) +//! as a second defence for the case where a private IP literal +//! ends up on the allowlist itself. +//! +//! **DNS rebinding is not covered.** A hostname like `cdn.allowed.com` +//! 
that *resolves* to `127.0.0.1` via hostile DNS bypasses the IP +//! check entirely — `is_safe_url` only inspects URL strings, not +//! resolved IPs. Mitigating that requires a custom reqwest resolver +//! that filters IPs after DNS, which would mean rebuilding reqwest's +//! connector. The allowlist + good operator DNS hygiene is the +//! realistic mitigation today. +//! +//! - **Unbounded download**: `Response::bytes().await` reads the full +//! body before returning. A malicious source serving a 10 GiB image +//! would fill memory and then disk. [`accumulate_capped`] streams +//! the body chunk-by-chunk into a [`bytes::BytesMut`] and bails as +//! soon as the running total exceeds the cap. +//! +//! Both helpers are pure-data: the SSRF check is keyed off a parsed +//! URL string, and the byte accumulator is keyed off a generic stream. +//! Easy to unit-test without a live network or browser. + +use std::net::IpAddr; + +use anyhow::{bail, Context}; +use bytes::BytesMut; +use futures_util::StreamExt; +use reqwest::Url; + +/// Default per-image download cap. A page image is generally <2 MiB; +/// 32 MiB leaves headroom for high-resolution covers while still +/// stopping a misbehaving CDN dead. Override via `CRAWLER_MAX_IMAGE_BYTES`. +pub const DEFAULT_MAX_IMAGE_BYTES: usize = 32 * 1024 * 1024; + +/// Hosts that are always allowed in addition to the operator's +/// configured allowlist. None by default — keeping the surface area +/// minimal so the only way a URL gets through is if it matches an +/// explicit catalog/CDN entry. +#[derive(Clone, Debug, Default)] +pub struct DownloadAllowlist { + hosts: Vec, +} + +impl DownloadAllowlist { + pub fn new() -> Self { + Self { hosts: Vec::new() } + } + + /// Add a host (case-insensitive match). Sub-domains are *not* + /// implied: pass `cdn.example.com` and `example.com` separately + /// if both should be reachable. 
+ pub fn allow(mut self, host: impl Into) -> Self { + let h = host.into().to_ascii_lowercase(); + if !h.is_empty() && !self.hosts.iter().any(|existing| existing == &h) { + self.hosts.push(h); + } + self + } + + pub fn is_empty(&self) -> bool { + self.hosts.is_empty() + } + + pub fn contains(&self, host: &str) -> bool { + let lower = host.to_ascii_lowercase(); + self.hosts.iter().any(|h| h == &lower) + } +} + +/// Verify a URL is safe for the crawler to fetch. +/// +/// Rejects: +/// - non-http(s) schemes (file://, gopher://, …), +/// - any IP literal in private / loopback / link-local / unique-local +/// space (defense in depth — a DNS allowlist alone wouldn't cover an +/// attacker that places an entry like `cdn.evil` pointing at +/// `192.168.1.1`), +/// - the literal hostname `localhost`, +/// - hosts that aren't on the supplied allowlist. +/// +/// An empty allowlist rejects everything (the conservative default — +/// callers must explicitly allow the catalog and CDN hosts). +pub fn is_safe_url(raw_url: &str, allow: &DownloadAllowlist) -> Result<(), UrlSafetyError> { + let url = Url::parse(raw_url).map_err(|_| UrlSafetyError::Unparseable)?; + let scheme = url.scheme(); + if scheme != "http" && scheme != "https" { + return Err(UrlSafetyError::BadScheme(scheme.to_string())); + } + let host = url.host_str().ok_or(UrlSafetyError::NoHost)?; + let lower_host = host.to_ascii_lowercase(); + if lower_host == "localhost" { + return Err(UrlSafetyError::Loopback); + } + // Reject IP literals in private/loopback ranges regardless of the + // allowlist — if someone puts an IP literal on the allowlist they + // almost certainly didn't mean a private range. + // reqwest::Url normalises IPv6 literals as `[::1]` (brackets + // included) in `host_str()`. Strip the brackets before parsing. 
+ let ip_candidate = lower_host + .strip_prefix('[') + .and_then(|s| s.strip_suffix(']')) + .unwrap_or(&lower_host); + if let Ok(ip) = ip_candidate.parse::() { + if is_private_ip(&ip) { + return Err(UrlSafetyError::PrivateIp(ip)); + } + } + if !allow.contains(&lower_host) { + return Err(UrlSafetyError::HostNotAllowed(lower_host)); + } + Ok(()) +} + +fn is_private_ip(ip: &IpAddr) -> bool { + match ip { + IpAddr::V4(v4) => { + v4.is_loopback() + || v4.is_private() + || v4.is_link_local() + || v4.is_unspecified() + || v4.is_broadcast() + // CGNAT 100.64.0.0/10 + || (v4.octets()[0] == 100 && (v4.octets()[1] & 0xC0) == 64) + // 169.254/16 link-local already covered, but 0.0.0.0/8 is special-use + || v4.octets()[0] == 0 + } + IpAddr::V6(v6) => { + // IPv4-mapped IPv6 (::ffff:0:0/96): unwrap to the embedded + // IPv4 and recurse so `::ffff:127.0.0.1` is caught by the + // IPv4 loopback check rather than passing through. + // `Ipv6Addr::is_loopback()` only matches `::1` exactly. + if let Some(v4) = v6.to_ipv4_mapped() { + return is_private_ip(&IpAddr::V4(v4)); + } + v6.is_loopback() + || v6.is_unspecified() + // fc00::/7 unique-local + || (v6.segments()[0] & 0xfe00) == 0xfc00 + // fe80::/10 link-local + || (v6.segments()[0] & 0xffc0) == 0xfe80 + } + } +} + +#[derive(Debug, thiserror::Error, PartialEq, Eq)] +pub enum UrlSafetyError { + #[error("URL is not parseable")] + Unparseable, + #[error("scheme {0:?} is not http or https")] + BadScheme(String), + #[error("URL is missing a host")] + NoHost, + #[error("host points at the loopback interface")] + Loopback, + #[error("host is a private/internal IP: {0}")] + PrivateIp(IpAddr), + #[error("host {0:?} is not on the crawler download allowlist")] + HostNotAllowed(String), +} + +/// Drain a byte stream into a single buffer, bailing out as soon as +/// the running total exceeds `max_bytes`. Generic over the stream so +/// it's testable without a live HTTP response. 
+pub async fn accumulate_capped<S, E>(stream: S, max_bytes: usize) -> anyhow::Result<bytes::Bytes>
+where
+    S: futures_core::Stream<Item = Result<bytes::Bytes, E>>,
+    E: std::error::Error + Send + Sync + 'static,
+{
+    let mut buf = BytesMut::new();
+    let mut stream = std::pin::pin!(stream);
+    while let Some(chunk) = stream.next().await {
+        // The stream error is rendered into the message (not attached as
+        // a source) so its text stays visible in the top-level `Display`.
+        let chunk = chunk.map_err(|e| anyhow::anyhow!("stream chunk: {e}"))?;
+        // Check *before* copying so the buffer never grows past the cap;
+        // `saturating_add` keeps the comparison sound near `usize::MAX`.
+        if buf.len().saturating_add(chunk.len()) > max_bytes {
+            bail!(
+                "response exceeds {max_bytes}-byte cap (received >{}+{})",
+                buf.len(),
+                chunk.len()
+            );
+        }
+        buf.extend_from_slice(&chunk);
+    }
+    Ok(buf.freeze())
+}
+
+/// GET `url` with `http` and return the body, refusing unsafe URLs and
+/// oversized responses.
+///
+/// Combines the [`is_safe_url`] check with [`accumulate_capped`] so each
+/// call-site is a single call. `referer`, when set, is sent as the
+/// `Referer` header.
+///
+/// # Errors
+///
+/// Fails when `url` — or the URL that finally answered, after any
+/// redirects — is rejected by [`is_safe_url`], when the request fails or
+/// returns a non-2xx status, or when the body is larger than `max_bytes`.
+///
+/// # Redirects
+///
+/// `reqwest` follows redirects by default, so an allowlisted host could
+/// bounce the crawler to an internal address. The final response URL is
+/// therefore re-validated before any body bytes are read. That keeps the
+/// redirected response from being downloaded and stored, but the
+/// redirected request itself has already been sent by then; blocking it
+/// outright requires a redirect policy on the `Client` behind `http`.
+pub async fn fetch_bytes_capped(
+    http: &reqwest::Client,
+    url: &str,
+    referer: Option<&str>,
+    allow: &DownloadAllowlist,
+    max_bytes: usize,
+) -> anyhow::Result<bytes::Bytes> {
+    is_safe_url(url, allow).with_context(|| format!("reject unsafe URL {url}"))?;
+    let mut req = http.get(url);
+    if let Some(r) = referer {
+        req = req.header(reqwest::header::REFERER, r);
+    }
+    let resp = req.send().await.with_context(|| format!("GET {url}"))?;
+
+    // The allowlist must hold for whoever actually answered, not just for
+    // the URL we asked for. Without redirects this re-checks the same URL.
+    let final_url = resp.url();
+    is_safe_url(final_url.as_str(), allow)
+        .with_context(|| format!("reject unsafe redirect target {final_url} (requested {url})"))?;
+
+    let resp = resp
+        .error_for_status()
+        .with_context(|| format!("non-2xx for {url}"))?;
+
+    // Fast path: when the server declares a length over the cap, bail
+    // before streaming anything. The streaming cap below still applies
+    // when the header is absent or understated.
+    if let Some(declared) = resp.content_length() {
+        if declared > u64::try_from(max_bytes).unwrap_or(u64::MAX) {
+            bail!(
+                "response for {url} declares {declared} bytes, exceeding the {max_bytes}-byte cap"
+            );
+        }
+    }
+
+    accumulate_capped(resp.bytes_stream(), max_bytes)
+        .await
+        .with_context(|| format!("download body for {url}"))
+}
+
+/// True when `bytes` sniffs as one of the *renderable* image formats
+/// the `/files/*key` endpoint can serve with a correct Content-Type:
+/// JPEG, PNG, WebP, GIF, AVIF. Matches the upload pipeline's
+/// whitelist in `upload::parse_image`.
+///
+/// `infer::MatcherType::Image` is intentionally NOT used — it also
+/// matches BMP, TIFF, HEIF, ICO, PSD, and JP2. Those would sniff as
+/// "image" here but [`api::files::content_type_for`] would fall back
+/// to `application/octet-stream`, prompting browsers to download
+/// instead of render. Keep the two layers aligned.
+pub fn looks_like_image(bytes: &[u8]) -> bool { + matches!( + infer::get(bytes).map(|k| k.mime_type()), + Some("image/jpeg" | "image/png" | "image/webp" | "image/gif" | "image/avif") + ) +} + +#[cfg(test)] +mod tests { + use super::*; + use futures_util::stream; + + fn allow_just(host: &str) -> DownloadAllowlist { + DownloadAllowlist::new().allow(host) + } + + #[test] + fn safe_url_allows_listed_host() { + let allow = allow_just("cdn.example.com"); + assert!(is_safe_url("https://cdn.example.com/img.jpg", &allow).is_ok()); + } + + #[test] + fn safe_url_blocks_unlisted_host() { + let allow = allow_just("cdn.example.com"); + let err = is_safe_url("https://evil.example.org/img.jpg", &allow).unwrap_err(); + assert!(matches!(err, UrlSafetyError::HostNotAllowed(h) if h == "evil.example.org")); + } + + #[test] + fn safe_url_blocks_localhost_even_if_allowlisted() { + let allow = allow_just("localhost"); + assert!(matches!( + is_safe_url("http://localhost:8080/", &allow).unwrap_err(), + UrlSafetyError::Loopback + )); + } + + #[test] + fn safe_url_blocks_loopback_ipv4() { + let allow = allow_just("127.0.0.1"); + assert!(matches!( + is_safe_url("http://127.0.0.1/", &allow).unwrap_err(), + UrlSafetyError::PrivateIp(_) + )); + } + + #[test] + fn safe_url_blocks_rfc1918() { + let allow = allow_just("10.0.0.1"); + for url in [ + "http://10.0.0.1/", + "http://192.168.1.1/", + "http://172.16.0.5/", + "http://172.31.255.255/", + ] { + assert!( + matches!( + is_safe_url(url, &allow).unwrap_err(), + UrlSafetyError::PrivateIp(_) + ), + "should reject {url}" + ); + } + } + + #[test] + fn safe_url_blocks_link_local() { + let allow = allow_just("169.254.169.254"); + // 169.254.169.254 is the AWS/GCP metadata service — the most + // dangerous SSRF target on a default cloud VM. 
+ assert!(matches!( + is_safe_url("http://169.254.169.254/", &allow).unwrap_err(), + UrlSafetyError::PrivateIp(_) + )); + } + + #[test] + fn safe_url_blocks_ipv6_loopback_and_ula() { + // Debug what host_str returns first — reqwest::Url normalises + // IPv6 literals as `[::1]` with brackets, which doesn't parse + // as `IpAddr` directly. The implementation strips them. + let allow = allow_just("[::1]"); + let err = is_safe_url("http://[::1]/", &allow).unwrap_err(); + assert!( + matches!(err, UrlSafetyError::PrivateIp(_)), + "expected PrivateIp, got {err:?}" + ); + let allow = allow_just("[fd00::1]"); + let err = is_safe_url("http://[fd00::1]/", &allow).unwrap_err(); + assert!( + matches!(err, UrlSafetyError::PrivateIp(_)), + "expected PrivateIp, got {err:?}" + ); + } + + #[test] + fn safe_url_blocks_ipv4_mapped_ipv6_loopback() { + // `Ipv6Addr::is_loopback()` only matches `::1` exactly, so + // `::ffff:127.0.0.1` would slip through without the + // to_ipv4_mapped() unwrap in is_private_ip. 
+ let allow = allow_just("[::ffff:127.0.0.1]"); + let err = is_safe_url("http://[::ffff:127.0.0.1]/", &allow).unwrap_err(); + assert!( + matches!(err, UrlSafetyError::PrivateIp(_)), + "expected PrivateIp, got {err:?}" + ); + } + + #[test] + fn safe_url_blocks_ipv4_mapped_ipv6_rfc1918() { + let allow = allow_just("[::ffff:10.0.0.1]"); + let err = is_safe_url("http://[::ffff:10.0.0.1]/", &allow).unwrap_err(); + assert!(matches!(err, UrlSafetyError::PrivateIp(_))); + } + + #[test] + fn safe_url_blocks_non_http_schemes() { + let allow = allow_just("anywhere"); + assert!(matches!( + is_safe_url("file:///etc/passwd", &allow).unwrap_err(), + UrlSafetyError::BadScheme(_) + )); + assert!(matches!( + is_safe_url("gopher://anywhere:70/", &allow).unwrap_err(), + UrlSafetyError::BadScheme(_) + )); + } + + #[test] + fn safe_url_rejects_unparseable() { + let allow = allow_just("anywhere"); + assert!(matches!( + is_safe_url("not a url", &allow).unwrap_err(), + UrlSafetyError::Unparseable + )); + } + + #[test] + fn safe_url_empty_allowlist_rejects_everything() { + let allow = DownloadAllowlist::new(); + let err = is_safe_url("https://cdn.example.com/img.jpg", &allow).unwrap_err(); + assert!(matches!(err, UrlSafetyError::HostNotAllowed(_))); + } + + #[test] + fn allowlist_matches_case_insensitively() { + let allow = DownloadAllowlist::new().allow("CDN.Example.COM"); + assert!(is_safe_url("https://cdn.example.com/x.jpg", &allow).is_ok()); + assert!(is_safe_url("https://CDN.EXAMPLE.com/x.jpg", &allow).is_ok()); + } + + #[tokio::test] + async fn accumulate_capped_returns_full_body_under_cap() { + let chunks: Vec> = vec![ + Ok(bytes::Bytes::from_static(b"hello ")), + Ok(bytes::Bytes::from_static(b"world")), + ]; + let s = stream::iter(chunks); + let out = accumulate_capped(s, 100).await.unwrap(); + assert_eq!(out.as_ref(), b"hello world"); + } + + #[tokio::test] + async fn accumulate_capped_bails_past_cap() { + let chunks: Vec> = vec![ + Ok(bytes::Bytes::from(vec![0u8; 50])), + 
Ok(bytes::Bytes::from(vec![0u8; 60])), + ]; + let s = stream::iter(chunks); + let err = accumulate_capped(s, 100).await.unwrap_err(); + assert!(err.to_string().contains("100-byte cap")); + } + + #[tokio::test] + async fn accumulate_capped_surfaces_stream_errors() { + let chunks: Vec> = vec![ + Ok(bytes::Bytes::from_static(b"ok")), + Err(std::io::Error::other("network blip")), + ]; + let s = stream::iter(chunks); + let err = accumulate_capped(s, 100).await.unwrap_err(); + assert!(err.to_string().contains("network blip")); + } + + #[test] + fn looks_like_image_accepts_jpeg() { + // JPEG SOI + APP0 segment. + let jpeg = [0xff, 0xd8, 0xff, 0xe0, 0, 0x10, b'J', b'F', b'I', b'F']; + assert!(looks_like_image(&jpeg)); + } + + #[test] + fn looks_like_image_accepts_png() { + let png = [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0, 0, 0, 0]; + assert!(looks_like_image(&png)); + } + + #[test] + fn looks_like_image_rejects_html_disguised_as_image() { + let html = b"not an image"; + assert!(!looks_like_image(html)); + } + + #[test] + fn looks_like_image_rejects_empty() { + assert!(!looks_like_image(&[])); + } + + #[test] + fn looks_like_image_rejects_renderable_but_unsupported_formats() { + // BMP, TIFF, ICO, PSD are `infer::MatcherType::Image` but the + // /files/*key handler doesn't have Content-Type mappings for + // them, so they'd be served as application/octet-stream and + // download instead of render. Reject at the crawler so we + // never land them in storage. + // BMP magic: "BM" + 4-byte size. + let bmp = [b'B', b'M', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert!(!looks_like_image(&bmp), "BMP must be rejected (not renderable by /files)"); + + // TIFF little-endian magic: "II" + 42. + let tiff = [0x49, 0x49, 0x2a, 0x00, 0, 0, 0, 0]; + assert!(!looks_like_image(&tiff), "TIFF must be rejected"); + + // ICO magic: 0x00,0x00,0x01,0x00. 
+ let ico = [0x00, 0x00, 0x01, 0x00, 1, 0, 16, 16, 0, 0, 1, 0, 0x18, 0, 0x40, 0, 0, 0, 0x16, 0, 0, 0]; + assert!(!looks_like_image(&ico), "ICO must be rejected"); + } + + #[test] + fn looks_like_image_accepts_webp_gif_avif() { + // Cover the three remaining whitelisted formats so a future + // tightening that drops one would fail noisily. + let webp = [ + b'R', b'I', b'F', b'F', + 0, 0, 0, 0, + b'W', b'E', b'B', b'P', + b'V', b'P', b'8', b' ', + ]; + assert!(looks_like_image(&webp)); + + let gif = [b'G', b'I', b'F', b'8', b'7', b'a', 0, 0, 0, 0]; + assert!(looks_like_image(&gif)); + + let avif = [ + 0x00, 0x00, 0x00, 0x18, + b'f', b't', b'y', b'p', + b'a', b'v', b'i', b'f', + 0x00, 0x00, 0x00, 0x00, + b'm', b'i', b'f', b'1', + b'a', b'v', b'i', b'f', + ]; + assert!(looks_like_image(&avif)); + } +} diff --git a/backend/src/crawler/session.rs b/backend/src/crawler/session.rs index 209ea5c..b35efe5 100644 --- a/backend/src/crawler/session.rs +++ b/backend/src/crawler/session.rs @@ -127,6 +127,54 @@ pub fn classify_probe(html: &str) -> SessionProbe { } } +/// Three-way classification of a chapter page response. +/// +/// Reader pages don't render `#logo`, so [`classify_probe`] can't be +/// reused as-is. The chapter-specific marker is `a#pic_container` +/// (asserted by the reader-page parser at `parse_chapter_pages`). +/// +/// Order matters: broken-page body wins over selector matches, so a +/// transient site-wide 5xx that happens to include the reader anchor +/// doesn't falsely reach `Ok`. An avatar match alone never yields `Ok`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ChapterProbe { + /// `a#pic_container` present — reader rendered. Whether + /// `#avatar_menu` is also there is informational; if the reader + /// loaded, the session is by definition still good. + Ok, + /// Site rendered a "logged out" or "please log in" page (no + /// reader, no broken-page body, and no avatar widget either). + /// Distinguishes the genuine expired-session case from a + /// transient site hiccup.
+ Unauthenticated, + /// Broken-page body, or reader didn't render but the user is + /// still logged in (avatar widget present). Caller should retry + /// rather than blame the session. + Transient, +} + +pub fn classify_chapter_probe(html: &str) -> ChapterProbe { + if is_broken_page_body(html) { + return ChapterProbe::Transient; + } + let doc = scraper::Html::parse_document(html); + let container = scraper::Selector::parse("a#pic_container").expect("static reader selector is valid CSS"); + if doc.select(&container).next().is_some() { + return ChapterProbe::Ok; + } + let avatar = scraper::Selector::parse("#avatar_menu").expect("static avatar selector is valid CSS"); + if doc.select(&avatar).next().is_some() { + // Logged-in user, but the reader didn't render — most likely + // the layout shifted or the site is serving an interstitial. + ChapterProbe::Transient + } else { + // No reader, no avatar, no broken-body marker — site rendered + // the "please log in" page, which is the genuine session- + // expired signal on this route. + ChapterProbe::Unauthenticated + } +} + /// In-startup retry budget for the session probe. Small but non-zero — /// startup hitting a 5-second site hiccup shouldn't fail the operator /// with "PHPSESSID expired" when the session is actually fine. @@ -273,6 +321,73 @@ mod tests { assert_eq!(classify_probe(""), SessionProbe::Transient); } + #[test] + fn classify_chapter_probe_ok_when_reader_rendered() { + let html = r#" + <html><body> + <span id="avatar_menu"></span> + <a id="pic_container" href="/chapter/2"> + <img src="/pages/1.jpg" /></a> + </body></html> + "#; + assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok); + } + + #[test] + fn classify_chapter_probe_unauthenticated_when_no_reader_and_no_avatar() { + // What a logged-out hit on a chapter URL renders: a normal + // site layout (header etc.) with a "please log in" body, but + // no reader and no avatar widget. + let html = r#" + +
+
Please log in to read this chapter.
+ + "#; + assert_eq!( + classify_chapter_probe(html), + ChapterProbe::Unauthenticated + ); + } + + #[test] + fn classify_chapter_probe_transient_when_logged_in_but_reader_missing() { + // Avatar shows the session is still valid; reader didn't + // render — site is serving an interstitial or the layout + // momentarily shifted. Retry, don't blame the session. + let html = r#" + <html><body> + <div id="avatar_menu"></div> + <div class="notice">Site maintenance — back in 5 minutes.</div> + </body></html> + "#; + assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient); + } + + #[test] + fn classify_chapter_probe_transient_on_broken_page_body() { + let html = + "

we're sorry, the request file are not found.

"; + assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient); + } + + #[test] + fn classify_chapter_probe_does_not_misfire_on_avatar_alone_without_reader() { + // Regression for the original bug: the binary + // find_element("#avatar_menu") check treated "no avatar" as + // session-expired even when a transient hiccup was the real + // cause. classify_chapter_probe must NOT trip on that pattern + // when pic_container *is* present. + let html = r#" + <html><body> + <a id="pic_container" href="/chapter/2"> + <img src="/pages/1.jpg" /> + </a> + </body></html> + "#; + assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok); + } + #[test] fn classify_probe_trusts_broken_body_over_stray_avatar_match() { // Defensive: if a broken-page body somehow contains an diff --git a/frontend/package.json b/frontend/package.json index 159dad9..72a32cd 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "mangalord-frontend", - "version": "0.34.0", + "version": "0.34.1", "private": true, "type": "module", "scripts": {