From 8d34132883b441535b25e560d146f7c82bd0eee9 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Thu, 28 May 2026 20:24:51 +0200 Subject: [PATCH] bugfix: security & correctness bundle (0.34.1) Five fixes bundled into one release: - preserve user-attached tags across crawler upserts (repo::crawler::sync_tags now scopes to added_by IS NULL; orphaned attachments from deleted users are reaped as crawler-owned) - gate manga PATCH and cover endpoints on uploaded_by (require_can_edit in api::mangas; non-NULL uploaded_by must match the caller) - equalise login response time across user-existence branches (run argon2 against a OnceLock-cached dummy hash on the no-user branch so timing doesn't leak username existence) - crawler download defences (SSRF allowlist of host literals including IPv4-mapped IPv6 ranges, 32 MiB streamed size cap, reject non-whitelisted image types, three-way chapter-probe classifier replaces the binary #avatar_menu check) - tighten validation and clean up dead unload path (attach_tag + create_token enforce 64-char caps; LocalStorage rejects NUL bytes explicitly; reader flushFinalProgress drops the always-405 sendBeacon path) Co-Authored-By: Claude Opus 4.7 --- .env.example | 8 + backend/Cargo.lock | 18 +- backend/Cargo.toml | 4 +- backend/src/api/auth.rs | 47 +- backend/src/api/mangas.rs | 62 ++- backend/src/app.rs | 13 + backend/src/bin/crawler.rs | 36 ++ backend/src/config.rs | 61 ++- backend/src/crawler/content.rs | 71 ++- backend/src/crawler/mod.rs | 1 + backend/src/crawler/pipeline.rs | 35 +- backend/src/crawler/safety.rs | 486 ++++++++++++++++++ backend/src/crawler/session.rs | 115 +++++ backend/src/repo/crawler.rs | 15 +- backend/src/repo/manga.rs | 14 + backend/src/storage/local.rs | 10 + backend/tests/api_auth.rs | 109 ++++ backend/tests/api_mangas_cover.rs | 50 ++ backend/tests/api_mangas_metadata.rs | 75 +++ backend/tests/api_tags.rs | 25 + backend/tests/crawler_sync.rs | 164 ++++++ frontend/package.json | 2 +- frontend/src/lib/api/auth.test.ts | 5 + frontend/src/lib/api/auth.ts | 9 +- .../[id]/chapter/[chapter_id]/+page.svelte | 52 +- 25 files changed, 1399 insertions(+), 88 deletions(-) create mode 100644 backend/src/crawler/safety.rs diff --git a/.env.example b/.env.example index 7e5487b..07c2184 100644 --- a/.env.example +++ b/.env.example @@ -54,6 +54,14 @@ MAX_REQUEST_BYTES=209715200 # Default 20 MiB. MAX_FILE_BYTES=20971520 +# ----- Crawler download safety ----- +# Hosts the crawler is allowed to fetch images/covers from, in addition +# to CRAWLER_START_URL's host and CRAWLER_CDN_HOST. Comma-separated. +# Defends against SSRF via scraped . +CRAWLER_DOWNLOAD_ALLOWLIST= +# Hard cap on a single image body. Default 32 MiB. +CRAWLER_MAX_IMAGE_BYTES=33554432 + # ----- Frontend ----- # The frontend container runs SvelteKit's Node adapter on :3000 and # proxies /api/* to BACKEND_URL via src/hooks.server.ts. In compose the diff --git a/backend/Cargo.lock b/backend/Cargo.lock index 5642fcc..7bbdd81 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "mangalord" -version = "0.34.0" +version = "0.34.1" dependencies = [ "anyhow", "argon2", @@ -2324,6 +2324,7 @@ dependencies = [ "cookie", "cookie_store", "futures-core", + "futures-util", "http", "http-body", "http-body-util", @@ -2343,12 +2344,14 @@ dependencies = [ "sync_wrapper", "tokio", "tokio-rustls", + "tokio-util", "tower", "tower-http", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", + "wasm-streams", "web-sys", "webpki-roots", ] @@ -3527,6 +3530,19 @@ dependencies = [ "wasmparser", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasmparser" version = "0.244.0" diff --git a/backend/Cargo.toml b/backend/Cargo.toml index c091570..c6640b8 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.34.0" +version = "0.34.1" edition = "2021" default-run = "mangalord" @@ -46,7 +46,7 @@ futures-util = "0.3" bytes = "1" chromiumoxide = { version = "0.7", features = ["tokio-runtime", "_fetcher-rusttls-tokio"], default-features = false } scraper = "0.20" -reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "socks", "cookies"] } +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "socks", "cookies", "stream"] } [dev-dependencies] tempfile = "3" diff --git a/backend/src/api/auth.rs b/backend/src/api/auth.rs index 3dd5cc2..975afe1 100644 --- a/backend/src/api/auth.rs +++ b/backend/src/api/auth.rs @@ -4,6 +4,8 @@ //! expire naturally rather than being explicitly invalidated, so other //! devices keep their existing logins). +use std::sync::OnceLock; + use axum::extract::{Path, State}; use axum::http::StatusCode; use axum::response::IntoResponse; @@ -102,9 +104,15 @@ async fn login( )); } - let user = repo::user::find_by_username(&state.db, username) - .await? - .ok_or(AppError::Unauthenticated)?; + let user = repo::user::find_by_username(&state.db, username).await?; + let Some(user) = user else { + // No such user. Run argon2 against a stable dummy hash so the + // response time matches the wrong-password branch — otherwise + // an attacker can enumerate usernames by timing the no-user + // 401 against the wrong-password 401. + let _ = verify_password(&input.password, dummy_password_hash()); + return Err(AppError::Unauthenticated); + }; if !verify_password(&input.password, &user.password_hash) { return Err(AppError::Unauthenticated); } @@ -113,6 +121,21 @@ async fn login( Ok((StatusCode::OK, jar, Json(AuthResponse { user }))) } +/// Lazily-computed argon2 hash used to equalise login response time +/// across the "no such user" and "wrong password" branches. Computing +/// it once (on the first login of the process) is enough — the hash is +/// never compared against a real password, only used to force argon2 +/// to do the same amount of work it would for a real verify. +fn dummy_password_hash() -> &'static str { + static DUMMY: OnceLock = OnceLock::new(); + DUMMY + .get_or_init(|| { + crate::auth::password::hash_password("login-timing-equaliser") + .expect("hash_password on a fixed input cannot fail") + }) + .as_str() +} + async fn logout( State(state): State, jar: CookieJar, @@ -230,8 +253,24 @@ async fn create_token( Json(input): Json, ) -> AppResult { let name = input.name.trim(); + // Both arms use `ValidationFailed` (422 with field details) to + // match the structured-error shape `attach_tag` returns for the + // same kind of free-form-identifier validation. The other + // /auth/* handlers in this file use `InvalidInput` (400); the + // divergence is pre-existing and would warrant a project-wide + // pass to flip them all if the client side wants uniform per- + // field error rendering. if name.is_empty() { - return Err(AppError::InvalidInput("token name is required".into())); + return Err(AppError::ValidationFailed { + message: "token name is required".into(), + details: serde_json::json!({ "name": "required" }), + }); + } + if name.chars().count() > 64 { + return Err(AppError::ValidationFailed { + message: "token name too long".into(), + details: serde_json::json!({ "name": "max 64 characters" }), + }); } let (raw, hash) = generate_token(); let token = repo::api_token::create(&state.db, user.id, name, &hash).await?; diff --git a/backend/src/api/mangas.rs b/backend/src/api/mangas.rs index f0e21af..17899a3 100644 --- a/backend/src/api/mangas.rs +++ b/backend/src/api/mangas.rs @@ -196,16 +196,14 @@ async fn create( async fn update( State(state): State, - CurrentUser(_user): CurrentUser, + CurrentUser(user): CurrentUser, Path(id): Path, Json(patch): Json, ) -> AppResult> { - // TODO(auth): until uploaders are tracked (Phase 5), any signed-in - // user can edit any manga. Restrict to uploader + admin once that - // column lands. if !repo::manga::exists(&state.db, id).await? { return Err(AppError::NotFound); } + require_can_edit(&state, id, user.id).await?; if let Some(ref status) = patch.status { let trimmed = status.trim(); @@ -269,16 +267,14 @@ async fn update( /// `MangaDetail`. async fn put_cover( State(state): State, - CurrentUser(_user): CurrentUser, + CurrentUser(user): CurrentUser, Path(id): Path, mut multipart: Multipart, ) -> AppResult> { - // TODO(auth): until uploaders are tracked (Phase 5), any signed-in - // user can edit any manga's cover. Restrict to uploader + admin - // once that column lands. if !repo::manga::exists(&state.db, id).await? { return Err(AppError::NotFound); } + require_can_edit(&state, id, user.id).await?; let mut cover: Option = None; while let Some(field) = next_field(&mut multipart).await? { @@ -320,13 +316,13 @@ async fn put_cover( /// with the unchanged detail. async fn delete_cover( State(state): State, - CurrentUser(_user): CurrentUser, + CurrentUser(user): CurrentUser, Path(id): Path, ) -> AppResult> { - // TODO(auth): same caveat as put_cover. if !repo::manga::exists(&state.db, id).await? { return Err(AppError::NotFound); } + require_can_edit(&state, id, user.id).await?; if let Some(key) = repo::manga::get(&state.db, id).await?.cover_image_path { match state.storage.delete(&key).await { Ok(()) | Err(StorageError::NotFound) => {} @@ -348,6 +344,7 @@ async fn attach_tag( Path(id): Path, Json(body): Json, ) -> AppResult<(StatusCode, Json)> { + validate_tag_name(&body.name)?; if !repo::manga::exists(&state.db, id).await? { return Err(AppError::NotFound); } @@ -394,6 +391,27 @@ async fn detach_tag( } } +/// Request-side validation for `POST /mangas/:id/tags` body. Mirrors +/// the repo-level cap in `repo::tag::upsert_by_name` (max 64 chars +/// after trim) but surfaces the failure at the handler boundary with +/// the same envelope shape other validations use. +fn validate_tag_name(name: &str) -> AppResult<()> { + let trimmed = name.trim(); + if trimmed.is_empty() { + return Err(AppError::ValidationFailed { + message: "tag name cannot be empty".into(), + details: json!({ "name": "required" }), + }); + } + if trimmed.chars().count() > 64 { + return Err(AppError::ValidationFailed { + message: "tag name too long".into(), + details: json!({ "name": "max 64 characters" }), + }); + } + Ok(()) +} + fn validate_new_manga(input: &NewManga) -> AppResult<()> { if input.title.trim().is_empty() { return Err(AppError::ValidationFailed { @@ -413,6 +431,30 @@ fn validate_new_manga(input: &NewManga) -> AppResult<()> { Ok(()) } +/// Authorisation gate for manga mutations. The manga is assumed to +/// exist (the caller runs [`repo::manga::exists`] first so a missing id +/// surfaces as `NotFound`, not `Forbidden`). +/// +/// Rule: a non-NULL `uploaded_by` must match the current user. Legacy +/// rows with `uploaded_by IS NULL` (pre-migration-0011) are still +/// editable by any signed-in user — there's nobody to gate on yet, and +/// the historical-data note in 0011 acknowledges the gap. Once an +/// admin role lands the NULL case can flip to admin-only. +/// +/// Returns `Forbidden` (not `NotFound`) on owner mismatch — mangas +/// are listable via `GET /mangas`, so existence isn't a secret and +/// the more accurate 403 is fine. This deliberately differs from +/// `repo::collection::require_owner`, which collapses both states to +/// `NotFound` because collections are private to a user and existence +/// itself is information worth hiding from non-owners. +async fn require_can_edit(state: &AppState, manga_id: Uuid, user_id: Uuid) -> AppResult<()> { + match repo::manga::uploaded_by(&state.db, manga_id).await? { + Some(owner) if owner != user_id => Err(AppError::Forbidden), + // Some(owner) == user_id (good) or None (legacy row, no owner). + _ => Ok(()), + } +} + async fn validate_genre_ids(state: &AppState, ids: &[Uuid]) -> AppResult<()> { if ids.is_empty() { return Ok(()); diff --git a/backend/src/app.rs b/backend/src/app.rs index 17c93ef..9f9241d 100644 --- a/backend/src/app.rs +++ b/backend/src/app.rs @@ -19,6 +19,7 @@ use crate::crawler::daemon::{self, ChapterDispatcher, DaemonConfig, MetadataPass use crate::crawler::jobs::JobPayload; use crate::crawler::pipeline::{self, MetadataStats}; use crate::crawler::rate_limit::HostRateLimiters; +use crate::crawler::safety::DownloadAllowlist; use crate::crawler::session; use crate::crawler::source::{target as target_source, DiscoverMode}; use crate::repo; @@ -153,6 +154,8 @@ async fn spawn_crawler_daemon( start_url: url.clone(), mode_pref: cfg.mode, incremental_stop_after: cfg.incremental_stop_after, + download_allowlist: cfg.download_allowlist.clone(), + max_image_bytes: cfg.max_image_bytes, }); m }); @@ -163,6 +166,8 @@ async fn spawn_crawler_daemon( storage: Arc::clone(&storage), http, rate: Arc::clone(&rate), + download_allowlist: cfg.download_allowlist.clone(), + max_image_bytes: cfg.max_image_bytes, }); // Shared cancellation: daemon shutdown cancels the BrowserManager's @@ -216,6 +221,8 @@ struct RealMetadataPass { start_url: String, mode_pref: CrawlerModePref, incremental_stop_after: usize, + download_allowlist: DownloadAllowlist, + max_image_bytes: usize, } #[async_trait] @@ -238,6 +245,8 @@ impl MetadataPass for RealMetadataPass { 0, false, mode, + &self.download_allowlist, + self.max_image_bytes, ) .await } @@ -293,6 +302,8 @@ struct RealChapterDispatcher { storage: Arc, http: reqwest::Client, rate: Arc, + download_allowlist: DownloadAllowlist, + max_image_bytes: usize, } #[async_trait] @@ -322,6 +333,8 @@ impl ChapterDispatcher for RealChapterDispatcher { manga_id, &source_url, false, + &self.download_allowlist, + self.max_image_bytes, ) .await?; drop(lease); diff --git a/backend/src/bin/crawler.rs b/backend/src/bin/crawler.rs index 58c80e7..346e838 100644 --- a/backend/src/bin/crawler.rs +++ b/backend/src/bin/crawler.rs @@ -229,6 +229,33 @@ async fn run( } let rate = Arc::new(rate); + // SSRF defence: only download from the catalog host + CDN host + // (plus optional CRAWLER_DOWNLOAD_ALLOWLIST extras), and cap + // single-image downloads at CRAWLER_MAX_IMAGE_BYTES bytes. + let mut allowlist = + mangalord::crawler::safety::DownloadAllowlist::new(); + if let Ok(parsed) = reqwest::Url::parse(start_url) { + if let Some(h) = parsed.host_str() { + allowlist = allowlist.allow(h); + } + } + if let Some(host) = cdn_host { + allowlist = allowlist.allow(host); + } + if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") { + for piece in extras.split(',') { + let trimmed = piece.trim(); + if !trimmed.is_empty() { + allowlist = allowlist.allow(trimmed); + } + } + } + let max_image_bytes: usize = std::env::var("CRAWLER_MAX_IMAGE_BYTES") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(mangalord::crawler::safety::DEFAULT_MAX_IMAGE_BYTES); + let allowlist = Arc::new(allowlist); + let stats = pipeline::run_metadata_pass( manager.as_ref(), db, @@ -239,6 +266,8 @@ async fn run( limit, skip_chapters, mode, + allowlist.as_ref(), + max_image_bytes, ) .await?; tracing::info!(?stats, "metadata pass complete"); @@ -253,6 +282,8 @@ async fn run( "target", chapter_workers, force_refetch_chapters, + Arc::clone(&allowlist), + max_image_bytes, ) .await?; } @@ -276,6 +307,8 @@ async fn sync_bookmarked_chapter_content( source_id: &str, workers: usize, force_refetch: bool, + allowlist: Arc, + max_image_bytes: usize, ) -> anyhow::Result<()> { let pending: Vec<(Uuid, Uuid, String)> = sqlx::query_as( r#" @@ -312,6 +345,7 @@ async fn sync_bookmarked_chapter_content( let storage = Arc::clone(&storage); let rate = Arc::clone(&rate); let manager = Arc::clone(&manager); + let allowlist = Arc::clone(&allowlist); let stats = &stats; async move { if session_expired.load(std::sync::atomic::Ordering::Relaxed) { @@ -336,6 +370,8 @@ async fn sync_bookmarked_chapter_content( manga_id, &source_url, force_refetch, + allowlist.as_ref(), + max_image_bytes, ) .await; drop(lease); diff --git a/backend/src/config.rs b/backend/src/config.rs index 7f0180a..dc31006 100644 --- a/backend/src/config.rs +++ b/backend/src/config.rs @@ -5,6 +5,7 @@ use chrono::NaiveTime; use chrono_tz::Tz; use crate::crawler::browser::LaunchOptions; +use crate::crawler::safety::{DownloadAllowlist, DEFAULT_MAX_IMAGE_BYTES}; use crate::crawler::source::DiscoverMode; /// What `CRAWLER_MODE` was set to. `Auto` is the daemon's default — @@ -93,6 +94,13 @@ pub struct CrawlerConfig { /// `stop_after_unchanged` threshold supplied to Incremental in both /// `Auto` (post-seed) and `Explicit(Incremental)` modes. pub incremental_stop_after: usize, + /// Hosts the crawler is allowed to download images / covers from. + /// Always seeded with the host of `start_url` and (when set) the + /// configured `cdn_host`. Additional hosts can be added via + /// `CRAWLER_DOWNLOAD_ALLOWLIST` (comma-separated). + pub download_allowlist: DownloadAllowlist, + /// Hard upper bound on a single image download. Defaults to 32 MiB. + pub max_image_bytes: usize, } impl Default for CrawlerConfig { @@ -115,6 +123,8 @@ impl Default for CrawlerConfig { browser: LaunchOptions::headless(), mode: CrawlerModePref::Auto, incremental_stop_after: 20, + download_allowlist: DownloadAllowlist::new(), + max_image_bytes: DEFAULT_MAX_IMAGE_BYTES, } } } @@ -172,6 +182,14 @@ impl CrawlerConfig { let incremental_stop_after = env_u64("CRAWLER_INCREMENTAL_STOP_AFTER", 20).max(1) as usize; let mode = parse_mode_env(incremental_stop_after)?; + let start_url = std::env::var("CRAWLER_START_URL") + .ok() + .filter(|s| !s.trim().is_empty()); + let cdn_host = std::env::var("CRAWLER_CDN_HOST") + .ok() + .filter(|s| !s.trim().is_empty()); + let download_allowlist = + build_download_allowlist(start_url.as_deref(), cdn_host.as_deref()); Ok(Self { daemon_enabled: env_bool("CRAWLER_DAEMON", true), daily_at, @@ -179,13 +197,9 @@ impl CrawlerConfig { idle_timeout: Duration::from_secs(env_u64("CRAWLER_IDLE_TIMEOUT_S", 600)), chapter_workers: env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize, retention_days: env_u64("CRAWLER_JOB_RETENTION_DAYS", 7) as u32, - start_url: std::env::var("CRAWLER_START_URL") - .ok() - .filter(|s| !s.trim().is_empty()), + start_url, rate_ms: env_u64("CRAWLER_RATE_MS", 1000), - cdn_host: std::env::var("CRAWLER_CDN_HOST") - .ok() - .filter(|s| !s.trim().is_empty()), + cdn_host, cdn_rate_ms: env_u64("CRAWLER_CDN_RATE_MS", env_u64("CRAWLER_RATE_MS", 1000)), phpsessid: std::env::var("CRAWLER_PHPSESSID") .ok() @@ -202,10 +216,45 @@ impl CrawlerConfig { browser: LaunchOptions::from_env(), mode, incremental_stop_after, + download_allowlist, + max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES), }) } } +/// Build the download allowlist from env. Always includes +/// `CRAWLER_START_URL`'s host (so the crawler can fetch covers from +/// the catalog itself) and `CRAWLER_CDN_HOST` when set. Additional +/// hosts can be supplied via `CRAWLER_DOWNLOAD_ALLOWLIST` (comma- +/// separated). Empty by default — meaning the crawler refuses to +/// download anything when no source is configured, which is the safe +/// fail-closed posture. +fn build_download_allowlist( + start_url: Option<&str>, + cdn_host: Option<&str>, +) -> DownloadAllowlist { + let mut allow = DownloadAllowlist::new(); + if let Some(url) = start_url { + if let Ok(parsed) = reqwest::Url::parse(url) { + if let Some(h) = parsed.host_str() { + allow = allow.allow(h); + } + } + } + if let Some(host) = cdn_host { + allow = allow.allow(host); + } + if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") { + for piece in extras.split(',') { + let trimmed = piece.trim(); + if !trimmed.is_empty() { + allow = allow.allow(trimmed); + } + } + } + allow +} + /// Parse `CRAWLER_MODE`. Empty/unset → `Auto`. Recognized values are /// `auto`, `backfill`, and `incremental` (case-insensitive). Anything /// else is a hard error so a typo can't silently fall through to the diff --git a/backend/src/crawler/content.rs b/backend/src/crawler/content.rs index 71fbfe3..1d5706a 100644 --- a/backend/src/crawler/content.rs +++ b/backend/src/crawler/content.rs @@ -18,7 +18,8 @@ use uuid::Uuid; use crate::crawler::detect::PageError; use crate::crawler::rate_limit::HostRateLimiters; -use crate::crawler::session; +use crate::crawler::safety::{fetch_bytes_capped, looks_like_image, DownloadAllowlist}; +use crate::crawler::session::{self, ChapterProbe}; use crate::storage::Storage; /// Parse the chapter page DOM and return the page images in `pageN` @@ -88,6 +89,8 @@ pub async fn sync_chapter_content( manga_id: Uuid, source_url: &str, force_refetch: bool, + allowlist: &DownloadAllowlist, + max_image_bytes: usize, ) -> anyhow::Result { // Skip if already fetched, unless caller explicitly forces. if !force_refetch { @@ -110,16 +113,28 @@ pub async fn sync_chapter_content( .with_context(|| format!("open chapter page {source_url}"))?; page.wait_for_navigation().await.context("wait for chapter nav")?; - // Session probe: avatar present == still logged in. Missing means - // PHPSESSID expired; bail the entire crawler run. - if page.find_element("#avatar_menu").await.is_err() { - page.close().await.ok(); - return Ok(SyncOutcome::SessionExpired); - } - let html = page.content().await.context("read chapter html")?; page.close().await.ok(); + // Three-way session classification: distinguishes a transient + // hiccup (broken-page body or logged-in-but-no-reader) from a + // genuine PHPSESSID expiry (no reader and no avatar widget). The + // earlier binary `#avatar_menu` check conflated both and froze + // every worker on a layout shift. + match session::classify_chapter_probe(&html) { + ChapterProbe::Unauthenticated => return Ok(SyncOutcome::SessionExpired), + ChapterProbe::Transient => { + // Surface as a typed Err so the dispatcher path runs + // ack_failed with exponential backoff (rather than the + // session-expired sticky flag). + anyhow::bail!( + "chapter page at {source_url} returned a transient response \ + (broken-page body or reader didn't render); will retry" + ); + } + ChapterProbe::Ok => {} + } + let images = parse_chapter_pages(&html) .with_context(|| format!("parse chapter pages at {source_url}"))?; if images.is_empty() { @@ -138,18 +153,29 @@ pub async fn sync_chapter_content( format!("join image URL {} onto {source_url}", img.url) })?; rate.wait_for(url.as_str()).await?; - let resp = http - .get(url.clone()) - // Source CDNs commonly check Referer. Set it to the - // chapter page — matches what the browser would send. - .header(reqwest::header::REFERER, source_url) - .send() - .await - .with_context(|| format!("GET {url}"))? - .error_for_status() - .with_context(|| format!("non-2xx for {url}"))?; - let bytes = resp.bytes().await.context("read image body")?.to_vec(); - let ext = infer::get(&bytes).map(|k| k.extension()).unwrap_or("bin"); + let bytes = fetch_bytes_capped( + http, + url.as_str(), + Some(source_url), + allowlist, + max_image_bytes, + ) + .await? + .to_vec(); + // Reject any non-image response: the only valid output of an + // image URL is an image. `infer` returns None on truncated + // bytes too, which also wants to be a failure not a silent + // `.bin` extension. + if !looks_like_image(&bytes) { + anyhow::bail!( + "image URL {url} returned non-image bytes \ + (first 16: {:?}); refusing to store as binary blob", + &bytes.get(..16.min(bytes.len())) + ); + } + let ext = infer::get(&bytes) + .map(|k| k.extension()) + .expect("looks_like_image asserted infer succeeded"); fetched.push((img.page_number, bytes, ext)); } @@ -194,8 +220,9 @@ pub async fn sync_chapter_content( Ok(SyncOutcome::Fetched { pages: fetched.len() }) } -// Suppress unused-import warning for `session` until the bin/crawler -// wiring lands in this branch and uses it through this module. +// Suppress unused-import warning for `session::registrable_domain` +// until the bin/crawler wiring lands in this branch and uses it +// through this module. #[allow(dead_code)] fn _keep_session_in_scope() { let _ = session::registrable_domain; diff --git a/backend/src/crawler/mod.rs b/backend/src/crawler/mod.rs index 07a3b85..3e265df 100644 --- a/backend/src/crawler/mod.rs +++ b/backend/src/crawler/mod.rs @@ -22,6 +22,7 @@ pub mod diff; pub mod jobs; pub mod pipeline; pub mod rate_limit; +pub mod safety; pub mod session; pub mod source; pub mod url_utils; diff --git a/backend/src/crawler/pipeline.rs b/backend/src/crawler/pipeline.rs index 91a3abc..e2b08a9 100644 --- a/backend/src/crawler/pipeline.rs +++ b/backend/src/crawler/pipeline.rs @@ -9,6 +9,7 @@ use uuid::Uuid; use crate::crawler::browser_manager::BrowserManager; use crate::crawler::jobs::{self, EnqueueResult, JobPayload}; use crate::crawler::rate_limit::HostRateLimiters; +use crate::crawler::safety::{fetch_bytes_capped, looks_like_image, DownloadAllowlist}; use crate::crawler::source::target::TargetSource; use crate::crawler::source::{DiscoverMode, FetchContext, Source}; use crate::repo; @@ -62,6 +63,8 @@ pub async fn run_metadata_pass( limit: usize, skip_chapters: bool, mode: DiscoverMode, + allowlist: &DownloadAllowlist, + max_image_bytes: usize, ) -> anyhow::Result { let lease = browser_manager .acquire() @@ -181,6 +184,8 @@ pub async fn run_metadata_pass( &r.url, upsert.manga_id, cover_url, + allowlist, + max_image_bytes, ) .await { @@ -382,6 +387,7 @@ pub struct EnqueueSummary { /// pipeline because the CLI still calls it from its inline chapter-content /// loop; once the worker pool fully replaces that path we can fold this /// into `pipeline` proper. +#[allow(clippy::too_many_arguments)] async fn download_and_store_cover( db: &PgPool, storage: &dyn Storage, @@ -390,6 +396,8 @@ async fn download_and_store_cover( manga_url: &str, manga_id: Uuid, cover_url: &str, + allowlist: &DownloadAllowlist, + max_image_bytes: usize, ) -> anyhow::Result<()> { let absolute = reqwest::Url::parse(manga_url) .context("parse manga URL")? @@ -397,17 +405,22 @@ async fn download_and_store_cover( .context("join cover URL onto manga URL")?; rate.wait_for(absolute.as_str()).await?; - let resp = http - .get(absolute.clone()) - .header(reqwest::header::REFERER, manga_url) - .send() - .await - .with_context(|| format!("GET {absolute}"))? - .error_for_status() - .with_context(|| format!("non-2xx for {absolute}"))?; - let bytes = resp.bytes().await.context("read cover body")?; - let kind = infer::get(&bytes); - let ext = kind.map(|k| k.extension()).unwrap_or("bin"); + let bytes = fetch_bytes_capped( + http, + absolute.as_str(), + Some(manga_url), + allowlist, + max_image_bytes, + ) + .await?; + if !looks_like_image(&bytes) { + anyhow::bail!( + "cover URL {absolute} returned non-image bytes; refusing to store as binary blob" + ); + } + let ext = infer::get(&bytes) + .map(|k| k.extension()) + .expect("looks_like_image asserted infer succeeded"); let key = format!("mangas/{manga_id}/cover.{ext}"); storage diff --git a/backend/src/crawler/safety.rs b/backend/src/crawler/safety.rs new file mode 100644 index 0000000..d9c62ad --- /dev/null +++ b/backend/src/crawler/safety.rs @@ -0,0 +1,486 @@ +//! Defensive helpers for the image-download paths. +//! +//! Two threats this module addresses: +//! +//! - **SSRF**: a scraped chapter or manga page can embed an absolute +//! ``. The crawler runs inside the +//! backend container with intra-compose access to `postgres:5432` +//! and possibly other internal services; without a host check the +//! crawler would happily probe them. [`is_safe_url`] rejects +//! anything whose host isn't on the operator-configured allowlist, +//! plus any IP literal in RFC1918 / loopback / link-local / unique- +//! local space (including IPv4-mapped IPv6 like `::ffff:127.0.0.1`) +//! as a second defence for the case where an allowlisted hostname's +//! DNS happens to resolve to a literal private address. +//! +//! **DNS rebinding is not covered.** A hostname like `cdn.allowed.com` +//! that *resolves* to `127.0.0.1` via hostile DNS bypasses the IP +//! check entirely — `is_safe_url` only inspects URL strings, not +//! resolved IPs. Mitigating that requires a custom reqwest resolver +//! that filters IPs after DNS, which would mean rebuilding reqwest's +//! connector. The allowlist + good operator DNS hygiene is the +//! realistic mitigation today. +//! +//! - **Unbounded download**: `Response::bytes().await` reads the full +//! body before returning. A malicious source serving a 10 GiB image +//! would fill memory and then disk. [`accumulate_capped`] streams +//! the body chunk-by-chunk into a [`bytes::BytesMut`] and bails as +//! soon as the running total exceeds the cap. +//! +//! Both helpers are pure-data: the SSRF check is keyed off a parsed +//! URL string, and the byte accumulator is keyed off a generic stream. +//! Easy to unit-test without a live network or browser. + +use std::net::IpAddr; + +use anyhow::{bail, Context}; +use bytes::BytesMut; +use futures_util::StreamExt; +use reqwest::Url; + +/// Default per-image download cap. A page image is generally <2 MiB; +/// 32 MiB leaves headroom for high-resolution covers while still +/// stopping a misbehaving CDN dead. Override via `CRAWLER_MAX_IMAGE_BYTES`. +pub const DEFAULT_MAX_IMAGE_BYTES: usize = 32 * 1024 * 1024; + +/// Hosts that are always allowed in addition to the operator's +/// configured allowlist. None by default — keeping the surface area +/// minimal so the only way a URL gets through is if it matches an +/// explicit catalog/CDN entry. +#[derive(Clone, Debug, Default)] +pub struct DownloadAllowlist { + hosts: Vec, +} + +impl DownloadAllowlist { + pub fn new() -> Self { + Self { hosts: Vec::new() } + } + + /// Add a host (case-insensitive match). Sub-domains are *not* + /// implied: pass `cdn.example.com` and `example.com` separately + /// if both should be reachable. + pub fn allow(mut self, host: impl Into) -> Self { + let h = host.into().to_ascii_lowercase(); + if !h.is_empty() && !self.hosts.iter().any(|existing| existing == &h) { + self.hosts.push(h); + } + self + } + + pub fn is_empty(&self) -> bool { + self.hosts.is_empty() + } + + pub fn contains(&self, host: &str) -> bool { + let lower = host.to_ascii_lowercase(); + self.hosts.iter().any(|h| h == &lower) + } +} + +/// Verify a URL is safe for the crawler to fetch. +/// +/// Rejects: +/// - non-http(s) schemes (file://, gopher://, …), +/// - any IP literal in private / loopback / link-local / unique-local +/// space (defense in depth — a DNS allowlist alone wouldn't cover an +/// attacker that places an entry like `cdn.evil` pointing at +/// `192.168.1.1`), +/// - the literal hostname `localhost`, +/// - hosts that aren't on the supplied allowlist. +/// +/// An empty allowlist rejects everything (the conservative default — +/// callers must explicitly allow the catalog and CDN hosts). +pub fn is_safe_url(raw_url: &str, allow: &DownloadAllowlist) -> Result<(), UrlSafetyError> { + let url = Url::parse(raw_url).map_err(|_| UrlSafetyError::Unparseable)?; + let scheme = url.scheme(); + if scheme != "http" && scheme != "https" { + return Err(UrlSafetyError::BadScheme(scheme.to_string())); + } + let host = url.host_str().ok_or(UrlSafetyError::NoHost)?; + let lower_host = host.to_ascii_lowercase(); + if lower_host == "localhost" { + return Err(UrlSafetyError::Loopback); + } + // Reject IP literals in private/loopback ranges regardless of the + // allowlist — if someone puts an IP literal on the allowlist they + // almost certainly didn't mean a private range. + // reqwest::Url normalises IPv6 literals as `[::1]` (brackets + // included) in `host_str()`. Strip the brackets before parsing. + let ip_candidate = lower_host + .strip_prefix('[') + .and_then(|s| s.strip_suffix(']')) + .unwrap_or(&lower_host); + if let Ok(ip) = ip_candidate.parse::() { + if is_private_ip(&ip) { + return Err(UrlSafetyError::PrivateIp(ip)); + } + } + if !allow.contains(&lower_host) { + return Err(UrlSafetyError::HostNotAllowed(lower_host)); + } + Ok(()) +} + +fn is_private_ip(ip: &IpAddr) -> bool { + match ip { + IpAddr::V4(v4) => { + v4.is_loopback() + || v4.is_private() + || v4.is_link_local() + || v4.is_unspecified() + || v4.is_broadcast() + // CGNAT 100.64.0.0/10 + || (v4.octets()[0] == 100 && (v4.octets()[1] & 0xC0) == 64) + // 169.254/16 link-local already covered, but 0.0.0.0/8 is special-use + || v4.octets()[0] == 0 + } + IpAddr::V6(v6) => { + // IPv4-mapped IPv6 (::ffff:0:0/96): unwrap to the embedded + // IPv4 and recurse so `::ffff:127.0.0.1` is caught by the + // IPv4 loopback check rather than passing through. + // `Ipv6Addr::is_loopback()` only matches `::1` exactly. + if let Some(v4) = v6.to_ipv4_mapped() { + return is_private_ip(&IpAddr::V4(v4)); + } + v6.is_loopback() + || v6.is_unspecified() + // fc00::/7 unique-local + || (v6.segments()[0] & 0xfe00) == 0xfc00 + // fe80::/10 link-local + || (v6.segments()[0] & 0xffc0) == 0xfe80 + } + } +} + +#[derive(Debug, thiserror::Error, PartialEq, Eq)] +pub enum UrlSafetyError { + #[error("URL is not parseable")] + Unparseable, + #[error("scheme {0:?} is not http or https")] + BadScheme(String), + #[error("URL is missing a host")] + NoHost, + #[error("host points at the loopback interface")] + Loopback, + #[error("host is a private/internal IP: {0}")] + PrivateIp(IpAddr), + #[error("host {0:?} is not on the crawler download allowlist")] + HostNotAllowed(String), +} + +/// Drain a byte stream into a single buffer, bailing out as soon as +/// the running total exceeds `max_bytes`. Generic over the stream so +/// it's testable without a live HTTP response. +pub async fn accumulate_capped(stream: S, max_bytes: usize) -> anyhow::Result +where + S: futures_core::Stream>, + E: std::error::Error + Send + Sync + 'static, +{ + let mut buf = BytesMut::new(); + let mut stream = std::pin::pin!(stream); + while let Some(chunk) = stream.next().await { + let chunk = chunk.map_err(|e| anyhow::anyhow!("stream chunk: {e}"))?; + if buf.len().saturating_add(chunk.len()) > max_bytes { + bail!( + "response exceeds {max_bytes}-byte cap (received >{}+{})", + buf.len(), + chunk.len() + ); + } + buf.extend_from_slice(&chunk); + } + Ok(buf.freeze()) +} + +/// Send `req` and stream the response into a length-limited buffer. +/// Combines [`is_safe_url`] check + [`accumulate_capped`] so each +/// call-site is one line. +pub async fn fetch_bytes_capped( + http: &reqwest::Client, + url: &str, + referer: Option<&str>, + allow: &DownloadAllowlist, + max_bytes: usize, +) -> anyhow::Result { + is_safe_url(url, allow).with_context(|| format!("reject unsafe URL {url}"))?; + let mut req = http.get(url); + if let Some(r) = referer { + req = req.header(reqwest::header::REFERER, r); + } + let resp = req + .send() + .await + .with_context(|| format!("GET {url}"))? + .error_for_status() + .with_context(|| format!("non-2xx for {url}"))?; + accumulate_capped(resp.bytes_stream(), max_bytes) + .await + .with_context(|| format!("download body for {url}")) +} + +/// True when `bytes` sniffs as one of the *renderable* image formats +/// the `/files/*key` endpoint can serve with a correct Content-Type: +/// JPEG, PNG, WebP, GIF, AVIF. Matches the upload pipeline's +/// whitelist in `upload::parse_image`. +/// +/// `infer::MatcherType::Image` is intentionally NOT used — it also +/// matches BMP, TIFF, HEIF, ICO, PSD, and JP2. Those would sniff as +/// "image" here but [`api::files::content_type_for`] would fall back +/// to `application/octet-stream`, prompting browsers to download +/// instead of render. Keep the two layers aligned. +pub fn looks_like_image(bytes: &[u8]) -> bool { + matches!( + infer::get(bytes).map(|k| k.mime_type()), + Some("image/jpeg" | "image/png" | "image/webp" | "image/gif" | "image/avif") + ) +} + +#[cfg(test)] +mod tests { + use super::*; + use futures_util::stream; + + fn allow_just(host: &str) -> DownloadAllowlist { + DownloadAllowlist::new().allow(host) + } + + #[test] + fn safe_url_allows_listed_host() { + let allow = allow_just("cdn.example.com"); + assert!(is_safe_url("https://cdn.example.com/img.jpg", &allow).is_ok()); + } + + #[test] + fn safe_url_blocks_unlisted_host() { + let allow = allow_just("cdn.example.com"); + let err = is_safe_url("https://evil.example.org/img.jpg", &allow).unwrap_err(); + assert!(matches!(err, UrlSafetyError::HostNotAllowed(h) if h == "evil.example.org")); + } + + #[test] + fn safe_url_blocks_localhost_even_if_allowlisted() { + let allow = allow_just("localhost"); + assert!(matches!( + is_safe_url("http://localhost:8080/", &allow).unwrap_err(), + UrlSafetyError::Loopback + )); + } + + #[test] + fn safe_url_blocks_loopback_ipv4() { + let allow = allow_just("127.0.0.1"); + assert!(matches!( + is_safe_url("http://127.0.0.1/", &allow).unwrap_err(), + UrlSafetyError::PrivateIp(_) + )); + } + + #[test] + fn safe_url_blocks_rfc1918() { + let allow = allow_just("10.0.0.1"); + for url in [ + "http://10.0.0.1/", + "http://192.168.1.1/", + "http://172.16.0.5/", + "http://172.31.255.255/", + ] { + assert!( + matches!( + is_safe_url(url, &allow).unwrap_err(), + UrlSafetyError::PrivateIp(_) + ), + "should reject {url}" + ); + } + } + + #[test] + fn safe_url_blocks_link_local() { + let allow = allow_just("169.254.169.254"); + // 169.254.169.254 is the AWS/GCP metadata service — the most + // dangerous SSRF target on a default cloud VM. + assert!(matches!( + is_safe_url("http://169.254.169.254/", &allow).unwrap_err(), + UrlSafetyError::PrivateIp(_) + )); + } + + #[test] + fn safe_url_blocks_ipv6_loopback_and_ula() { + // Debug what host_str returns first — reqwest::Url normalises + // IPv6 literals as `[::1]` with brackets, which doesn't parse + // as `IpAddr` directly. The implementation strips them. + let allow = allow_just("[::1]"); + let err = is_safe_url("http://[::1]/", &allow).unwrap_err(); + assert!( + matches!(err, UrlSafetyError::PrivateIp(_)), + "expected PrivateIp, got {err:?}" + ); + let allow = allow_just("[fd00::1]"); + let err = is_safe_url("http://[fd00::1]/", &allow).unwrap_err(); + assert!( + matches!(err, UrlSafetyError::PrivateIp(_)), + "expected PrivateIp, got {err:?}" + ); + } + + #[test] + fn safe_url_blocks_ipv4_mapped_ipv6_loopback() { + // `Ipv6Addr::is_loopback()` only matches `::1` exactly, so + // `::ffff:127.0.0.1` would slip through without the + // to_ipv4_mapped() unwrap in is_private_ip. + let allow = allow_just("[::ffff:127.0.0.1]"); + let err = is_safe_url("http://[::ffff:127.0.0.1]/", &allow).unwrap_err(); + assert!( + matches!(err, UrlSafetyError::PrivateIp(_)), + "expected PrivateIp, got {err:?}" + ); + } + + #[test] + fn safe_url_blocks_ipv4_mapped_ipv6_rfc1918() { + let allow = allow_just("[::ffff:10.0.0.1]"); + let err = is_safe_url("http://[::ffff:10.0.0.1]/", &allow).unwrap_err(); + assert!(matches!(err, UrlSafetyError::PrivateIp(_))); + } + + #[test] + fn safe_url_blocks_non_http_schemes() { + let allow = allow_just("anywhere"); + assert!(matches!( + is_safe_url("file:///etc/passwd", &allow).unwrap_err(), + UrlSafetyError::BadScheme(_) + )); + assert!(matches!( + is_safe_url("gopher://anywhere:70/", &allow).unwrap_err(), + UrlSafetyError::BadScheme(_) + )); + } + + #[test] + fn safe_url_rejects_unparseable() { + let allow = allow_just("anywhere"); + assert!(matches!( + is_safe_url("not a url", &allow).unwrap_err(), + UrlSafetyError::Unparseable + )); + } + + #[test] + fn safe_url_empty_allowlist_rejects_everything() { + let allow = DownloadAllowlist::new(); + let err = is_safe_url("https://cdn.example.com/img.jpg", &allow).unwrap_err(); + assert!(matches!(err, UrlSafetyError::HostNotAllowed(_))); + } + + #[test] + fn allowlist_matches_case_insensitively() { + let allow = DownloadAllowlist::new().allow("CDN.Example.COM"); + assert!(is_safe_url("https://cdn.example.com/x.jpg", &allow).is_ok()); + assert!(is_safe_url("https://CDN.EXAMPLE.com/x.jpg", &allow).is_ok()); + } + + #[tokio::test] + async fn accumulate_capped_returns_full_body_under_cap() { + let chunks: Vec> = vec![ + Ok(bytes::Bytes::from_static(b"hello ")), + Ok(bytes::Bytes::from_static(b"world")), + ]; + let s = stream::iter(chunks); + let out = accumulate_capped(s, 100).await.unwrap(); + assert_eq!(out.as_ref(), b"hello world"); + } + + #[tokio::test] + async fn accumulate_capped_bails_past_cap() { + let chunks: Vec> = vec![ + Ok(bytes::Bytes::from(vec![0u8; 50])), + Ok(bytes::Bytes::from(vec![0u8; 60])), + ]; + let s = stream::iter(chunks); + let err = accumulate_capped(s, 100).await.unwrap_err(); + assert!(err.to_string().contains("100-byte cap")); + } + + #[tokio::test] + async fn accumulate_capped_surfaces_stream_errors() { + let chunks: Vec> = vec![ + Ok(bytes::Bytes::from_static(b"ok")), + Err(std::io::Error::other("network blip")), + ]; + let s = stream::iter(chunks); + let err = accumulate_capped(s, 100).await.unwrap_err(); + assert!(err.to_string().contains("network blip")); + } + + #[test] + fn looks_like_image_accepts_jpeg() { + // JPEG SOI + APP0 segment. + let jpeg = [0xff, 0xd8, 0xff, 0xe0, 0, 0x10, b'J', b'F', b'I', b'F']; + assert!(looks_like_image(&jpeg)); + } + + #[test] + fn looks_like_image_accepts_png() { + let png = [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0, 0, 0, 0]; + assert!(looks_like_image(&png)); + } + + #[test] + fn looks_like_image_rejects_html_disguised_as_image() { + let html = b"not an image"; + assert!(!looks_like_image(html)); + } + + #[test] + fn looks_like_image_rejects_empty() { + assert!(!looks_like_image(&[])); + } + + #[test] + fn looks_like_image_rejects_renderable_but_unsupported_formats() { + // BMP, TIFF, ICO, PSD are `infer::MatcherType::Image` but the + // /files/*key handler doesn't have Content-Type mappings for + // them, so they'd be served as application/octet-stream and + // download instead of render. Reject at the crawler so we + // never land them in storage. + // BMP magic: "BM" + 4-byte size. + let bmp = [b'B', b'M', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + assert!(!looks_like_image(&bmp), "BMP must be rejected (not renderable by /files)"); + + // TIFF little-endian magic: "II" + 42. + let tiff = [0x49, 0x49, 0x2a, 0x00, 0, 0, 0, 0]; + assert!(!looks_like_image(&tiff), "TIFF must be rejected"); + + // ICO magic: 0x00,0x00,0x01,0x00. + let ico = [0x00, 0x00, 0x01, 0x00, 1, 0, 16, 16, 0, 0, 1, 0, 0x18, 0, 0x40, 0, 0, 0, 0x16, 0, 0, 0]; + assert!(!looks_like_image(&ico), "ICO must be rejected"); + } + + #[test] + fn looks_like_image_accepts_webp_gif_avif() { + // Cover the three remaining whitelisted formats so a future + // tightening that drops one would fail noisily. + let webp = [ + b'R', b'I', b'F', b'F', + 0, 0, 0, 0, + b'W', b'E', b'B', b'P', + b'V', b'P', b'8', b' ', + ]; + assert!(looks_like_image(&webp)); + + let gif = [b'G', b'I', b'F', b'8', b'7', b'a', 0, 0, 0, 0]; + assert!(looks_like_image(&gif)); + + let avif = [ + 0x00, 0x00, 0x00, 0x18, + b'f', b't', b'y', b'p', + b'a', b'v', b'i', b'f', + 0x00, 0x00, 0x00, 0x00, + b'm', b'i', b'f', b'1', + b'a', b'v', b'i', b'f', + ]; + assert!(looks_like_image(&avif)); + } +} diff --git a/backend/src/crawler/session.rs b/backend/src/crawler/session.rs index 321363e..1782c83 100644 --- a/backend/src/crawler/session.rs +++ b/backend/src/crawler/session.rs @@ -100,6 +100,54 @@ pub fn classify_probe(html: &str) -> SessionProbe { } } +/// Three-way classification of a chapter page response. +/// +/// Reader pages don't render `#logo`, so [`classify_probe`] can't be +/// reused as-is. The chapter-specific marker is `a#pic_container` +/// (asserted by the reader-page parser at `parse_chapter_pages`). +/// +/// Order matters: broken-page body wins over selector matches, so a +/// transient site-wide 5xx that happens to render the avatar widget +/// elsewhere doesn't falsely reach `Ok`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ChapterProbe { + /// `a#pic_container` present — reader rendered. Whether + /// `#avatar_menu` is also there is informational; if the reader + /// loaded the session is by definition still good. + Ok, + /// Site rendered a "logged out" or "please log in" page (no + /// reader, no broken-page body, and no avatar widget either). + /// Distinguishes the genuine expired-session case from a + /// transient site hiccup. + Unauthenticated, + /// Broken-page body, or reader didn't render but the user is + /// still logged in (avatar widget present). Caller should retry + /// rather than blame the session. + Transient, +} + +pub fn classify_chapter_probe(html: &str) -> ChapterProbe { + if is_broken_page_body(html) { + return ChapterProbe::Transient; + } + let doc = scraper::Html::parse_document(html); + let container = scraper::Selector::parse("a#pic_container").unwrap(); + if doc.select(&container).next().is_some() { + return ChapterProbe::Ok; + } + let avatar = scraper::Selector::parse("#avatar_menu").unwrap(); + if doc.select(&avatar).next().is_some() { + // Logged-in user, but the reader didn't render — most likely + // the layout shifted or the site is serving an interstitial. + ChapterProbe::Transient + } else { + // No reader, no avatar, no broken-body marker — site rendered + // the "please log in" page, which is the genuine session- + // expired signal on this route. + ChapterProbe::Unauthenticated + } +} + /// In-startup retry budget for the session probe. Small but non-zero — /// startup hitting a 5-second site hiccup shouldn't fail the operator /// with "PHPSESSID expired" when the session is actually fine. @@ -210,6 +258,73 @@ mod tests { assert_eq!(classify_probe(""), SessionProbe::Transient); } + #[test] + fn classify_chapter_probe_ok_when_reader_rendered() { + let html = r#" + + + + + + "#; + assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok); + } + + #[test] + fn classify_chapter_probe_unauthenticated_when_no_reader_and_no_avatar() { + // What a logged-out hit on a chapter URL renders: a normal + // site layout (header etc.) with a "please log in" body, but + // no reader and no avatar widget. + let html = r#" + +
+
Please log in to read this chapter.
+ + "#; + assert_eq!( + classify_chapter_probe(html), + ChapterProbe::Unauthenticated + ); + } + + #[test] + fn classify_chapter_probe_transient_when_logged_in_but_reader_missing() { + // Avatar shows the session is still valid; reader didn't + // render — site is serving an interstitial or the layout + // momentarily shifted. Retry, don't blame the session. + let html = r#" + +
+
Site maintenance — back in 5 minutes.
+ + "#; + assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient); + } + + #[test] + fn classify_chapter_probe_transient_on_broken_page_body() { + let html = + "

we're sorry, the request file are not found.

"; + assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient); + } + + #[test] + fn classify_chapter_probe_does_not_misfire_on_avatar_alone_without_reader() { + // Regression for the original bug: the binary + // find_element("#avatar_menu") check treated "no avatar" as + // session-expired even when a transient hiccup was the real + // cause. classify_chapter_probe must NOT trip on that pattern + // when pic_container *is* present. + let html = r#" + + + + + + "#; + assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok); + } + #[test] fn classify_probe_trusts_broken_body_over_stray_avatar_match() { // Defensive: if a broken-page body somehow contains an diff --git a/backend/src/repo/crawler.rs b/backend/src/repo/crawler.rs index eb83877..eeb5bfd 100644 --- a/backend/src/repo/crawler.rs +++ b/backend/src/repo/crawler.rs @@ -274,7 +274,20 @@ async fn sync_tags( manga_id: Uuid, tags: &[String], ) -> sqlx::Result<()> { - sqlx::query("DELETE FROM manga_tags WHERE manga_id = $1") + // Only clear crawler-owned attachments (added_by IS NULL). User- + // attached tags are owned by the attaching user and must survive + // the recurring metadata pass — see manga_tags.added_by in + // migration 0009. + // + // Note on orphans: `manga_tags.added_by` is `ON DELETE SET NULL`, + // so an attachment whose user was deleted becomes + // indistinguishable from a crawler-owned row and is cleaned up + // here. That mirrors how `api::mangas::detach_tag` already treats + // orphans ("nobody owns it, refuse to let anyone but admin clear + // them") — the crawler now becomes the eventual reaper. Tracked + // by `sync_tags_garbage_collects_orphan_user_attachments` in + // backend/tests/crawler_sync.rs. + sqlx::query("DELETE FROM manga_tags WHERE manga_id = $1 AND added_by IS NULL") .bind(manga_id) .execute(&mut **tx) .await?; diff --git a/backend/src/repo/manga.rs b/backend/src/repo/manga.rs index f285940..53d2ace 100644 --- a/backend/src/repo/manga.rs +++ b/backend/src/repo/manga.rs @@ -281,3 +281,17 @@ pub async fn exists(pool: &PgPool, id: Uuid) -> AppResult { .await?; Ok(exists) } + +/// Returns the uploader's user id for a manga. `None` either when the +/// manga doesn't exist or when the row predates the `uploaded_by` +/// column (historical NULL — see migration 0011). Callers must +/// distinguish "manga missing" via [`exists`] before relying on this +/// to make an authz decision. +pub async fn uploaded_by(pool: &PgPool, id: Uuid) -> AppResult> { + let row: Option<(Option,)> = + sqlx::query_as("SELECT uploaded_by FROM mangas WHERE id = $1") + .bind(id) + .fetch_optional(pool) + .await?; + Ok(row.and_then(|(u,)| u)) +} diff --git a/backend/src/storage/local.rs b/backend/src/storage/local.rs index 7e6dd23..77a4fa2 100644 --- a/backend/src/storage/local.rs +++ b/backend/src/storage/local.rs @@ -16,6 +16,13 @@ impl LocalStorage { } fn resolve(&self, key: &str) -> Result { + // NUL bytes are rejected by the Linux syscall layer, but the + // error surfaces as an opaque IO failure rather than the + // explicit `BadKey` the rest of the contract uses. Catch it + // here so the error path is consistent. + if key.contains('\0') { + return Err(StorageError::BadKey); + } let key = key.trim_start_matches('/'); if key.is_empty() { return Err(StorageError::BadKey); @@ -114,6 +121,9 @@ mod tests { assert!(matches!(s.get(".").await, Err(StorageError::BadKey))); // Empty segment via doubled slash. assert!(matches!(s.get("a//b").await, Err(StorageError::BadKey))); + // NUL byte (rejected explicitly so callers see BadKey rather + // than an opaque IO error from the kernel). + assert!(matches!(s.put("a\0b", b"x").await, Err(StorageError::BadKey))); } #[tokio::test] diff --git a/backend/tests/api_auth.rs b/backend/tests/api_auth.rs index 8ded505..827e6c2 100644 --- a/backend/tests/api_auth.rs +++ b/backend/tests/api_auth.rs @@ -567,6 +567,91 @@ async fn user_a_cannot_delete_user_b_token(pool: PgPool) { assert_eq!(resp.status(), StatusCode::NO_CONTENT); } +/// Username enumeration via login response time: an attacker probes +/// for valid usernames by measuring how long /auth/login takes. Before +/// the equalisation fix, the no-user branch returned 401 in <1 ms +/// while the wrong-password branch took ~50-100 ms (the argon2 verify +/// cost). This test asserts the no-user branch now spends at least +/// some meaningful fraction of the wrong-password branch's time. +/// +/// Tolerance is intentionally loose so CI variance doesn't flap the +/// test. The unequalised gap is large enough (~50x) that even a noisy +/// CI run with a 5x slack still catches it. +#[sqlx::test(migrations = "./migrations")] +async fn login_no_user_branch_runs_argon2_for_timing_equalisation(pool: PgPool) { + use std::time::Instant; + + let h = common::harness(pool); + + // Register the victim user so the wrong-password branch has a real + // argon2 hash to verify against. + let _ = h + .app + .clone() + .oneshot(common::post_json( + "/api/v1/auth/register", + json!({ "username": "victim", "password": "hunter2hunter2" }), + )) + .await + .unwrap(); + + // Warm-up: first login of the process initialises the dummy hash + // lazily. Skip that cost when measuring. + let _ = h + .app + .clone() + .oneshot(common::post_json( + "/api/v1/auth/login", + json!({ "username": "victim", "password": "wrong" }), + )) + .await + .unwrap(); + let _ = h + .app + .clone() + .oneshot(common::post_json( + "/api/v1/auth/login", + json!({ "username": "ghost", "password": "wrong" }), + )) + .await + .unwrap(); + + // Median-of-N is more stable than a single sample. + async fn sample_min( + app: &axum::Router, + username: &str, + n: u32, + ) -> std::time::Duration { + let mut samples = Vec::with_capacity(n as usize); + for _ in 0..n { + let req = common::post_json( + "/api/v1/auth/login", + json!({ "username": username, "password": "wrong-guess" }), + ); + let t = Instant::now(); + let resp = app.clone().oneshot(req).await.unwrap(); + let d = t.elapsed(); + assert_eq!(resp.status(), StatusCode::UNAUTHORIZED); + samples.push(d); + } + // Use the minimum: it's the floor that argon2 takes, robust + // against unrelated stalls (DB connection acquisition, etc.). + *samples.iter().min().unwrap() + } + + let wrong_pwd = sample_min(&h.app, "victim", 3).await; + let no_user = sample_min(&h.app, "ghost", 3).await; + + // 5x slack: argon2 dominates both branches, so they should be + // within an order of magnitude. Unequalised, no_user would be + // ~50-100x faster. Asserting "no_user >= wrong_pwd / 5" catches + // the bug without being flaky in CI. + assert!( + no_user * 5 >= wrong_pwd, + "login timing leaks user existence: no_user={no_user:?}, wrong_pwd={wrong_pwd:?}" + ); +} + #[sqlx::test(migrations = "./migrations")] async fn delete_unknown_token_is_404(pool: PgPool) { let h = common::harness(pool); @@ -581,3 +666,27 @@ async fn delete_unknown_token_is_404(pool: PgPool) { .unwrap(); assert_eq!(resp.status(), StatusCode::NOT_FOUND); } + +/// Bot token names are user-supplied free-form strings; a 10 MB name +/// was accepted before. Cap at 64 chars to match the other free-form +/// identifier caps (tags, collection names). The response uses +/// `ValidationFailed` (422 with per-field details) so clients can +/// render the same shape they already handle for `attach_tag`. +#[sqlx::test(migrations = "./migrations")] +async fn create_token_rejects_name_over_64_chars(pool: PgPool) { + let h = common::harness(pool); + let (_, cookie) = common::register_user(&h.app).await; + let resp = h + .app + .oneshot(common::post_json_with_cookie( + "/api/v1/auth/tokens", + json!({ "name": "x".repeat(65) }), + &cookie, + )) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::UNPROCESSABLE_ENTITY); + let body = common::body_json(resp).await; + assert_eq!(body["error"]["code"], "validation_failed"); + assert!(body["error"]["details"]["name"].is_string()); +} diff --git a/backend/tests/api_mangas_cover.rs b/backend/tests/api_mangas_cover.rs index 396eacd..398a3f3 100644 --- a/backend/tests/api_mangas_cover.rs +++ b/backend/tests/api_mangas_cover.rs @@ -410,3 +410,53 @@ async fn delete_cover_404_on_unknown_id(pool: PgPool) { .unwrap(); assert_eq!(resp.status(), StatusCode::NOT_FOUND); } + +/// Authz: PUT /mangas/:id/cover must be uploader-only. +#[sqlx::test(migrations = "./migrations")] +async fn put_cover_forbidden_for_non_uploader(pool: PgPool) { + let h = harness(pool); + let (_, owner_cookie) = register_user(&h.app).await; + let (_, intruder_cookie) = register_user(&h.app).await; + + let manga = + create_manga_with_cover(&h.app, &owner_cookie, "Mine", None).await; + let id = id_of(&manga); + + let resp = h + .app + .oneshot(put_multipart_with_cookie( + &format!("/api/v1/mangas/{id}/cover"), + cover_form(&fake_png_bytes()), + &intruder_cookie, + )) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::FORBIDDEN); +} + +/// Authz: DELETE /mangas/:id/cover must be uploader-only. +#[sqlx::test(migrations = "./migrations")] +async fn delete_cover_forbidden_for_non_uploader(pool: PgPool) { + let h = harness(pool); + let (_, owner_cookie) = register_user(&h.app).await; + let (_, intruder_cookie) = register_user(&h.app).await; + + let manga = create_manga_with_cover( + &h.app, + &owner_cookie, + "Mine", + Some(("image/jpeg", &fake_jpeg_bytes())), + ) + .await; + let id = id_of(&manga); + + let resp = h + .app + .oneshot(delete_with_cookie( + &format!("/api/v1/mangas/{id}/cover"), + &intruder_cookie, + )) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::FORBIDDEN); +} diff --git a/backend/tests/api_mangas_metadata.rs b/backend/tests/api_mangas_metadata.rs index 2797d34..107f9d2 100644 --- a/backend/tests/api_mangas_metadata.rs +++ b/backend/tests/api_mangas_metadata.rs @@ -566,3 +566,78 @@ async fn patch_requires_authentication(pool: PgPool) { .unwrap(); assert_eq!(resp.status(), StatusCode::UNAUTHORIZED); } + +/// A signed-in user who didn't upload the manga must not be able to +/// PATCH it. Without the uploader-gate this returned 200 — see +/// REVIEW.md "manga PATCH / cover endpoints don't check ownership". +#[sqlx::test(migrations = "./migrations")] +async fn patch_forbidden_for_non_uploader(pool: PgPool) { + let h = common::harness(pool); + let (_, owner_cookie) = common::register_user(&h.app).await; + let (_, intruder_cookie) = common::register_user(&h.app).await; + + let created = create_manga(&h.app, &owner_cookie, json!({ "title": "Mine" })).await; + let id = id_of(&created); + + let resp = h + .app + .oneshot(common::patch_json_with_cookie( + &format!("/api/v1/mangas/{id}"), + json!({ "status": "completed" }), + &intruder_cookie, + )) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::FORBIDDEN); +} + +/// Owner can still edit their own manga (regression guard for the +/// authz fix). +#[sqlx::test(migrations = "./migrations")] +async fn patch_allowed_for_uploader(pool: PgPool) { + let h = common::harness(pool); + let (_, cookie) = common::register_user(&h.app).await; + let created = create_manga(&h.app, &cookie, json!({ "title": "Owned" })).await; + let id = id_of(&created); + let resp = h + .app + .oneshot(common::patch_json_with_cookie( + &format!("/api/v1/mangas/{id}"), + json!({ "status": "completed" }), + &cookie, + )) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); +} + +/// Legacy rows with `uploaded_by IS NULL` (created before migration +/// 0011) remain editable by any signed-in user. Without this carve-out +/// the historical-data note in 0011 would be broken. +#[sqlx::test(migrations = "./migrations")] +async fn patch_allowed_on_legacy_null_uploader(pool: PgPool) { + let h = common::harness(pool.clone()); + let (_, cookie) = common::register_user(&h.app).await; + let created = create_manga(&h.app, &cookie, json!({ "title": "Legacy" })).await; + let id = id_of(&created); + + // Simulate a row uploaded before the column existed: clear + // uploaded_by directly via SQL. + sqlx::query("UPDATE mangas SET uploaded_by = NULL WHERE id = $1") + .bind(id) + .execute(&pool) + .await + .unwrap(); + + let (_, other_cookie) = common::register_user(&h.app).await; + let resp = h + .app + .oneshot(common::patch_json_with_cookie( + &format!("/api/v1/mangas/{id}"), + json!({ "status": "completed" }), + &other_cookie, + )) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); +} diff --git a/backend/tests/api_tags.rs b/backend/tests/api_tags.rs index 1ae77b3..c959151 100644 --- a/backend/tests/api_tags.rs +++ b/backend/tests/api_tags.rs @@ -59,6 +59,31 @@ async fn reattach_same_tag_is_idempotent_and_returns_200(pool: PgPool) { assert_eq!(second.status(), StatusCode::OK); } +/// Tag names over 64 chars are rejected at the handler boundary. The +/// repo enforces the same cap, but doing it at the handler keeps the +/// envelope consistent with the other validation paths +/// (username, collection name, etc.). +#[sqlx::test(migrations = "./migrations")] +async fn attach_rejects_tag_name_over_64_chars(pool: PgPool) { + let h = common::harness(pool); + let (_, cookie) = common::register_user(&h.app).await; + let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await; + + let long_name: String = "x".repeat(65); + let resp = h + .app + .oneshot(common::post_json_with_cookie( + &format!("/api/v1/mangas/{manga_id}/tags"), + json!({ "name": long_name }), + &cookie, + )) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::UNPROCESSABLE_ENTITY); + let body = common::body_json(resp).await; + assert_eq!(body["error"]["code"], "validation_failed"); +} + #[sqlx::test(migrations = "./migrations")] async fn tag_names_dedup_case_insensitively(pool: PgPool) { let h = common::harness(pool); diff --git a/backend/tests/crawler_sync.rs b/backend/tests/crawler_sync.rs index 27f6f3b..d6f7df2 100644 --- a/backend/tests/crawler_sync.rs +++ b/backend/tests/crawler_sync.rs @@ -440,6 +440,170 @@ async fn arbitrary_genres_from_source_get_inserted(pool: PgPool) { assert_eq!(webtoons_count.0, 1, "case-insensitive lookup reuses the existing row"); } +/// User-attached tags (rows with non-NULL `added_by` in `manga_tags`) +/// must survive a crawler upsert. The crawler owns source-attached tags +/// (added_by IS NULL); user attachments are owned by the user who made +/// them and the recurring metadata pass must not delete them. +#[sqlx::test(migrations = "./migrations")] +async fn sync_tags_preserves_user_attached_tags(pool: PgPool) { + crawler::ensure_source(&pool, "target", "T", "https://x.example") + .await + .unwrap(); + let m = sample_manga("foo", "Foo Manga", "hash-1"); + let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) + .await + .unwrap(); + + // A real user attaches a personal tag. + let user = mangalord::repo::user::create(&pool, "alice", "phc-stub") + .await + .unwrap(); + let outcome = mangalord::repo::tag::attach_to_manga(&pool, up.manga_id, "personal", user.id) + .await + .unwrap(); + assert!(outcome.created_attachment); + + // Second crawler pass. Use a different metadata_hash so the upsert + // takes the Updated branch, but the bug also fires on Unchanged + // ticks since sync_tags runs unconditionally. + let mut m2 = m.clone(); + m2.metadata_hash = "hash-2".into(); + m2.tags = vec!["popular".into(), "weekly".into()]; + let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m2) + .await + .unwrap(); + + // The user tag must still be attached. + let user_tag_rows: (i64,) = sqlx::query_as( + "SELECT COUNT(*) FROM manga_tags mt \ + JOIN tags t ON t.id = mt.tag_id \ + WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \ + AND mt.added_by = $2", + ) + .bind(up.manga_id) + .bind(user.id) + .fetch_one(&pool) + .await + .unwrap(); + assert_eq!( + user_tag_rows.0, 1, + "user-attached tag must survive a crawler upsert" + ); + + // The source's tags should still attach as well, as crawler-owned. + let source_tag_rows: (i64,) = sqlx::query_as( + "SELECT COUNT(*) FROM manga_tags mt \ + JOIN tags t ON t.id = mt.tag_id \ + WHERE mt.manga_id = $1 \ + AND mt.added_by IS NULL \ + AND lower(t.name) IN ('popular', 'weekly')", + ) + .bind(up.manga_id) + .fetch_one(&pool) + .await + .unwrap(); + assert_eq!(source_tag_rows.0, 2, "source tags re-attach on each pass"); + + // A subsequent pass where the source drops a previously-seen tag + // must clear that crawler-owned attachment (otherwise crawler-tags + // would only ever accumulate). + let mut m3 = m2.clone(); + m3.metadata_hash = "hash-3".into(); + m3.tags = vec!["popular".into()]; + let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m3) + .await + .unwrap(); + let weekly_rows: (i64,) = sqlx::query_as( + "SELECT COUNT(*) FROM manga_tags mt \ + JOIN tags t ON t.id = mt.tag_id \ + WHERE mt.manga_id = $1 AND lower(t.name) = 'weekly'", + ) + .bind(up.manga_id) + .fetch_one(&pool) + .await + .unwrap(); + assert_eq!(weekly_rows.0, 0, "source-owned tag dropped by source goes away"); + + // And the user tag still survives that third pass. + let user_tag_rows: (i64,) = sqlx::query_as( + "SELECT COUNT(*) FROM manga_tags mt \ + JOIN tags t ON t.id = mt.tag_id \ + WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \ + AND mt.added_by = $2", + ) + .bind(up.manga_id) + .bind(user.id) + .fetch_one(&pool) + .await + .unwrap(); + assert_eq!(user_tag_rows.0, 1); +} + +/// `manga_tags.added_by` is `ON DELETE SET NULL` on the user FK. When +/// the attaching user is deleted, their attachments become orphans +/// indistinguishable from crawler-owned rows — and the crawler should +/// reap them on the next pass. Pins the semantic so a future change +/// can't quietly leave orphan rows lying around. +#[sqlx::test(migrations = "./migrations")] +async fn sync_tags_garbage_collects_orphan_user_attachments(pool: PgPool) { + crawler::ensure_source(&pool, "target", "T", "https://x.example") + .await + .unwrap(); + let m = sample_manga("foo", "Foo", "hash-1"); + let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m) + .await + .unwrap(); + + // A user attaches "personal", then the user gets deleted. The + // attachment row stays (manga_tags.manga_id FK is CASCADE on + // mangas only; we never CASCADE-delete user attachments). The FK + // on added_by is `ON DELETE SET NULL`, so the row's owner column + // goes NULL — same shape as a crawler-owned row. + let user = mangalord::repo::user::create(&pool, "bob", "phc-stub") + .await + .unwrap(); + let _ = mangalord::repo::tag::attach_to_manga(&pool, up.manga_id, "personal", user.id) + .await + .unwrap(); + sqlx::query("DELETE FROM users WHERE id = $1") + .bind(user.id) + .execute(&pool) + .await + .unwrap(); + + // Sanity: the orphan still exists post-user-delete with added_by NULL. + let (orphan_rows,): (i64,) = sqlx::query_as( + "SELECT COUNT(*) FROM manga_tags mt \ + JOIN tags t ON t.id = mt.tag_id \ + WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \ + AND mt.added_by IS NULL", + ) + .bind(up.manga_id) + .fetch_one(&pool) + .await + .unwrap(); + assert_eq!(orphan_rows, 1); + + // Next crawler pass — orphan should be reaped along with any + // other source-owned rows that aren't in the new tag list. + let mut m2 = m.clone(); + m2.metadata_hash = "hash-2".into(); + m2.tags = vec!["popular".into()]; + let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m2) + .await + .unwrap(); + let (orphan_rows,): (i64,) = sqlx::query_as( + "SELECT COUNT(*) FROM manga_tags mt \ + JOIN tags t ON t.id = mt.tag_id \ + WHERE mt.manga_id = $1 AND lower(t.name) = 'personal'", + ) + .bind(up.manga_id) + .fetch_one(&pool) + .await + .unwrap(); + assert_eq!(orphan_rows, 0, "orphan user-attached tag should be reaped"); +} + #[sqlx::test(migrations = "./migrations")] async fn re_appearing_manga_clears_dropped_at(pool: PgPool) { crawler::ensure_source(&pool, "target", "T", "https://x.example") diff --git a/frontend/package.json b/frontend/package.json index 159dad9..72a32cd 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "mangalord-frontend", - "version": "0.34.0", + "version": "0.34.1", "private": true, "type": "module", "scripts": { diff --git a/frontend/src/lib/api/auth.test.ts b/frontend/src/lib/api/auth.test.ts index 7826151..5281f93 100644 --- a/frontend/src/lib/api/auth.test.ts +++ b/frontend/src/lib/api/auth.test.ts @@ -94,6 +94,11 @@ describe('auth api client', () => { expect(url).toMatch(/\/v1\/auth\/logout$/); const init = fetchSpy.mock.calls[0][1] as RequestInit; expect(init.method).toBe('POST'); + // Consistent content-type for all mutation requests, matching + // the rest of the module — axum doesn't require it but the + // header keeps the request style uniform. + const headers = new Headers(init.headers); + expect(headers.get('content-type')).toBe('application/json'); }); it('me returns the user on 200', async () => { diff --git a/frontend/src/lib/api/auth.ts b/frontend/src/lib/api/auth.ts index bc9f905..4102325 100644 --- a/frontend/src/lib/api/auth.ts +++ b/frontend/src/lib/api/auth.ts @@ -32,7 +32,14 @@ export async function login(creds: Credentials): Promise { } export async function logout(): Promise { - await request('/v1/auth/logout', { method: 'POST' }); + await request('/v1/auth/logout', { + method: 'POST', + // Consistent with the other POST/PATCH helpers in this module. + // axum doesn't require it (no body), but keeping the header + // on every mutation request avoids the false-flag in logs and + // matches the project's style. + headers: { 'content-type': 'application/json' } + }); } export type ChangePassword = { diff --git a/frontend/src/routes/manga/[id]/chapter/[chapter_id]/+page.svelte b/frontend/src/routes/manga/[id]/chapter/[chapter_id]/+page.svelte index e0334b2..ed81173 100644 --- a/frontend/src/routes/manga/[id]/chapter/[chapter_id]/+page.svelte +++ b/frontend/src/routes/manga/[id]/chapter/[chapter_id]/+page.svelte @@ -350,54 +350,48 @@ }); /** - * `fetch()` initiated during `pagehide` / `beforeunload` is - * cancelled by every browser by default. `sendBeacon` is the - * supported way to ship a small payload during unload — it's - * guaranteed to survive even if the tab is closing. Failure here - * is silent because the API is fire-and-forget. + * Flush read-progress as the tab is closing. A plain `fetch()` + * during `pagehide` / `beforeunload` is cancelled by every + * browser; `fetch(..., { keepalive: true })` is the supported + * escape hatch and survives the close. + * + * `sendBeacon` would be the textbook alternative, but it's + * POST-only and `/me/read-progress` takes PUT — so a beacon + * always 405s, adds server-log noise, then falls through to this + * same keepalive path anyway. The beacon was dropped; the + * keepalive fetch is the only path. */ - function beaconFinalProgress() { + function flushFinalProgress() { if (!session.user) return; const body = JSON.stringify({ manga_id: manga.id, chapter_id: chapter.id, page: progressPage }); - const blob = new Blob([body], { type: 'application/json' }); - // sendBeacon only supports POST — the server's PUT route is - // strict on method. The dedicated POST alias is omitted; in - // practice the in-app navigation path (back-link, chapter - // links) already covers the common-case unmount via the - // onDestroy fetch. Fall through to fetch+keepalive for browser - // implementations that don't honor sendBeacon for this endpoint. try { - const ok = navigator.sendBeacon('/api/v1/me/read-progress', blob); - if (!ok) throw new Error('sendBeacon rejected'); + void fetch('/api/v1/me/read-progress', { + method: 'PUT', + headers: { 'content-type': 'application/json' }, + body, + keepalive: true, + credentials: 'include' + }); } catch { - try { - void fetch('/api/v1/me/read-progress', { - method: 'PUT', - headers: { 'content-type': 'application/json' }, - body, - keepalive: true, - credentials: 'include' - }); - } catch { - // Final fallback failed; the in-app onDestroy flush - // below catches the SPA-navigation case. - } + // keepalive fetch was rejected (very old Firefox etc.); + // the in-app onDestroy flush below catches the SPA- + // navigation case, which is the common one anyway. } } onMount(() => { - window.addEventListener('pagehide', beaconFinalProgress); + window.addEventListener('pagehide', flushFinalProgress); }); onDestroy(() => { observer?.disconnect(); if (progressTimer) clearTimeout(progressTimer); if (typeof window !== 'undefined') { - window.removeEventListener('pagehide', beaconFinalProgress); + window.removeEventListener('pagehide', flushFinalProgress); } // Don't let the fullscreen flag leak to non-reader pages — // otherwise the layout header would stay slid-off on /upload