bugfix: security & correctness bundle (0.34.1)

Five fixes bundled into one release:

- preserve user-attached tags across crawler upserts
  (repo::crawler::sync_tags now scopes to added_by IS NULL; orphaned
  attachments from deleted users are reaped as crawler-owned)
- gate manga PATCH and cover endpoints on uploaded_by (require_can_edit
  in api::mangas; non-NULL uploaded_by must match the caller)
- equalise login response time across user-existence branches
  (run argon2 against a OnceLock-cached dummy hash on the no-user
  branch so timing doesn't leak username existence)
- crawler download defences (SSRF allowlist of host literals
  including IPv4-mapped IPv6 ranges, 32 MiB streamed size cap,
  reject non-whitelisted image types, three-way chapter-probe
  classifier replaces the binary #avatar_menu check)
- tighten validation and clean up dead unload path
  (attach_tag + create_token enforce 64-char caps; LocalStorage
  rejects NUL bytes explicitly; reader flushFinalProgress drops
  the always-405 sendBeacon path)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-28 20:24:51 +02:00
parent c5c1179e9d
commit 8d34132883
25 changed files with 1399 additions and 88 deletions

View File

@@ -5,6 +5,7 @@ use chrono::NaiveTime;
use chrono_tz::Tz;
use crate::crawler::browser::LaunchOptions;
use crate::crawler::safety::{DownloadAllowlist, DEFAULT_MAX_IMAGE_BYTES};
use crate::crawler::source::DiscoverMode;
/// What `CRAWLER_MODE` was set to. `Auto` is the daemon's default —
@@ -93,6 +94,13 @@ pub struct CrawlerConfig {
/// `stop_after_unchanged` threshold supplied to Incremental in both
/// `Auto` (post-seed) and `Explicit(Incremental)` modes.
pub incremental_stop_after: usize,
/// Hosts the crawler is allowed to download images / covers from.
/// Always seeded with the host of `start_url` and (when set) the
/// configured `cdn_host`. Additional hosts can be added via
/// `CRAWLER_DOWNLOAD_ALLOWLIST` (comma-separated).
pub download_allowlist: DownloadAllowlist,
/// Hard upper bound on a single image download. Defaults to 32 MiB.
pub max_image_bytes: usize,
}
impl Default for CrawlerConfig {
@@ -115,6 +123,8 @@ impl Default for CrawlerConfig {
browser: LaunchOptions::headless(),
mode: CrawlerModePref::Auto,
incremental_stop_after: 20,
download_allowlist: DownloadAllowlist::new(),
max_image_bytes: DEFAULT_MAX_IMAGE_BYTES,
}
}
}
@@ -172,6 +182,14 @@ impl CrawlerConfig {
let incremental_stop_after =
env_u64("CRAWLER_INCREMENTAL_STOP_AFTER", 20).max(1) as usize;
let mode = parse_mode_env(incremental_stop_after)?;
let start_url = std::env::var("CRAWLER_START_URL")
.ok()
.filter(|s| !s.trim().is_empty());
let cdn_host = std::env::var("CRAWLER_CDN_HOST")
.ok()
.filter(|s| !s.trim().is_empty());
let download_allowlist =
build_download_allowlist(start_url.as_deref(), cdn_host.as_deref());
Ok(Self {
daemon_enabled: env_bool("CRAWLER_DAEMON", true),
daily_at,
@@ -179,13 +197,9 @@ impl CrawlerConfig {
idle_timeout: Duration::from_secs(env_u64("CRAWLER_IDLE_TIMEOUT_S", 600)),
chapter_workers: env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize,
retention_days: env_u64("CRAWLER_JOB_RETENTION_DAYS", 7) as u32,
start_url: std::env::var("CRAWLER_START_URL")
.ok()
.filter(|s| !s.trim().is_empty()),
start_url,
rate_ms: env_u64("CRAWLER_RATE_MS", 1000),
cdn_host: std::env::var("CRAWLER_CDN_HOST")
.ok()
.filter(|s| !s.trim().is_empty()),
cdn_host,
cdn_rate_ms: env_u64("CRAWLER_CDN_RATE_MS", env_u64("CRAWLER_RATE_MS", 1000)),
phpsessid: std::env::var("CRAWLER_PHPSESSID")
.ok()
@@ -202,10 +216,45 @@ impl CrawlerConfig {
browser: LaunchOptions::from_env(),
mode,
incremental_stop_after,
download_allowlist,
max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES),
})
}
}
/// Build the download allowlist from env. Always includes
/// `CRAWLER_START_URL`'s host (so the crawler can fetch covers from
/// the catalog itself) and `CRAWLER_CDN_HOST` when set. Additional
/// hosts can be supplied via `CRAWLER_DOWNLOAD_ALLOWLIST` (comma-
/// separated). Empty by default — meaning the crawler refuses to
/// download anything when no source is configured, which is the safe
/// fail-closed posture.
fn build_download_allowlist(
start_url: Option<&str>,
cdn_host: Option<&str>,
) -> DownloadAllowlist {
let mut allow = DownloadAllowlist::new();
if let Some(url) = start_url {
if let Ok(parsed) = reqwest::Url::parse(url) {
if let Some(h) = parsed.host_str() {
allow = allow.allow(h);
}
}
}
if let Some(host) = cdn_host {
allow = allow.allow(host);
}
if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") {
for piece in extras.split(',') {
let trimmed = piece.trim();
if !trimmed.is_empty() {
allow = allow.allow(trimmed);
}
}
}
allow
}
/// Parse `CRAWLER_MODE`. Empty/unset → `Auto`. Recognized values are
/// `auto`, `backfill`, and `incremental` (case-insensitive). Anything
/// else is a hard error so a typo can't silently fall through to the