Compare commits

..

1 Commits

Author SHA1 Message Date
MechaCat02
156d9e427d feat: handle SIGTERM for graceful container stops (0.35.0)
`docker compose stop` and Kubernetes / Podman / systemd all send
SIGTERM first; SIGINT is for interactive shells. Without a SIGTERM
listener the container's stop-grace period elapses with the API still
running, then SIGKILL skips the daemon shutdown path and leaks
Chromium until the OS reaps the parent. Replace the bare
`tokio::signal::ctrl_c()` with a select over ctrl_c and
SignalKind::terminate() so the daemon.shutdown().await path runs in
both cases.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 18:59:33 +02:00
134 changed files with 1303 additions and 18243 deletions

View File

@@ -1,30 +1,20 @@
# Copy to .env for `docker compose up --build`. Local-dev runs (cargo run
# / npm run dev) read backend/.env if present, or pick up the variables
# from your shell.
#
# Production note: COOKIE_SECURE=true (the default below) makes browsers
# refuse to send the session cookie over plain HTTP. Run with a TLS-
# terminating reverse proxy (Caddy, Traefik, nginx) in front — the
# compose file here doesn't ship one. Local/dev runs without HTTPS
# should set COOKIE_SECURE=false.
# ----- Postgres -----
# These are read by the Postgres container *and* by DATABASE_URL below;
# changing them after the first boot won't migrate existing data, so set
# them up front for any new deployment.
#
# POSTGRES_PASSWORD is REQUIRED — docker-compose.yml fails fast if it
# isn't set in this file, to prevent a deploy without an .env booting
# Postgres with a publicly-known credential.
POSTGRES_USER=mangalord
POSTGRES_PASSWORD=change-me-to-a-strong-random-string
POSTGRES_PASSWORD=mangalord
POSTGRES_DB=mangalord
# ----- Backend -----
DATABASE_URL=postgres://mangalord:mangalord@postgres:5432/mangalord
BIND_ADDRESS=0.0.0.0:8080
STORAGE_DIR=/var/lib/mangalord/storage
RUST_LOG=info,mangalord=debug,chromiumoxide::conn=off,chromiumoxide::handler=off
RUST_LOG=info,mangalord=debug
# ----- Auth / cookies -----
# COOKIE_SECURE controls whether the `Secure` flag is set on the session
@@ -39,13 +29,6 @@ COOKIE_DOMAIN=
# get reaped lazily.
SESSION_TTL_DAYS=30
# ----- Auth brute-force rate limits -----
# Token-bucket budget shared across /auth/login, /auth/register, and
# /auth/me/password. Set per_sec=0 to disable (e.g. behind a
# rate-limiting reverse proxy that already enforces a budget).
AUTH_RATE_PER_SEC=5
AUTH_RATE_BURST=10
# ----- CORS -----
# Comma-separated origins allowed to call the API with credentials.
# Default is empty: same-origin only. Set when frontend and backend live
@@ -61,69 +44,6 @@ MAX_REQUEST_BYTES=209715200
# Default 20 MiB.
MAX_FILE_BYTES=20971520
# ----- Crawler download safety -----
# Hosts the crawler is allowed to fetch images/covers from, in addition
# to CRAWLER_START_URL's host and CRAWLER_CDN_HOST. Comma-separated.
# Defends against SSRF via scraped <img src="http://10.0.0.1/...">.
CRAWLER_DOWNLOAD_ALLOWLIST=
# Bypass the host allowlist entirely. Intended for sources that shard
# images across numbered CDN subdomains (cdn1/cdn2/…) where enumerating
# every host upfront is impractical. The private-IP / localhost / non-
# http(s) scheme defenses STAY ON — a scraped <img src="http://10.0.0.1/">
# is still refused with this flag set.
CRAWLER_ALLOW_ANY_HOST=false
# Hard cap on a single image body. Default 32 MiB.
CRAWLER_MAX_IMAGE_BYTES=33554432
# Max manga detail fetches per metadata pass (both the in-process daemon
# and the `bin/crawler` CLI). 0 means no cap — let the source walker run
# to completion. Useful for capped test runs against a new source.
CRAWLER_LIMIT=0
# Path to a system Chromium binary. When set, the crawler skips the
# bundled-fetcher download. Required on platforms without a usable
# upstream Chromium build (notably Linux_arm64 / Raspberry Pi). On
# Debian: /usr/bin/chromium-headless-shell or /usr/bin/chromium. On
# Ubuntu the package is chromium-browser (different path). Pair with
# `docker compose build --build-arg INSTALL_CHROMIUM=true backend` so
# the image actually contains the binary.
CRAWLER_CHROMIUM_BINARY=
# ----- Crawler TOR proxy + recircuit -----
# The compose stack ships a `tor` service (dockurr/tor) and defaults
# CRAWLER_PROXY to it, so by default all crawler traffic exits via the
# TOR network. To opt out, set CRAWLER_PROXY= (empty) AND
# CRAWLER_TOR_CONTROL_URL= (empty) below — the tor service can stay
# running, it just won't be used.
#
# Going through TOR adds latency to every fetch; image downloads in
# particular slow noticeably. The win is on sites that rate-limit or
# fingerprint by exit IP — a NEWNYM recircuit makes switching to a
# fresh exit cheap.
#
# CRAWLER_PROXY: SOCKS5(h) URL. Use `socks5h://` (not `socks5://`) so
# DNS resolution also goes through TOR, avoiding leaks via the host's
# resolver. Leave unset to talk to the upstream directly.
CRAWLER_PROXY=socks5h://tor:9050
# Control-port URL for SIGNAL NEWNYM ("get a fresh circuit"). Triggered
# automatically on bad pages (broken-page body, missing #logo) and on
# the Unauthenticated session probe outcome. Leave unset to disable
# the recircuit feature (the SOCKS proxy still works).
CRAWLER_TOR_CONTROL_URL=tcp://tor:9051
# Max NEWNYM-and-retry cycles per recircuit-eligible failure. Default 3.
CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS=3
# ----- TOR control-port password -----
# Shared between the bundled dockurr/tor service (which hashes it into
# its HashedControlPassword) and the backend's
# CRAWLER_TOR_CONTROL_PASSWORD. REQUIRED — docker-compose.yml fails
# fast if absent. Generate a strong random string; rotate by setting
# a new value and restarting both `tor` and `backend`.
#
# Operators running their own non-dockurr tor daemon with cookie-file
# auth can ignore this var and instead set
# CRAWLER_TOR_CONTROL_COOKIE_PATH on the backend — the TorController
# prefers cookie when both are present.
TOR_CONTROL_PASSWORD=change-me-to-a-strong-random-string
# ----- Frontend -----
# The frontend container runs SvelteKit's Node adapter on :3000 and
# proxies /api/* to BACKEND_URL via src/hooks.server.ts. In compose the
@@ -131,8 +51,3 @@ TOR_CONTROL_PASSWORD=change-me-to-a-strong-random-string
# internal docker network. Override only if you're running the
# frontend container against a backend somewhere else.
BACKEND_URL=http://backend:8080
# Per-request wall-clock cap for the /api/* reverse proxy (milliseconds).
# Default 300000 (5 min) covers a typical 200 MiB chapter upload over
# 25 Mbps; raise for users on slower upstream links or lower if a
# tighter front proxy already bounds the request lifetime.
BACKEND_PROXY_TIMEOUT_MS=300000

View File

@@ -3,13 +3,13 @@ name: deploy
on:
push:
branches: [main]
pull_request:
branches: [main]
workflow_dispatch:
jobs:
test-backend:
runs-on: ubuntu-latest
container:
image: rust:1-slim
services:
postgres:
image: postgres:16-alpine
@@ -26,18 +26,10 @@ jobs:
DATABASE_URL: postgres://mangalord:mangalord@postgres:5432/mangalord
steps:
- uses: actions/checkout@v4
# ubuntu-latest has node (so JS actions like checkout/cache run) but no
# Rust. We intentionally avoid `container: rust:1-slim` because act_runner
# runs JS actions with node *inside* the job container, and the slim Rust
# image ships no node (checkout would fail with exit 127).
- name: Install Rust + build deps
- name: Install build deps
run: |
set -eu
SUDO=""; [ "$(id -u)" = "0" ] || SUDO="sudo"
$SUDO apt-get update
$SUDO apt-get install -y --no-install-recommends pkg-config libssl-dev ca-certificates curl
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal --default-toolchain stable
echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
apt-get update
apt-get install -y --no-install-recommends pkg-config libssl-dev ca-certificates
- name: Cache cargo registry and target
uses: actions/cache@v4
with:
@@ -71,18 +63,6 @@ jobs:
build-and-push:
runs-on: ubuntu-latest
needs: [test-backend, test-frontend]
# PRs only run the test jobs; build + deploy are reserved for
# post-merge pushes to main.
if: github.event_name != 'pull_request'
# Build on the host docker daemon directly (docker-outside-of-docker):
# the runner shares the deploy host's daemon, so a plain `docker build`
# reuses the host's layer cache and avoids buildx's docker-container
# driver + the gha cache exporter — neither works against this single-host
# act_runner, and there is no in-job daemon socket unless we mount it.
container:
image: docker.gitea.com/runner-images:ubuntu-latest
volumes:
- /var/run/docker.sock:/var/run/docker.sock
outputs:
image_tag: ${{ steps.meta.outputs.image_tag }}
version: ${{ steps.meta.outputs.version }}
@@ -101,53 +81,64 @@ jobs:
echo "image_tag=${GITHUB_SHA}" >> "$GITHUB_OUTPUT"
echo "version=${version}" >> "$GITHUB_OUTPUT"
- name: Build & push backend + frontend
env:
REGISTRY_URL: ${{ secrets.REGISTRY_URL }}
REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
IMAGE_TAG: ${{ steps.meta.outputs.image_tag }}
VERSION: ${{ steps.meta.outputs.version }}
run: |
set -eu
echo "$REGISTRY_PASSWORD" | docker login "$REGISTRY_URL" -u "$REGISTRY_USERNAME" --password-stdin
for svc in backend frontend; do
img="$REGISTRY_URL/mangalord-$svc"
docker build -t "$img:$IMAGE_TAG" -t "$img:latest" -t "$img:$VERSION" "./$svc"
for tag in "$IMAGE_TAG" latest "$VERSION"; do docker push "$img:$tag"; done
done
docker logout "$REGISTRY_URL"
- uses: docker/setup-buildx-action@v3
- name: docker login
uses: docker/login-action@v3
with:
registry: ${{ secrets.REGISTRY_URL }}
username: ${{ secrets.REGISTRY_USERNAME }}
password: ${{ secrets.REGISTRY_PASSWORD }}
- name: Build & push backend
uses: docker/build-push-action@v5
with:
context: ./backend
push: true
tags: |
${{ secrets.REGISTRY_URL }}/mangalord-backend:latest
${{ secrets.REGISTRY_URL }}/mangalord-backend:${{ steps.meta.outputs.image_tag }}
${{ secrets.REGISTRY_URL }}/mangalord-backend:${{ steps.meta.outputs.version }}
cache-from: type=gha,scope=backend
cache-to: type=gha,mode=max,scope=backend
- name: Build & push frontend
uses: docker/build-push-action@v5
with:
context: ./frontend
push: true
tags: |
${{ secrets.REGISTRY_URL }}/mangalord-frontend:latest
${{ secrets.REGISTRY_URL }}/mangalord-frontend:${{ steps.meta.outputs.image_tag }}
${{ secrets.REGISTRY_URL }}/mangalord-frontend:${{ steps.meta.outputs.version }}
cache-from: type=gha,scope=frontend
cache-to: type=gha,mode=max,scope=frontend
deploy:
runs-on: ubuntu-latest
needs: build-and-push
if: github.event_name != 'pull_request'
# Single-host deploy: the runner lives on the same box as the stack, so we
# drive the host docker daemon directly (the job mounts the host docker
# socket) instead of SSHing out. The compose dir is bind-mounted at its
# REAL host path so compose's relative bind-mounts (./mangalord/...,
# ./Caddyfile) resolve; both paths must be in the runner's
# container.valid_volumes. The central compose references the images as
# registry.mc02.dev/mangalord-*:${MANGALORD_TAG:-latest}, so we only pull
# and recreate the two mangalord services at the freshly built SHA.
container:
image: docker:cli
volumes:
- /mnt/ssd/docker-data:/mnt/ssd/docker-data
- /var/run/docker.sock:/var/run/docker.sock
steps:
- name: Deploy to the local stack
working-directory: /mnt/ssd/docker-data
- name: SSH deploy
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.SSH_HOST }}
username: ${{ secrets.SSH_USER }}
key: ${{ secrets.SSH_PRIVATE_KEY }}
port: ${{ secrets.SSH_PORT || 22 }}
envs: REGISTRY_URL,REGISTRY_USERNAME,REGISTRY_PASSWORD,IMAGE_TAG,DEPLOY_PATH
script_stop: true
script: |
set -euo pipefail
cd "$DEPLOY_PATH"
echo "$REGISTRY_PASSWORD" | docker login "$REGISTRY_URL" -u "$REGISTRY_USERNAME" --password-stdin
export REGISTRY_URL IMAGE_TAG
docker compose -f docker-compose.yml -f docker-compose.prod.yml pull
docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d
docker image prune -f
docker logout "$REGISTRY_URL"
env:
REGISTRY_URL: ${{ secrets.REGISTRY_URL }}
REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
IMAGE_TAG: ${{ needs.build-and-push.outputs.image_tag }}
run: |
set -eu
echo "$REGISTRY_PASSWORD" | docker login "$REGISTRY_URL" -u "$REGISTRY_USERNAME" --password-stdin
export MANGALORD_TAG="$IMAGE_TAG"
docker compose pull mangalord-backend mangalord-frontend
docker compose up -d mangalord-backend mangalord-frontend
docker image prune -f
docker logout "$REGISTRY_URL"
DEPLOY_PATH: ${{ vars.DEPLOY_PATH }}

139
backend/Cargo.lock generated
View File

@@ -1202,7 +1202,7 @@ dependencies = [
"js-sys",
"log",
"wasm-bindgen",
"windows-core 0.62.2",
"windows-core",
]
[[package]]
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "mangalord"
version = "0.54.0"
version = "0.34.0"
dependencies = [
"anyhow",
"argon2",
@@ -1488,7 +1488,6 @@ dependencies = [
"http-body-util",
"infer",
"mime",
"nix 0.29.0",
"rand 0.8.6",
"reqwest",
"scraper",
@@ -1497,7 +1496,6 @@ dependencies = [
"sha2",
"sqlx",
"subtle",
"sysinfo",
"tempfile",
"thiserror 1.0.69",
"time",
@@ -1605,18 +1603,6 @@ version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]]
name = "nix"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46"
dependencies = [
"bitflags",
"cfg-if",
"cfg_aliases",
"libc",
]
[[package]]
name = "nix"
version = "0.31.3"
@@ -1629,15 +1615,6 @@ dependencies = [
"libc",
]
[[package]]
name = "ntapi"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae"
dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.50.3"
@@ -1878,7 +1855,7 @@ checksum = "9cf20a545b305cf1da722b236b5155c9bb35f1d5ceb28c048bd96ca842f41b5b"
dependencies = [
"android_system_properties",
"log",
"nix 0.31.3",
"nix",
"objc2",
"objc2-foundation",
"objc2-ui-kit",
@@ -2347,7 +2324,6 @@ dependencies = [
"cookie",
"cookie_store",
"futures-core",
"futures-util",
"http",
"http-body",
"http-body-util",
@@ -2367,14 +2343,12 @@ dependencies = [
"sync_wrapper",
"tokio",
"tokio-rustls",
"tokio-util",
"tower",
"tower-http",
"tower-service",
"url",
"wasm-bindgen",
"wasm-bindgen-futures",
"wasm-streams",
"web-sys",
"webpki-roots",
]
@@ -3008,19 +2982,6 @@ dependencies = [
"syn",
]
[[package]]
name = "sysinfo"
version = "0.32.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c33cd241af0f2e9e3b5c32163b873b29956890b5342e6745b917ce9d490f4af"
dependencies = [
"core-foundation-sys",
"libc",
"memchr",
"ntapi",
"windows",
]
[[package]]
name = "tempfile"
version = "3.27.0"
@@ -3566,19 +3527,6 @@ dependencies = [
"wasmparser",
]
[[package]]
name = "wasm-streams"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65"
dependencies = [
"futures-util",
"js-sys",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
]
[[package]]
name = "wasmparser"
version = "0.244.0"
@@ -3642,74 +3590,19 @@ dependencies = [
"wasite",
]
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143"
dependencies = [
"windows-core 0.57.0",
"windows-targets 0.52.6",
]
[[package]]
name = "windows-core"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d"
dependencies = [
"windows-implement 0.57.0",
"windows-interface 0.57.0",
"windows-result 0.1.2",
"windows-targets 0.52.6",
]
[[package]]
name = "windows-core"
version = "0.62.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb"
dependencies = [
"windows-implement 0.60.2",
"windows-interface 0.59.3",
"windows-implement",
"windows-interface",
"windows-link",
"windows-result 0.4.1",
"windows-result",
"windows-strings",
]
[[package]]
name = "windows-implement"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "windows-implement"
version = "0.60.2"
@@ -3721,17 +3614,6 @@ dependencies = [
"syn",
]
[[package]]
name = "windows-interface"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "windows-interface"
version = "0.59.3"
@@ -3749,15 +3631,6 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
[[package]]
name = "windows-result"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8"
dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-result"
version = "0.4.1"

View File

@@ -1,6 +1,6 @@
[package]
name = "mangalord"
version = "0.54.0"
version = "0.35.0"
edition = "2021"
default-run = "mangalord"
@@ -45,10 +45,8 @@ futures-core = "0.3"
futures-util = "0.3"
bytes = "1"
chromiumoxide = { version = "0.7", features = ["tokio-runtime", "_fetcher-rusttls-tokio"], default-features = false }
sysinfo = { version = "0.32", default-features = false, features = ["system"] }
nix = { version = "0.29", features = ["fs"] }
scraper = "0.20"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "socks", "cookies", "stream"] }
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "socks", "cookies"] }
[dev-dependencies]
tempfile = "3"
@@ -57,13 +55,3 @@ http-body-util = "0.1"
mime = "0.3"
futures-util = "0.3"
tokio = { version = "1", features = ["test-util"] }
# Trim debug builds: keep line numbers in panics / backtraces but drop the
# full DWARF info (variable-level inspection in gdb/lldb). With a sqlx +
# axum + tokio dep tree the default ("full") leaves backend/target on the
# order of tens of GiB; this typically cuts ~50-70% off that.
[profile.dev]
debug = "line-tables-only"
[profile.test]
debug = "line-tables-only"

View File

@@ -10,8 +10,7 @@ RUN apt-get update \
# exact crate versions CI tested. Without Cargo.lock + the flag, cargo
# would silently resolve fresh on every image build.
COPY Cargo.toml Cargo.lock ./
RUN mkdir -p src/bin && echo "fn main() {}" > src/main.rs && echo "" > src/lib.rs \
&& echo "fn main() {}" > src/bin/crawler.rs \
RUN mkdir src && echo "fn main() {}" > src/main.rs && echo "" > src/lib.rs \
&& cargo build --locked --release \
&& rm -rf src
@@ -19,68 +18,13 @@ COPY src ./src
COPY migrations ./migrations
RUN touch src/main.rs src/lib.rs && cargo build --locked --release
FROM debian:trixie-slim
# Runtime base must match the builder's Debian release: `rust:1-slim` tracks
# trixie (glibc 2.41), so a bookworm runtime (glibc 2.36) can't run the
# binary ("GLIBC_2.39 not found"). Keep these two in lockstep on bumps.
# `curl` is for the container HEALTHCHECK; `ca-certificates` is for
# outbound HTTPS (crawler covers/pages).
#
# INSTALL_CHROMIUM is an opt-in for deployments that can't use the
# chromiumoxide fetcher path (notably Linux_arm64 / Raspberry Pi, where
# the upstream snapshot bucket has no usable build). When `true`, adds
# Debian's apt-packaged headless chromium plus a baseline font set —
# pair with `CRAWLER_CHROMIUM_BINARY=/usr/bin/chromium-headless-shell`
# at runtime so the launcher uses it. Default `false` keeps cloud/x86
# images slim.
#
# Build the Pi image with:
# docker compose build --build-arg INSTALL_CHROMIUM=true backend
ARG INSTALL_CHROMIUM=false
FROM debian:bookworm-slim
RUN apt-get update \
&& apt-get install -y --no-install-recommends ca-certificates curl \
&& if [ "$INSTALL_CHROMIUM" = "true" ]; then \
apt-get install -y --no-install-recommends chromium-headless-shell fonts-liberation; \
fi \
&& apt-get install -y --no-install-recommends ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# Non-root runtime user. The API binary doesn't need any root
# privilege; the crawler daemon's Chromium launcher uses --no-sandbox
# precisely because user-namespace sandboxing is fragile, so dropping
# privileges costs nothing operationally and shrinks the blast radius
# of any RCE.
ARG APP_UID=10001
ARG APP_GID=10001
RUN groupadd --system --gid ${APP_GID} app \
&& useradd --system --uid ${APP_UID} --gid app --home-dir /home/app --create-home --shell /usr/sbin/nologin app
WORKDIR /app
COPY --from=builder /app/target/release/mangalord /usr/local/bin/mangalord
COPY --from=builder /app/migrations /app/migrations
ENV STORAGE_DIR=/var/lib/mangalord/storage
# Pre-create the storage dir so the entrypoint doesn't need to
# mkdir-as-root and so the named volume mount inherits the right
# ownership.
#
# UPGRADE NOTE for operators: if you're moving from an older image
# that ran as root, the existing `storage-data` volume has files owned
# by UID 0 and the new UID-10001 user can't write them. Run once
# before the upgrade:
# docker compose run --rm --user 0 backend \
# chown -R 10001:10001 /var/lib/mangalord/storage
# (Postgres is unaffected — that image's `postgres` user UID hasn't
# changed.)
RUN mkdir -p ${STORAGE_DIR} \
&& chown -R app:app ${STORAGE_DIR} /app /home/app
USER app
EXPOSE 8080
# `--start-period` is generous because first boot runs sqlx::migrate
# against postgres which can take a few seconds; subsequent restarts
# are sub-second.
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
CMD curl -fsS http://localhost:8080/api/v1/health > /dev/null || exit 1
CMD ["mangalord"]

View File

@@ -1,15 +0,0 @@
-- The original 0012 partial index covers `state IN ('pending','failed')`,
-- but `ack_failed` in src/crawler/jobs.rs only writes `dead` or
-- `pending` — `failed` is never set. The index branch on `failed`
-- never matches any row, so it's dead weight on every write.
--
-- Drop and recreate the index without the dead branch. The CHECK
-- constraint on `state` still allows `'failed'` so a future migration
-- can adopt that terminal-but-retryable state without a second
-- schema change.
DROP INDEX IF EXISTS crawler_jobs_ready_idx;
CREATE INDEX crawler_jobs_ready_idx
ON crawler_jobs (scheduled_at)
WHERE state = 'pending';

View File

@@ -1,20 +0,0 @@
-- chapter_sources: drop the global (source_id, source_chapter_key) PK
-- and rekey on (source_id, chapter_id).
--
-- The old PK assumed chapter slugs are unique per source. Sources whose
-- chapter naming is per-manga (chapter-1, chapter-2, ...) instead of per-
-- catalog (br_chapter-379272 with a global counter) would collide on the
-- second manga: the INSERT would conflict on (source_id, "chapter-1") and
-- the lookup would attribute the row to the first manga's chapter_id.
--
-- The new key is the natural identity of a source attachment: "this source
-- has this chapter". An (source_id, source_chapter_key) index preserves
-- the lookup path (find existing source row by source's identifier) but
-- no longer enforces uniqueness — the application combines it with the
-- chapters table's manga_id to scope the lookup per-manga.
ALTER TABLE chapter_sources DROP CONSTRAINT chapter_sources_pkey;
ALTER TABLE chapter_sources ADD PRIMARY KEY (source_id, chapter_id);
CREATE INDEX chapter_sources_source_key_idx
ON chapter_sources (source_id, source_chapter_key);

View File

@@ -1,5 +0,0 @@
-- Admin role flag on users. Booted from ADMIN_USERNAME / ADMIN_PASSWORD env at
-- startup (see app::build). Demotion is instant: the RequireAdmin extractor
-- re-reads the user row every request, so flipping this column takes effect on
-- the next call without a session purge.
ALTER TABLE users ADD COLUMN is_admin BOOLEAN NOT NULL DEFAULT false;

View File

@@ -1,20 +0,0 @@
-- Admin audit log. Written from inside the same transaction as the action
-- it records, so a failed COMMIT also rolls back the audit row — the log
-- never claims an action happened that didn't.
--
-- `actor_user_id` is ON DELETE SET NULL so audit rows outlive a deleted
-- admin (the answer to "who promoted Bob to admin?" survives even after
-- Alice's account is removed). `target_id` is intentionally not a FK
-- because future audit kinds may target non-user rows (manga, source,
-- etc.) and a single typed FK can't express that.
CREATE TABLE admin_audit (
id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
actor_user_id uuid REFERENCES users(id) ON DELETE SET NULL,
action text NOT NULL,
target_kind text NOT NULL,
target_id uuid,
payload jsonb NOT NULL DEFAULT '{}'::jsonb,
at timestamptz NOT NULL DEFAULT now()
);
CREATE INDEX admin_audit_at_idx ON admin_audit (at DESC);

View File

@@ -1,14 +0,0 @@
-- Per-manga sync-state derivation joins crawler_jobs to manga_sources via
-- (payload->>'source_id', payload->>'source_manga_key') for the
-- `sync_manga` job kind (whose payload doesn't carry a manga_id directly).
-- Without this index the join falls back to a seqscan of crawler_jobs on
-- every admin manga listing — a noticeable cost as the job table grows
-- with the daily metadata pass.
--
-- Partial on `state IN ('pending','running')` so it covers only in-flight
-- jobs (the bulk of the table is done/dead and irrelevant to "is this
-- manga being synced right now").
CREATE INDEX crawler_jobs_sync_manga_key_idx
ON crawler_jobs ((payload->>'source_manga_key'))
WHERE state IN ('pending', 'running')
AND payload->>'kind' = 'sync_manga';

View File

@@ -1,18 +0,0 @@
-- Capture each chapter's position in the source site's chapter list so
-- the user-facing list can preserve site order: variants of the same
-- chapter number (e.g. "Ch.14 : PH" next to "Ch.14 : Official") stay
-- adjacent, and non-numeric entries like "notice. : Officials" land
-- where the site placed them rather than clustering at the top under
-- number = 0.
--
-- Lower source_index = closer to the top of the source DOM = newer
-- chapter on this site (it renders newest-first). The list query
-- reverses this with ORDER BY source_index DESC so the oldest chapter
-- appears first in our UI.
--
-- NULL is the sentinel for user-uploaded chapters (no source row) and
-- for crawled rows that pre-date this migration. The list query keeps
-- the existing (number, created_at) tiebreak via NULLS LAST so those
-- fall through to the prior behaviour until the next crawler tick
-- populates the column.
ALTER TABLE chapters ADD COLUMN source_index INTEGER;

View File

@@ -1,491 +0,0 @@
//! Admin-only crawler observability + control endpoints.
//!
//! Mounted under `/api/v1/admin/crawler*`, cookie-only via `RequireAdmin`.
//! All control endpoints return 503 when the crawler daemon is disabled
//! (`AppState.crawler == None`). Reads compose the live in-process status
//! ([`crate::crawler::status`]) with DB-derived queue counts and the
//! session/browser flags.
use std::convert::Infallible;
use std::time::Duration;
use axum::extract::{Query, State};
use axum::response::sse::{Event, KeepAlive, Sse};
use axum::routing::{get, post};
use axum::{Json, Router};
use futures_util::stream::Stream;
use serde::{Deserialize, Serialize};
use serde_json::json;
use uuid::Uuid;
use crate::app::{AppState, CrawlerControl};
use crate::auth::extractor::RequireAdmin;
use crate::crawler::browser_manager::RestartPhase;
use crate::crawler::status::{ActiveChapter, CoverTarget, LastPass, Phase};
use crate::error::{AppError, AppResult};
use crate::repo;
use crate::repo::crawler::{ActiveJob, DeadJob, MissingCoverRow, RequeueScope};
/// Backstop recompose interval for the SSE stream. Phase/worker/session
/// changes push instantly via the status `watch`; this only bounds the
/// staleness of DB-derived queue counts and the browser phase when those
/// change without an accompanying status poke.
const SSE_BACKSTOP: Duration = Duration::from_secs(5);
/// Build the router for the admin crawler endpoints.
///
/// Registers every `/admin/crawler*` path handled by this module: the
/// read-only GET views first, then the POST control actions.
pub fn routes() -> Router<AppState> {
    // Read-only observability endpoints.
    let read_only = Router::new()
        .route("/admin/crawler", get(get_status))
        .route("/admin/crawler/stream", get(stream_status))
        .route("/admin/crawler/dead-jobs", get(list_dead_jobs))
        .route("/admin/crawler/active-jobs", get(list_active_jobs))
        .route("/admin/crawler/covers", get(list_covers));

    // Control endpoints; every one of these is a POST.
    read_only
        .route("/admin/crawler/run", post(run_now))
        .route("/admin/crawler/browser/restart", post(restart_browser))
        .route("/admin/crawler/session", post(update_session))
        .route(
            "/admin/crawler/session/clear-expired",
            post(clear_session_expired),
        )
        .route("/admin/crawler/dead-jobs/requeue", post(requeue_dead_jobs))
}
// ---------------------------------------------------------------------------
// GET /admin/crawler — live status
// ---------------------------------------------------------------------------
/// Crawler job-queue totals, serialized as the `queue` field of the status
/// response. Filled from the `(pending, running, dead)` tuple returned by
/// `repo::crawler::job_state_counts`.
#[derive(Debug, Serialize)]
struct QueueCounts {
/// Number of jobs reported as pending.
pending: i64,
/// Number of jobs reported as running.
running: i64,
/// Number of jobs reported as dead.
dead: i64,
}
/// Session flags reported in the `session` field of the status response.
/// Both flags are `false` when the crawler daemon is disabled.
#[derive(Debug, Serialize)]
struct SessionStatus {
/// Whether the sticky session-expired flag is set (chapter workers idle).
expired: bool,
/// Whether a PHPSESSID is currently configured at all.
configured: bool,
}
/// Full crawler status snapshot, serialized as the JSON body of
/// `GET /admin/crawler` and built by `compose_status`.
///
/// When the daemon is disabled (`AppState.crawler` is `None`) the live fields
/// take placeholder values: `phase` is `None`, `worker_count` is 0,
/// `active_chapters` is empty, `current_cover` is `None`, `last_pass` is the
/// default, and `browser` is `"down"`. The DB-derived `covers_queued` and
/// `queue` are still populated.
#[derive(Debug, Serialize)]
struct CrawlerStatusResponse {
/// `"running"` | `"disabled"`.
daemon: &'static str,
/// Current phase from the in-memory status; `None` when the daemon is disabled.
phase: Option<Phase>,
/// Configured chapter-worker count (for "N busy / M workers").
worker_count: usize,
/// Chapters being crawled right now, with live page counts.
active_chapters: Vec<ActiveChapter>,
/// The cover being fetched right now, if any.
current_cover: Option<CoverTarget>,
/// Mangas still queued for a cover fetch.
covers_queued: i64,
/// Last-pass record taken from the in-memory status snapshot.
last_pass: LastPass,
/// Session-expired / session-configured flags.
session: SessionStatus,
/// `"healthy"` | `"draining"` | `"restarting"` | `"down"`.
browser: &'static str,
/// Job counts per state, queried fresh from the database.
queue: QueueCounts,
}
/// Map the browser manager's restart phase to the lowercase label exposed
/// in the status response's `browser` field.
fn browser_phase_str(p: RestartPhase) -> &'static str {
    let label: &'static str = match p {
        RestartPhase::Restarting => "restarting",
        RestartPhase::Draining => "draining",
        RestartPhase::Healthy => "healthy",
    };
    label
}
/// Build one complete status snapshot for the admin dashboard.
///
/// Combines a fresh DB query for the queue counts and missing-cover count
/// with the in-memory crawler status and the browser/session flags. When the
/// daemon is disabled, only the DB-derived numbers are real and the rest are
/// placeholders. Used by both the one-shot `get_status` and the SSE
/// `stream_status`.
///
/// Returns an error if either database query fails.
async fn compose_status(state: &AppState) -> AppResult<CrawlerStatusResponse> {
    // DB-derived numbers are needed whether or not the daemon is running.
    let (pending, running, dead) = repo::crawler::job_state_counts(&state.db).await?;
    let covers_queued = repo::crawler::count_missing_covers(&state.db).await?;
    let queue = QueueCounts {
        pending,
        running,
        dead,
    };

    // Daemon disabled: report placeholders alongside the DB counts.
    let crawler = match state.crawler.as_ref() {
        Some(handle) => handle,
        None => {
            return Ok(CrawlerStatusResponse {
                daemon: "disabled",
                phase: None,
                worker_count: 0,
                active_chapters: Vec::new(),
                current_cover: None,
                covers_queued,
                last_pass: LastPass::default(),
                session: SessionStatus {
                    expired: false,
                    configured: false,
                },
                browser: "down",
                queue,
            });
        }
    };

    // Daemon running: read the live status, then the session and browser flags.
    let snap = crawler.status.snapshot().await;
    let expired = crawler.session.is_expired();
    let configured = crawler.session.current().await.is_some();
    let browser = browser_phase_str(crawler.browser_manager.phase());

    Ok(CrawlerStatusResponse {
        daemon: "running",
        phase: Some(snap.phase),
        worker_count: snap.worker_count,
        active_chapters: snap.active_chapters,
        current_cover: snap.current_cover,
        covers_queued,
        last_pass: snap.last_pass,
        session: SessionStatus {
            expired,
            configured,
        },
        browser,
        queue,
    })
}
/// `GET /admin/crawler` — return a single status snapshot as JSON.
/// Admin-only via the `RequireAdmin` extractor.
async fn get_status(
    State(state): State<AppState>,
    _admin: RequireAdmin,
) -> AppResult<Json<CrawlerStatusResponse>> {
    let snapshot = compose_status(&state).await?;
    Ok(Json(snapshot))
}
// ---------------------------------------------------------------------------
// GET /admin/crawler/stream — Server-Sent Events live status
// ---------------------------------------------------------------------------
/// Push live status to the dashboard instead of polling. Emits a snapshot
/// immediately on connect, then on every status change (instant, via the
/// `watch` notifier) and on a [`SSE_BACKSTOP`] tick (to refresh DB queue
/// counts / browser phase that change without a status poke). The browser
/// opens this only while the crawler page is mounted and closes it on
/// navigate-away, so the subscription is scoped to the active page.
///
/// When the crawler daemon is disabled there is no receiver to wait on, so
/// the stream re-composes on the backstop timer alone. The stream item type
/// is `Result<Event, Infallible>`: compose/serialize failures are turned
/// into SSE comments instead of stream errors.
async fn stream_status(
State(state): State<AppState>,
_admin: RequireAdmin,
) -> Sse<impl Stream<Item = Result<Event, Infallible>>> {
// Subscribe before the first emit so no change between the initial
// snapshot and the first await is lost.
// `rx` is `None` when `state.crawler` is `None` (daemon disabled).
let rx = state.crawler.as_ref().map(|c| c.status.subscribe());
// Unfold state: (app state, optional change receiver, "first emit" flag).
// The closure always returns `Some`, so the stream never ends by itself.
let stream = futures_util::stream::unfold(
(state, rx, true),
|(state, mut rx, first)| async move {
// After the first immediate emit, wait for a change or the
// backstop tick before recomposing.
if !first {
match rx.as_mut() {
Some(rx) => {
// NOTE(review): the result of `changed()` is discarded. If it
// can complete immediately with an error (e.g. a closed
// channel), this loop would recompose without waiting —
// confirm the sender outlives every stream.
tokio::select! {
_ = rx.changed() => {}
_ = tokio::time::sleep(SSE_BACKSTOP) => {}
}
}
None => tokio::time::sleep(SSE_BACKSTOP).await,
}
}
// Compose; on a transient DB error, emit a keep-alive comment
// rather than tearing down the stream.
let event = match compose_status(&state).await {
Ok(resp) => Event::default()
.event("status")
.json_data(&resp)
.unwrap_or_else(|_| Event::default().comment("serialize error")),
Err(_) => Event::default().comment("status unavailable"),
};
// Clear the "first" flag so every later iteration waits before emitting.
Some((Ok(event), (state, rx, false)))
},
);
Sse::new(stream).keep_alive(KeepAlive::default())
}
// ---------------------------------------------------------------------------
// POST /admin/crawler/run — trigger an out-of-cycle metadata pass
// ---------------------------------------------------------------------------
/// Response body of `POST /admin/crawler/run`.
#[derive(Debug, Serialize)]
struct RunResponse {
/// Set to `true` by the handler once the background pass has been spawned.
started: bool,
}
/// `POST /admin/crawler/run` — trigger an out-of-cycle metadata pass.
///
/// Returns `AppError::ServiceUnavailable` when the crawler daemon is
/// disabled or no metadata pass is configured (`CRAWLER_START_URL` unset).
/// On success the pass runs in the background and the response only
/// confirms that it was started.
async fn run_now(
    State(state): State<AppState>,
    admin: RequireAdmin,
) -> AppResult<Json<RunResponse>> {
    let c = require_crawler(&state)?;
    let mp = c.metadata_pass.as_ref().ok_or_else(|| {
        AppError::ServiceUnavailable("no source configured (CRAWLER_START_URL unset)".into())
    })?;
    let mp = std::sync::Arc::clone(mp);

    // Write the audit row BEFORE spawning: if the insert fails the request
    // errors out and no pass is started, so a pass can never run unaudited
    // while the client is told the request failed.
    repo::admin_audit::insert(&state.db, admin.0.id, "crawler_run", "crawler", None, json!({}))
        .await?;

    // Fire-and-forget: the pass can run for minutes; the dashboard follows
    // progress through the status endpoints. Overlap with the daily cron is
    // rare, and both serialise on the single browser lease.
    tokio::spawn(async move {
        if let Err(e) = mp.run().await {
            tracing::warn!(error = ?e, "manual metadata pass failed");
        }
    });

    Ok(Json(RunResponse { started: true }))
}
// ---------------------------------------------------------------------------
// POST /admin/crawler/browser/restart — coordinated restart
// ---------------------------------------------------------------------------
/// Response body of `POST /admin/crawler/browser/restart`.
#[derive(Debug, Serialize)]
struct RestartResponse {
/// Whether the coordinated restart succeeded.
ok: bool,
/// Formatted error chain when the restart failed; `None` on success.
error: Option<String>,
}
/// `POST /admin/crawler/browser/restart` — run a coordinated browser
/// restart and report whether it succeeded.
///
/// A failed restart is still a successful HTTP response (`ok: false` plus
/// the error text). The handler itself errors only when the daemon is
/// disabled or the audit insert fails.
async fn restart_browser(
    State(state): State<AppState>,
    admin: RequireAdmin,
) -> AppResult<Json<RestartResponse>> {
    let c = require_crawler(&state)?;
    let outcome = c.browser_manager.coordinated_restart(c.drain_deadline).await;
    let succeeded = outcome.is_ok();

    // A successful restart re-runs on_launch (PHPSESSID re-injected and
    // re-probed), so the session is known-good: release the sticky
    // `session_expired` flag to let chapter workers resume without a
    // separate "Clear expired" click.
    if succeeded {
        c.session.clear_expired();
    }
    // Let live subscribers see the post-restart browser phase right away.
    c.status.poke();

    repo::admin_audit::insert(
        &state.db,
        admin.0.id,
        "crawler_browser_restart",
        "crawler",
        None,
        json!({ "ok": succeeded }),
    )
    .await?;

    let error = outcome.err().map(|e| format!("{e:#}"));
    Ok(Json(RestartResponse {
        ok: succeeded,
        error,
    }))
}
// ---------------------------------------------------------------------------
// POST /admin/crawler/session — refresh PHPSESSID
// ---------------------------------------------------------------------------
/// Request body of `POST /admin/crawler/session`.
#[derive(Debug, Deserialize)]
struct UpdateSessionRequest {
/// New PHPSESSID value; passed as-is to the session `update` call, whose
/// rejection is surfaced as `AppError::InvalidInput`.
phpsessid: String,
}
/// Response body of `POST /admin/crawler/session`.
#[derive(Debug, Serialize)]
struct UpdateSessionResponse {
/// Whether the post-update browser relaunch + session probe succeeded.
valid: bool,
/// Formatted error chain when the relaunch/probe failed; `None` on success.
error: Option<String>,
}
/// `POST /admin/crawler/session` — store a new PHPSESSID, relaunch the
/// browser, and report whether the new session probed as valid.
///
/// Errors: `ServiceUnavailable` when the daemon is disabled,
/// `InvalidInput` when the session store rejects the value, or a failure of
/// the audit insert. A failed probe is reported in the body (`valid:
/// false`), not as an HTTP error.
async fn update_session(
    State(state): State<AppState>,
    admin: RequireAdmin,
    Json(body): Json<UpdateSessionRequest>,
) -> AppResult<Json<UpdateSessionResponse>> {
    let c = require_crawler(&state)?;
    c.session
        .update(&body.phpsessid)
        .await
        .map_err(|e| AppError::InvalidInput(format!("{e:#}")))?;

    // Relaunch the browser so on_launch re-injects the new cookie and
    // re-probes — the restart's success IS the session-validity signal.
    let probe = c.browser_manager.coordinated_restart(c.drain_deadline).await;

    // Same rule as `restart_browser`: a successful restart means the session
    // is live, so drop the sticky `session_expired` flag. Otherwise chapter
    // workers could keep idling after a valid cookie was installed.
    if probe.is_ok() {
        c.session.clear_expired();
    }

    // Session + browser state changed — push to live subscribers.
    c.status.poke();

    repo::admin_audit::insert(
        &state.db,
        admin.0.id,
        "crawler_session_update",
        "crawler",
        None,
        json!({ "valid": probe.is_ok() }),
    )
    .await?;

    Ok(Json(match probe {
        Ok(()) => UpdateSessionResponse {
            valid: true,
            error: None,
        },
        Err(e) => UpdateSessionResponse {
            valid: false,
            error: Some(format!("{e:#}")),
        },
    }))
}
/// Response body of the "clear session-expired flag" endpoint.
#[derive(Debug, Serialize)]
struct ClearExpiredResponse {
/// Always `true` in a successful response from the handler.
cleared: bool,
}
/// Drop the sticky session-expired flag, notify live status subscribers,
/// and record the action in the admin audit log.
async fn clear_session_expired(
    State(state): State<AppState>,
    admin: RequireAdmin,
) -> AppResult<Json<ClearExpiredResponse>> {
    let control = require_crawler(&state)?;

    control.session.clear_expired();
    // `session.expired` just changed — wake the SSE subscribers.
    control.status.poke();

    let actor_id = admin.0.id;
    repo::admin_audit::insert(
        &state.db,
        actor_id,
        "crawler_session_clear_expired",
        "crawler",
        None,
        json!({}),
    )
    .await?;

    let response = ClearExpiredResponse { cleared: true };
    Ok(Json(response))
}
// ---------------------------------------------------------------------------
// Dead jobs
// ---------------------------------------------------------------------------
/// Query-string parameters for the dead-jobs listing.
#[derive(Debug, Deserialize, Default)]
struct DeadJobsParams {
/// Optional search text; the handler ignores blank/whitespace-only values.
#[serde(default)]
search: Option<String>,
/// Page size; defaults to `default_limit()` and is clamped to 1..=200 by
/// the handler.
#[serde(default = "default_limit")]
limit: i64,
/// Row offset; negative values are treated as 0 by the handler.
#[serde(default)]
offset: i64,
}
/// Serde default for the `limit` query parameter: 50 rows per page.
fn default_limit() -> i64 {
50
}
async fn list_dead_jobs(
State(state): State<AppState>,
_admin: RequireAdmin,
Query(params): Query<DeadJobsParams>,
) -> AppResult<Json<crate::api::pagination::PagedResponse<DeadJob>>> {
let limit = params.limit.clamp(1, 200);
let offset = params.offset.max(0);
let search = params.search.filter(|s| !s.trim().is_empty());
let (items, total) =
repo::crawler::list_dead_jobs(&state.db, search.as_deref(), limit, offset).await?;
Ok(Json(crate::api::pagination::PagedResponse::with_total(
items, limit, offset, total,
)))
}
/// Request body selecting which dead jobs to requeue.
///
/// Internally tagged on `scope` with snake_case variant names, e.g.
/// `{"scope": "all"}` or `{"scope": "manga", "manga_id": "<uuid>"}`.
#[derive(Debug, Deserialize)]
#[serde(tag = "scope", rename_all = "snake_case")]
enum RequeueRequest {
/// No id filter; maps to `RequeueScope::All`.
All,
/// Scope by manga id; maps to `RequeueScope::Manga`.
Manga { manga_id: Uuid },
/// Scope by chapter id; maps to `RequeueScope::Chapter`.
Chapter { chapter_id: Uuid },
/// Scope by job id; maps to `RequeueScope::Job`.
Job { job_id: Uuid },
}
/// Response body of the dead-job requeue endpoint.
#[derive(Debug, Serialize)]
struct RequeueResponse {
/// Count returned by `repo::crawler::requeue_dead_jobs`.
requeued: u64,
}
async fn requeue_dead_jobs(
State(state): State<AppState>,
admin: RequireAdmin,
Json(body): Json<RequeueRequest>,
) -> AppResult<Json<RequeueResponse>> {
let scope = match &body {
RequeueRequest::All => RequeueScope::All,
RequeueRequest::Manga { manga_id } => RequeueScope::Manga(*manga_id),
RequeueRequest::Chapter { chapter_id } => RequeueScope::Chapter(*chapter_id),
RequeueRequest::Job { job_id } => RequeueScope::Job(*job_id),
};
let requeued = repo::crawler::requeue_dead_jobs(&state.db, scope).await?;
repo::admin_audit::insert(
&state.db,
admin.0.id,
"crawler_dead_jobs_requeue",
"crawler",
None,
json!({ "requeued": requeued, "scope": scope_label(&body) }),
)
.await?;
Ok(Json(RequeueResponse { requeued }))
}
fn scope_label(r: &RequeueRequest) -> &'static str {
match r {
RequeueRequest::All => "all",
RequeueRequest::Manga { .. } => "manga",
RequeueRequest::Chapter { .. } => "chapter",
RequeueRequest::Job { .. } => "job",
}
}
// ---------------------------------------------------------------------------
// Queued-chapters + queued-covers backlogs (paginated, fetched on demand)
// ---------------------------------------------------------------------------
/// Pagination + title-search params shared by the backlog list endpoints.
#[derive(Debug, Deserialize, Default)]
struct ListParams {
/// Optional search text; the handlers ignore blank/whitespace-only values.
#[serde(default)]
search: Option<String>,
/// Page size; defaults to `default_limit()` and is clamped to 1..=200 by
/// the handlers.
#[serde(default = "default_limit")]
limit: i64,
/// Row offset; negative values are treated as 0 by the handlers.
#[serde(default)]
offset: i64,
}
async fn list_active_jobs(
State(state): State<AppState>,
_admin: RequireAdmin,
Query(params): Query<ListParams>,
) -> AppResult<Json<crate::api::pagination::PagedResponse<ActiveJob>>> {
let limit = params.limit.clamp(1, 200);
let offset = params.offset.max(0);
let search = params.search.filter(|s| !s.trim().is_empty());
let (items, total) =
repo::crawler::list_active_jobs(&state.db, search.as_deref(), limit, offset).await?;
Ok(Json(crate::api::pagination::PagedResponse::with_total(
items, limit, offset, total,
)))
}
async fn list_covers(
State(state): State<AppState>,
_admin: RequireAdmin,
Query(params): Query<ListParams>,
) -> AppResult<Json<crate::api::pagination::PagedResponse<MissingCoverRow>>> {
let limit = params.limit.clamp(1, 200);
let offset = params.offset.max(0);
let search = params.search.filter(|s| !s.trim().is_empty());
let (items, total) =
repo::crawler::list_missing_cover_mangas(&state.db, search.as_deref(), limit, offset)
.await?;
Ok(Json(crate::api::pagination::PagedResponse::with_total(
items, limit, offset, total,
)))
}
// ---------------------------------------------------------------------------
/// Borrow the crawler control handle, or fail with `ServiceUnavailable`
/// when the crawler daemon is disabled (`state.crawler` is `None`).
fn require_crawler(state: &AppState) -> Result<&std::sync::Arc<CrawlerControl>, AppError> {
    match state.crawler.as_ref() {
        Some(control) => Ok(control),
        None => Err(AppError::ServiceUnavailable(
            "crawler daemon is disabled".into(),
        )),
    }
}

View File

@@ -1,110 +0,0 @@
//! Admin manga/chapter overview with derived sync state.
//!
//! Sync state comes from `repo::admin_view`, which joins the manga /
//! chapter tables with the crawler signals at query time — there is no
//! persisted sync_state column. See [`repo::admin_view`] for the
//! derivation priority order.
use axum::extract::{Path, Query, State};
use axum::routing::get;
use axum::{Json, Router};
use serde::Deserialize;
use uuid::Uuid;
use crate::api::pagination::PagedResponse;
use crate::app::AppState;
use crate::auth::extractor::RequireAdmin;
use crate::domain::MangaSyncState;
use crate::error::{AppError, AppResult};
use crate::repo;
use crate::repo::admin_view::{AdminChapterRow, AdminMangaRow};
/// Routes for the admin manga/chapter overview (both `GET`).
pub fn routes() -> Router<AppState> {
    let router = Router::new();
    let router = router.route("/admin/mangas", get(list_mangas));
    router.route("/admin/mangas/:id/chapters", get(list_chapters))
}
/// Query-string parameters for the per-manga admin chapter listing.
#[derive(Debug, Deserialize, Default)]
pub struct ListChaptersParams {
/// Page size; defaults to `default_chapter_limit()` and is clamped to
/// 1..=500 by the handler.
#[serde(default = "default_chapter_limit")]
pub limit: i64,
/// Row offset; negative values are treated as 0 by the handler.
#[serde(default)]
pub offset: i64,
}
/// Serde default for the chapter listing's `limit` parameter: 200 rows.
fn default_chapter_limit() -> i64 {
200
}
/// Query-string parameters for the admin manga listing.
#[derive(Debug, Deserialize, Default)]
pub struct ListMangasParams {
/// Optional search text; the handler ignores blank/whitespace-only values.
#[serde(default)]
pub search: Option<String>,
/// `in_progress` | `dropped` | `synced`. Unrecognised values are a 400.
/// A missing or empty value means "no sync-state filter".
#[serde(default)]
pub sync_state: Option<String>,
/// Page size; defaults to `default_limit()` and is clamped to 1..=200 by
/// the handler.
#[serde(default = "default_limit")]
pub limit: i64,
/// Row offset; negative values are treated as 0 by the handler.
#[serde(default)]
pub offset: i64,
}
/// Serde default for the manga listing's `limit` parameter: 50 rows.
fn default_limit() -> i64 {
50
}
/// Paginated admin manga listing with derived sync state.
///
/// `limit` is clamped to 1..=200 and negative offsets become 0. A missing
/// or empty `sync_state` means "no filter"; an unrecognised value is
/// rejected with `AppError::InvalidInput`.
async fn list_mangas(
    State(state): State<AppState>,
    _admin: RequireAdmin,
    Query(params): Query<ListMangasParams>,
) -> AppResult<Json<PagedResponse<AdminMangaRow>>> {
    let limit = params.limit.clamp(1, 200);
    let offset = params.offset.max(0);

    // Absent and empty are equivalent, so fold both into "".
    let sync_state = match params.sync_state.as_deref().unwrap_or("") {
        "" => None,
        "in_progress" => Some(MangaSyncState::InProgress),
        "dropped" => Some(MangaSyncState::Dropped),
        "synced" => Some(MangaSyncState::Synced),
        other => {
            return Err(AppError::InvalidInput(format!(
                "sync_state must be one of in_progress|dropped|synced (got {other:?})"
            )));
        }
    };

    let search = params.search.filter(|s| !s.trim().is_empty());
    let query = repo::admin_view::ListAdminMangasQuery {
        search,
        sync_state,
        limit,
        offset,
    };
    let (items, total) = repo::admin_view::list_mangas_with_sync_state(&state.db, &query).await?;

    let page = PagedResponse::with_total(items, limit, offset, total);
    Ok(Json(page))
}
async fn list_chapters(
State(state): State<AppState>,
_admin: RequireAdmin,
Path(manga_id): Path<Uuid>,
Query(params): Query<ListChaptersParams>,
) -> AppResult<Json<PagedResponse<AdminChapterRow>>> {
// Explicit existence check so a typo / deleted manga returns 404
// rather than a misleading "no chapters" 200.
if !repo::manga::exists(&state.db, manga_id).await? {
return Err(AppError::NotFound);
}
// Cap at 500 to bound the per-row scalar-subquery cost on
// long-runners with thousands of chapters; default 200 covers
// typical browsing without paging round-trips.
let limit = params.limit.clamp(1, 500);
let offset = params.offset.max(0);
let q = repo::admin_view::ListAdminChaptersQuery {
manga_id,
limit,
offset,
};
let (items, total) = repo::admin_view::list_chapters_with_sync_state(&state.db, &q).await?;
Ok(Json(PagedResponse::with_total(items, limit, offset, total)))
}

View File

@@ -1,24 +0,0 @@
//! Admin-only endpoints. Mounted under `/api/v1/admin/*` by
//! `crate::api::routes`. Every handler in this subtree is guarded by
//! `RequireAdmin`, which only accepts session-cookie authentication —
//! bot/API tokens cannot reach admin routes (see
//! `crate::auth::extractor::RequireAdmin`).
pub mod crawler;
pub mod mangas;
pub mod resync;
pub mod system;
pub mod users;
use axum::Router;
use crate::app::AppState;
/// Combine every admin sub-router into one router, merged in the order
/// users, mangas, resync, system, crawler.
pub fn routes() -> Router<AppState> {
    let mut router = Router::new();
    for sub in [
        users::routes(),
        mangas::routes(),
        resync::routes(),
        system::routes(),
        crawler::routes(),
    ] {
        router = router.merge(sub);
    }
    router
}

View File

@@ -1,176 +0,0 @@
//! Admin-triggered force resync of a single manga's metadata + cover,
//! or a single chapter's content.
//!
//! Both endpoints are admin-only (`RequireAdmin`, cookie-only) and run
//! synchronously with the request — the response carries the refreshed
//! resource so the UI can swap it in without a follow-up GET. The work
//! itself is delegated to [`ResyncService`] (set on AppState by
//! `app::build` when the crawler daemon is enabled); when the daemon
//! is disabled, both handlers return 503.
use axum::extract::{Path, State};
use axum::routing::post;
use axum::{Json, Router};
use serde::Serialize;
use serde_json::json;
use uuid::Uuid;
use crate::app::AppState;
use crate::auth::extractor::RequireAdmin;
use crate::crawler::resync::{ChapterResyncOutcome, ResyncError};
use crate::domain::manga::MangaDetail;
use crate::domain::Chapter;
use crate::error::{AppError, AppResult};
use crate::repo;
use crate::repo::crawler::UpsertStatus;
/// Routes for the admin force-resync endpoints (both `POST`).
pub fn routes() -> Router<AppState> {
    let router = Router::new();
    let router = router.route("/admin/mangas/:id/resync", post(resync_manga));
    router.route("/admin/chapters/:id/resync", post(resync_chapter))
}
/// Response body of `POST /admin/mangas/:id/resync`.
#[derive(Debug, Serialize)]
pub struct MangaResyncResponse {
/// Manga detail re-read from the database after the resync completed.
pub manga: MangaDetail,
/// `"new" | "updated" | "unchanged"` — mirrors [`UpsertStatus`].
pub metadata_status: &'static str,
/// Copied from the resync outcome's `cover_fetched` flag.
pub cover_fetched: bool,
}
/// Response body of `POST /admin/chapters/:id/resync`.
#[derive(Debug, Serialize)]
pub struct ChapterResyncResponse {
/// Chapter row re-read from the database after the resync completed.
pub chapter: Chapter,
/// `"fetched" | "skipped"` — whether new pages landed or the
/// service short-circuited (e.g. chapter already had pages and the
/// session was lost so force was downgraded).
pub outcome: &'static str,
/// Page count when `outcome == "fetched"`. `None` for `skipped`.
pub pages: Option<usize>,
}
/// Force a metadata + cover resync for one manga and return the refreshed
/// detail together with the resync outcome.
///
/// Errors: `NotFound` for an unknown manga, `ServiceUnavailable` when no
/// resync service is configured, plus whatever `map_resync_err` produces
/// for a failed resync.
async fn resync_manga(
    State(state): State<AppState>,
    admin: RequireAdmin,
    Path(manga_id): Path<Uuid>,
) -> AppResult<Json<MangaResyncResponse>> {
    let known = repo::manga::exists(&state.db, manga_id).await?;
    if !known {
        return Err(AppError::NotFound);
    }

    let Some(service) = state.resync.as_ref() else {
        return Err(AppError::ServiceUnavailable(
            "crawler daemon is disabled; force resync unavailable".into(),
        ));
    };

    let outcome = service
        .resync_manga(manga_id)
        .await
        .map_err(map_resync_err)?;
    let metadata_status = status_str(outcome.metadata_status);
    let cover_fetched = outcome.cover_fetched;

    // The audit row carries the actor and the outcome, so "who refetched
    // this manga, and did the cover land?" is answerable from the log alone.
    repo::admin_audit::insert(
        &state.db,
        admin.0.id,
        "manga_resync",
        "manga",
        Some(manga_id),
        json!({
            "metadata_status": metadata_status,
            "cover_fetched": cover_fetched,
        }),
    )
    .await?;

    let manga = repo::manga::get_detail(&state.db, manga_id).await?;
    Ok(Json(MangaResyncResponse {
        manga,
        metadata_status,
        cover_fetched,
    }))
}
/// Force a content resync for one chapter and return the refreshed chapter
/// row together with the resync outcome.
///
/// Errors: `ServiceUnavailable` when no resync service is configured,
/// `NotFound` for an unknown chapter, plus whatever `map_resync_err`
/// produces for a failed resync.
async fn resync_chapter(
    State(state): State<AppState>,
    admin: RequireAdmin,
    Path(chapter_id): Path<Uuid>,
) -> AppResult<Json<ChapterResyncResponse>> {
    let Some(service) = state.resync.as_ref() else {
        return Err(AppError::ServiceUnavailable(
            "crawler daemon is disabled; force resync unavailable".into(),
        ));
    };

    // Resolve the owning manga first: it yields a 404 for unknown chapter
    // ids and is needed to re-read the chapter row for the response.
    let owner: Option<Uuid> = sqlx::query_scalar("SELECT manga_id FROM chapters WHERE id = $1")
        .bind(chapter_id)
        .fetch_optional(&state.db)
        .await?;
    let manga_id = owner.ok_or(AppError::NotFound)?;

    let outcome = service
        .resync_chapter(chapter_id)
        .await
        .map_err(map_resync_err)?;
    let (outcome_str, pages) = match &outcome {
        ChapterResyncOutcome::Fetched { pages, .. } => ("fetched", Some(*pages)),
        ChapterResyncOutcome::Skipped { .. } => ("skipped", None),
    };

    repo::admin_audit::insert(
        &state.db,
        admin.0.id,
        "chapter_resync",
        "chapter",
        Some(chapter_id),
        json!({
            "outcome": outcome_str,
            "pages": pages,
        }),
    )
    .await?;

    let refreshed = repo::chapter::find_by_id_in_manga(&state.db, manga_id, chapter_id).await?;
    let chapter = refreshed.ok_or(AppError::NotFound)?;

    Ok(Json(ChapterResyncResponse {
        chapter,
        outcome: outcome_str,
        pages,
    }))
}
fn status_str(s: UpsertStatus) -> &'static str {
match s {
UpsertStatus::New => "new",
UpsertStatus::Updated => "updated",
UpsertStatus::Unchanged => "unchanged",
}
}
/// Translate a resync failure into an [`AppError`].
///
/// A [`ResyncError`] found by downcasting becomes a `ValidationFailed`
/// naming the resource that has no live crawler source. Any other error is
/// passed through as `AppError::Other`.
fn map_resync_err(err: anyhow::Error) -> AppError {
    match err.downcast_ref::<ResyncError>() {
        Some(ResyncError::NoMangaSource) => AppError::ValidationFailed {
            message: "manga has no live crawler source — cannot resync".into(),
            details: json!({ "manga": "no_source" }),
        },
        Some(ResyncError::NoChapterSource) => AppError::ValidationFailed {
            message: "chapter has no live crawler source — cannot resync".into(),
            details: json!({ "chapter": "no_source" }),
        },
        None => AppError::Other(err),
    }
}

View File

@@ -1,163 +0,0 @@
//! System metrics for the admin dashboard.
//!
//! Disk is `statvfs(storage_dir)` so the number reflects the volume the
//! app actually writes to (not the root filesystem of the host). When the
//! storage backend doesn't expose a local path (e.g. a future S3 impl)
//! the disk fields are `null` rather than fabricated.
//!
//! Memory and CPU come from `sysinfo`. CPU requires two refreshes with
//! at least 200ms between them to compute a meaningful delta; the
//! handler eats the 250ms wall-clock cost on each request. Admin
//! traffic is low-volume so a background cache isn't worth the moving
//! parts yet — revisit if polling becomes frequent.
use std::path::Path;
use std::time::Duration;
use axum::extract::State;
use axum::routing::get;
use axum::{Json, Router};
use serde::Serialize;
use sysinfo::{CpuRefreshKind, MemoryRefreshKind, RefreshKind, System};
use crate::app::AppState;
use crate::auth::extractor::RequireAdmin;
use crate::error::AppResult;
const ALERT_THRESHOLD_PERCENT: f64 = 90.0;
/// Routes for the admin system-metrics endpoint (`GET /admin/system`).
pub fn routes() -> Router<AppState> {
    let router = Router::new();
    router.route("/admin/system", get(system))
}
/// Response body of `GET /admin/system`.
#[derive(Debug, Serialize)]
pub struct SystemStats {
/// Disk usage of the storage volume; `None` when the storage backend has
/// no local root or the `statvfs` call fails.
pub disk: Option<DiskStats>,
/// Memory usage sampled via `sysinfo`.
pub memory: MemoryStats,
/// CPU usage sampled via `sysinfo`.
pub cpu: CpuStats,
/// Warnings raised when disk or memory usage reaches
/// `ALERT_THRESHOLD_PERCENT`.
pub alerts: Vec<Alert>,
}
/// Disk usage derived from `statvfs` on the storage root.
#[derive(Debug, Serialize)]
pub struct DiskStats {
/// Fragment size × total block count.
pub total_bytes: u64,
/// `total_bytes` minus `free_bytes` (saturating).
pub used_bytes: u64,
/// Fragment size × blocks available to non-root callers.
pub free_bytes: u64,
/// `used_bytes` as a percentage of `total_bytes`; 0.0 when total is 0.
pub percent_used: f64,
}
/// Memory usage as reported by `sysinfo`.
#[derive(Debug, Serialize)]
pub struct MemoryStats {
/// Value of `System::total_memory()`.
pub total_bytes: u64,
/// Value of `System::used_memory()`.
pub used_bytes: u64,
/// `used_bytes` as a percentage of `total_bytes`; 0.0 when total is 0.
pub percent_used: f64,
}
/// CPU usage as reported by `sysinfo`.
#[derive(Debug, Serialize)]
pub struct CpuStats {
/// Value of `System::global_cpu_usage()`, measured across the two
/// refreshes performed per request.
pub percent_used: f64,
}
/// One dashboard alert raised by the system-metrics handler.
#[derive(Debug, Serialize)]
pub struct Alert {
/// Severity of the alert.
pub level: AlertLevel,
/// Human-readable description, e.g. "disk near full (93% used)".
pub message: String,
}
/// Alert severity; serialized in snake_case (`"warning"`).
#[derive(Debug, Serialize, Clone, Copy)]
#[serde(rename_all = "snake_case")]
pub enum AlertLevel {
/// The only level currently defined; used for the near-full thresholds.
Warning,
}
async fn system(
State(state): State<AppState>,
_admin: RequireAdmin,
) -> AppResult<Json<SystemStats>> {
let disk = state.storage.local_root().and_then(disk_stats_for);
let (memory, cpu) = memory_and_cpu().await;
let mut alerts = Vec::new();
if let Some(d) = &disk {
if d.percent_used >= ALERT_THRESHOLD_PERCENT {
alerts.push(Alert {
level: AlertLevel::Warning,
message: format!(
"disk near full ({:.0}% used)",
d.percent_used
),
});
}
}
if memory.percent_used >= ALERT_THRESHOLD_PERCENT {
alerts.push(Alert {
level: AlertLevel::Warning,
message: format!(
"memory near full ({:.0}% used)",
memory.percent_used
),
});
}
Ok(Json(SystemStats {
disk,
memory,
cpu,
alerts,
}))
}
/// Compute disk usage for the filesystem that contains `root`.
///
/// Returns `None` when `statvfs` fails for the path. Sizes are in bytes;
/// `percent_used` is 0.0 for a filesystem reporting zero total blocks.
// The casts are no-ops where the libc aliases are already 64-bit.
#[allow(clippy::unnecessary_cast)]
fn disk_stats_for(root: &Path) -> Option<DiskStats> {
    let s = nix::sys::statvfs::statvfs(root).ok()?;
    // statvfs reports `f_frsize * f_blocks` for total bytes. `f_bavail`
    // is "free to non-root callers" which is what an operator actually
    // cares about — `f_bfree` includes blocks reserved for root.
    //
    // The underlying integer types are platform-dependent aliases, so widen
    // everything to u64 before multiplying: this keeps the arithmetic
    // type-correct on targets where the block counts are narrower than the
    // fragment size, and saturation keeps a bogus report from overflowing.
    let block = s.fragment_size() as u64;
    let total = block.saturating_mul(s.blocks() as u64);
    let avail = block.saturating_mul(s.blocks_available() as u64);
    let used = total.saturating_sub(avail);
    let percent_used = if total > 0 {
        (used as f64) * 100.0 / (total as f64)
    } else {
        0.0
    };
    Some(DiskStats {
        total_bytes: total,
        used_bytes: used,
        free_bytes: avail,
        percent_used,
    })
}
/// Sample memory and global CPU usage via `sysinfo`.
///
/// CPU usage is a delta between two refreshes, so this awaits 250ms between
/// them; the wait is an async sleep, so the runtime thread is not blocked.
async fn memory_and_cpu() -> (MemoryStats, CpuStats) {
    let refresh = RefreshKind::new()
        .with_cpu(CpuRefreshKind::everything())
        .with_memory(MemoryRefreshKind::everything());
    let mut sys = System::new_with_specifics(refresh);

    // First refresh seeds the delta counters, second one measures.
    sys.refresh_cpu_all();
    tokio::time::sleep(Duration::from_millis(250)).await;
    sys.refresh_cpu_all();
    sys.refresh_memory();

    let (total_bytes, used_bytes) = (sys.total_memory(), sys.used_memory());
    let percent_used = match total_bytes {
        0 => 0.0,
        nonzero => (used_bytes as f64) * 100.0 / (nonzero as f64),
    };

    (
        MemoryStats {
            total_bytes,
            used_bytes,
            percent_used,
        },
        CpuStats {
            percent_used: sys.global_cpu_usage() as f64,
        },
    )
}

View File

@@ -1,128 +0,0 @@
//! Admin user management: list, delete, promote/demote.
//!
//! All handlers are gated by `RequireAdmin` and rely on
//! `repo::user::admin_safe_*` for self-protection and the last-admin
//! invariant. Audit rows are written inside the same DB transaction as
//! the action they record.
use axum::extract::{Path, Query, State};
use axum::http::StatusCode;
use axum::routing::{delete, get};
use axum::{Json, Router};
use serde::Deserialize;
use uuid::Uuid;
use crate::api::auth::{validate_password, validate_username};
use crate::api::pagination::PagedResponse;
use crate::app::AppState;
use crate::auth::extractor::RequireAdmin;
use crate::auth::password::hash_password;
use crate::domain::User;
use crate::error::{AppError, AppResult};
use crate::repo;
/// Routes for admin user management: list/create on the collection,
/// delete/patch on a single user.
pub fn routes() -> Router<AppState> {
    let collection = get(list_users).post(create_user);
    let single = delete(delete_user).patch(update_user);
    Router::new()
        .route("/admin/users", collection)
        .route("/admin/users/:id", single)
}
/// Query-string parameters for the admin user listing.
#[derive(Debug, Deserialize, Default)]
pub struct ListUsersParams {
/// Optional search text; the handler ignores blank/whitespace-only values.
#[serde(default)]
pub search: Option<String>,
/// Page size; defaults to `default_limit()` and is clamped to 1..=200 by
/// the handler.
#[serde(default = "default_limit")]
pub limit: i64,
/// Row offset; negative values are treated as 0 by the handler.
#[serde(default)]
pub offset: i64,
}
/// Serde default for the user listing's `limit` parameter: 50 rows.
fn default_limit() -> i64 {
50
}
/// Paginated admin user listing, optionally filtered by a search string.
/// `limit` is clamped to 1..=200, negative offsets become 0, and a blank
/// search is treated as "no filter".
async fn list_users(
    State(state): State<AppState>,
    _admin: RequireAdmin,
    Query(params): Query<ListUsersParams>,
) -> AppResult<Json<PagedResponse<User>>> {
    let limit = params.limit.clamp(1, 200);
    let offset = params.offset.max(0);
    let search = params.search.filter(|s| !s.trim().is_empty());

    let query = repo::user::ListUsersQuery {
        search,
        limit,
        offset,
    };
    let (items, total) = repo::user::list_with_total(&state.db, &query).await?;

    let page = PagedResponse::with_total(items, limit, offset, total);
    Ok(Json(page))
}
/// Request body of `PATCH /admin/users/:id`.
#[derive(Debug, Deserialize)]
pub struct UpdateUserInput {
/// New admin flag. When omitted the handler rejects the request with
/// `InvalidInput`, since it is the only updatable field.
pub is_admin: Option<bool>,
}
/// `PATCH /admin/users/:id` — promote or demote a user.
///
/// Rejects a body without `is_admin` as `InvalidInput`; the change itself
/// is delegated to `repo::user::admin_safe_set_is_admin`.
async fn update_user(
    State(state): State<AppState>,
    RequireAdmin(actor): RequireAdmin,
    Path(id): Path<Uuid>,
    Json(input): Json<UpdateUserInput>,
) -> AppResult<Json<User>> {
    let is_admin = input
        .is_admin
        .ok_or_else(|| AppError::InvalidInput("no updatable fields supplied".into()))?;

    let user = repo::user::admin_safe_set_is_admin(&state.db, actor.id, id, is_admin).await?;
    Ok(Json(user))
}
/// `DELETE /admin/users/:id` — delete a user via
/// `repo::user::admin_safe_delete`; responds `204 No Content` on success.
async fn delete_user(
    State(state): State<AppState>,
    RequireAdmin(actor): RequireAdmin,
    Path(id): Path<Uuid>,
) -> AppResult<StatusCode> {
    let actor_id = actor.id;
    repo::user::admin_safe_delete(&state.db, actor_id, id).await?;
    Ok(StatusCode::NO_CONTENT)
}
/// Request body of `POST /admin/users`.
#[derive(Debug, Deserialize)]
pub struct CreateUserInput {
/// Desired username; trimmed by the handler before validation.
pub username: String,
/// Plain-text password; validated and hashed by the handler.
pub password: String,
/// Defaults to false; admins may mint other admins in a single
/// call. Doing it as one POST avoids a second audit row for the
/// common "invite a co-admin" flow.
#[serde(default)]
pub is_admin: bool,
}
async fn create_user(
State(state): State<AppState>,
RequireAdmin(actor): RequireAdmin,
Json(input): Json<CreateUserInput>,
) -> AppResult<(StatusCode, Json<User>)> {
let username = input.username.trim();
// Reuse the canonical self-register validators so the admin-create
// path can never produce a username that self-register would
// reject (and vice versa).
validate_username(username)?;
validate_password(&input.password)?;
let pwhash = hash_password(&input.password)?;
let user = repo::user::admin_create_user(
&state.db,
actor.id,
username,
&pwhash,
input.is_admin,
)
.await?;
Ok((StatusCode::CREATED, Json(user)))
}

View File

@@ -4,8 +4,6 @@
//! expire naturally rather than being explicitly invalidated, so other
//! devices keep their existing logins).
use std::sync::OnceLock;
use axum::extract::{Path, State};
use axum::http::StatusCode;
use axum::response::IntoResponse;
@@ -28,7 +26,6 @@ use crate::repo;
pub fn routes() -> Router<AppState> {
Router::new()
.route("/auth/config", get(auth_config))
.route("/auth/register", post(register))
.route("/auth/login", post(login))
.route("/auth/logout", post(logout))
@@ -42,25 +39,6 @@ pub fn routes() -> Router<AppState> {
.route("/auth/tokens/:id", delete(delete_token))
}
/// Public, unauthenticated. Exposes anonymous-relevant auth policy so
/// the frontend can render its login / register affordances correctly
/// without a probe request that would conflate "disabled" with
/// "rate-limited". `self_register_enabled` is the *effective* value
/// (`allow_self_register && !private_mode`), so a private-mode
/// instance reports `false` even if the raw flag is on.
#[derive(Debug, Serialize)]
pub struct AuthConfigResponse {
/// Effective self-registration switch: `allow_self_register && !private_mode`.
pub self_register_enabled: bool,
/// The configured `private_mode` flag, reported as-is.
pub private_mode: bool,
}
/// `GET /auth/config` — report the effective self-registration switch and
/// the private-mode flag. Takes no auth extractor, so it is callable
/// anonymously.
async fn auth_config(State(state): State<AppState>) -> Json<AuthConfigResponse> {
    let private_mode = state.auth.private_mode;
    let self_register_enabled = state.auth.allow_self_register && !private_mode;
    Json(AuthConfigResponse {
        self_register_enabled,
        private_mode,
    })
}
#[derive(Debug, Deserialize)]
pub struct Credentials {
pub username: String,
@@ -102,17 +80,6 @@ async fn register(
jar: CookieJar,
Json(input): Json<Credentials>,
) -> AppResult<impl IntoResponse> {
// Rate limit before the disabled check so an operator who flips
// the toggle can't be probed for the toggle state via timing —
// disabled and enabled paths both consume a token, and disabled
// returns 403 instead of running argon2.
check_auth_rate_limit(&state, "register")?;
// Private mode force-blocks self-registration regardless of
// ALLOW_SELF_REGISTER — operators of locked-down instances mint
// accounts via `POST /admin/users` instead.
if !state.auth.allow_self_register || state.auth.private_mode {
return Err(AppError::Forbidden);
}
let username = input.username.trim();
validate_username(username)?;
validate_password(&input.password)?;
@@ -128,7 +95,6 @@ async fn login(
jar: CookieJar,
Json(input): Json<Credentials>,
) -> AppResult<impl IntoResponse> {
check_auth_rate_limit(&state, "login")?;
let username = input.username.trim();
if username.is_empty() || input.password.is_empty() {
return Err(AppError::InvalidInput(
@@ -136,15 +102,9 @@ async fn login(
));
}
let user = repo::user::find_by_username(&state.db, username).await?;
let Some(user) = user else {
// No such user. Run argon2 against a stable dummy hash so the
// response time matches the wrong-password branch — otherwise
// an attacker can enumerate usernames by timing the no-user
// 401 against the wrong-password 401.
let _ = verify_password(&input.password, dummy_password_hash());
return Err(AppError::Unauthenticated);
};
let user = repo::user::find_by_username(&state.db, username)
.await?
.ok_or(AppError::Unauthenticated)?;
if !verify_password(&input.password, &user.password_hash) {
return Err(AppError::Unauthenticated);
}
@@ -153,21 +113,6 @@ async fn login(
Ok((StatusCode::OK, jar, Json(AuthResponse { user })))
}
/// Process-wide dummy argon2 hash, computed on first use and then cached.
///
/// Login verifies the submitted password against this hash when the
/// username does not exist, so the "no such user" branch does the same
/// argon2 work as the "wrong password" branch. It is never meant to match
/// a real password.
fn dummy_password_hash() -> &'static str {
    static DUMMY: OnceLock<String> = OnceLock::new();

    let hash: &'static String = DUMMY.get_or_init(|| {
        crate::auth::password::hash_password("login-timing-equaliser")
            .expect("hash_password on a fixed input cannot fail")
    });
    hash.as_str()
}
async fn logout(
State(state): State<AppState>,
jar: CookieJar,
@@ -204,7 +149,6 @@ async fn change_password(
jar: CookieJar,
Json(input): Json<ChangePassword>,
) -> AppResult<impl IntoResponse> {
check_auth_rate_limit(&state, "change_password")?;
if !verify_password(&input.current_password, &user.password_hash) {
return Err(AppError::Unauthenticated);
}
@@ -286,24 +230,8 @@ async fn create_token(
Json(input): Json<CreateTokenInput>,
) -> AppResult<impl IntoResponse> {
let name = input.name.trim();
// Both arms use `ValidationFailed` (422 with field details) to
// match the structured-error shape `attach_tag` returns for the
// same kind of free-form-identifier validation. The other
// /auth/* handlers in this file use `InvalidInput` (400); the
// divergence is pre-existing and would warrant a project-wide
// pass to flip them all if the client side wants uniform per-
// field error rendering.
if name.is_empty() {
return Err(AppError::ValidationFailed {
message: "token name is required".into(),
details: serde_json::json!({ "name": "required" }),
});
}
if name.chars().count() > 64 {
return Err(AppError::ValidationFailed {
message: "token name too long".into(),
details: serde_json::json!({ "name": "max 64 characters" }),
});
return Err(AppError::InvalidInput("token name is required".into()));
}
let (raw, hash) = generate_token();
let token = repo::api_token::create(&state.db, user.id, name, &hash).await?;
@@ -339,18 +267,6 @@ async fn start_session(
Ok(jar.add(build_session_cookie(raw, &state.auth)))
}
// CSRF posture: `SameSite=Lax` is the project's primary CSRF defense.
// Browsers refuse to attach this cookie to cross-site POST / PATCH /
// DELETE requests, which covers every state-changing endpoint (auth
// mutations, uploads, bookmarks, collections, admin user management,
// etc. — all JSON over POST/PATCH/DELETE). Lax DOES still attach the
// cookie on top-level cross-site GETs, so this defense breaks the
// instant anyone adds a state-changing GET. If you reach for one,
// switch to `SameSite=Strict` here AND add an explicit CSRF-token
// check on the new endpoint. The Bearer-token branch in the
// extractor is unaffected (bots authenticate with the token header,
// not the cookie) and admin routes reject Bearer entirely — see
// `auth::extractor::RequireAdmin`.
fn build_session_cookie(raw: String, cfg: &AuthConfig) -> Cookie<'static> {
let mut builder = Cookie::build((SESSION_COOKIE_NAME, raw))
.http_only(true)
@@ -377,38 +293,7 @@ fn build_expired_cookie(cfg: &AuthConfig) -> Cookie<'static> {
builder.build()
}
/// Takes one token from the shared auth rate limiter, or rejects the
/// request with `AppError::TooManyRequests`.
///
/// Invoked first thing in `register`, `login`, and `change_password` so
/// credential stuffing / spraying / username-probe loops are throttled
/// by the configured budget (default 5/sec with a 10-request burst).
///
/// The three endpoints deliberately share a single bucket: they expose
/// the same argon2 verify-or-create work and the same enumeration
/// channels, so hammering any one of them should trip the limit.
/// `endpoint` only feeds the warning log line, letting operators see
/// which endpoint is being probed.
fn check_auth_rate_limit(state: &AppState, endpoint: &'static str) -> AppResult<()> {
    use crate::auth::rate_limit::AcquireResult;

    // Fast path: anything other than `Denied` means the request may proceed.
    let AcquireResult::Denied { retry_after_secs } = state.auth_limiter.try_acquire() else {
        return Ok(());
    };

    tracing::warn!(
        endpoint,
        retry_after_secs,
        "auth rate limit hit; returning 429"
    );
    Err(AppError::TooManyRequests {
        retry_after_secs: Some(retry_after_secs),
    })
}
// Exposed pub(crate) so the admin user-create handler can apply the
// same rules as self-registration. Keeping the lone canonical
// implementation here avoids the two paths drifting on min length /
// allowed character set.
pub(crate) fn validate_username(u: &str) -> AppResult<()> {
fn validate_username(u: &str) -> AppResult<()> {
if u.is_empty() {
return Err(AppError::InvalidInput("username is required".into()));
}
@@ -425,7 +310,7 @@ pub(crate) fn validate_username(u: &str) -> AppResult<()> {
Ok(())
}
pub(crate) fn validate_password(p: &str) -> AppResult<()> {
fn validate_password(p: &str) -> AppResult<()> {
if p.len() < 8 {
return Err(AppError::InvalidInput(
"password must be at least 8 characters".into(),

View File

@@ -67,7 +67,14 @@ async fn create(
// the foreign-key violation collapse into a generic 500.
repo::manga::get(&state.db, input.manga_id).await?;
if let Some(chapter_id) = input.chapter_id {
if !repo::chapter::belongs_to_manga(&state.db, chapter_id, input.manga_id).await? {
let exists: Option<(Uuid,)> = sqlx::query_as(
"SELECT id FROM chapters WHERE id = $1 AND manga_id = $2",
)
.bind(chapter_id)
.bind(input.manga_id)
.fetch_optional(&state.db)
.await?;
if exists.is_none() {
return Err(AppError::NotFound);
}
}

View File

@@ -196,14 +196,16 @@ async fn create(
async fn update(
State(state): State<AppState>,
CurrentUser(user): CurrentUser,
CurrentUser(_user): CurrentUser,
Path(id): Path<Uuid>,
Json(patch): Json<MangaPatch>,
) -> AppResult<Json<MangaDetail>> {
// TODO(auth): until uploaders are tracked (Phase 5), any signed-in
// user can edit any manga. Restrict to uploader + admin once that
// column lands.
if !repo::manga::exists(&state.db, id).await? {
return Err(AppError::NotFound);
}
require_can_edit(&state, id, user.id).await?;
if let Some(ref status) = patch.status {
let trimmed = status.trim();
@@ -267,14 +269,16 @@ async fn update(
/// `MangaDetail`.
async fn put_cover(
State(state): State<AppState>,
CurrentUser(user): CurrentUser,
CurrentUser(_user): CurrentUser,
Path(id): Path<Uuid>,
mut multipart: Multipart,
) -> AppResult<Json<MangaDetail>> {
// TODO(auth): until uploaders are tracked (Phase 5), any signed-in
// user can edit any manga's cover. Restrict to uploader + admin
// once that column lands.
if !repo::manga::exists(&state.db, id).await? {
return Err(AppError::NotFound);
}
require_can_edit(&state, id, user.id).await?;
let mut cover: Option<UploadedImage> = None;
while let Some(field) = next_field(&mut multipart).await? {
@@ -316,13 +320,13 @@ async fn put_cover(
/// with the unchanged detail.
async fn delete_cover(
State(state): State<AppState>,
CurrentUser(user): CurrentUser,
CurrentUser(_user): CurrentUser,
Path(id): Path<Uuid>,
) -> AppResult<Json<MangaDetail>> {
// TODO(auth): same caveat as put_cover.
if !repo::manga::exists(&state.db, id).await? {
return Err(AppError::NotFound);
}
require_can_edit(&state, id, user.id).await?;
if let Some(key) = repo::manga::get(&state.db, id).await?.cover_image_path {
match state.storage.delete(&key).await {
Ok(()) | Err(StorageError::NotFound) => {}
@@ -344,7 +348,6 @@ async fn attach_tag(
Path(id): Path<Uuid>,
Json(body): Json<AttachTagBody>,
) -> AppResult<(StatusCode, Json<TagRef>)> {
validate_tag_name(&body.name)?;
if !repo::manga::exists(&state.db, id).await? {
return Err(AppError::NotFound);
}
@@ -391,27 +394,6 @@ async fn detach_tag(
}
}
/// Validates the `name` field of a `POST /mangas/:id/tags` body.
///
/// The name is trimmed first; an empty result or one longer than 64
/// characters (counted as `char`s, not bytes) is rejected with
/// `AppError::ValidationFailed` carrying a per-field `details` object.
/// The 64-character cap mirrors the repo-level limit in
/// `repo::tag::upsert_by_name`, but failing here surfaces the problem at
/// the handler boundary in the same envelope other validations use.
fn validate_tag_name(name: &str) -> AppResult<()> {
    match name.trim().chars().count() {
        // Nothing left after trimming whitespace.
        0 => Err(AppError::ValidationFailed {
            message: "tag name cannot be empty".into(),
            details: json!({ "name": "required" }),
        }),
        len if len > 64 => Err(AppError::ValidationFailed {
            message: "tag name too long".into(),
            details: json!({ "name": "max 64 characters" }),
        }),
        _ => Ok(()),
    }
}
fn validate_new_manga(input: &NewManga) -> AppResult<()> {
if input.title.trim().is_empty() {
return Err(AppError::ValidationFailed {
@@ -431,30 +413,6 @@ fn validate_new_manga(input: &NewManga) -> AppResult<()> {
Ok(())
}
/// Decides whether `user_id` may mutate the manga `manga_id`.
///
/// Callers are expected to have checked [`repo::manga::exists`] already,
/// so that an unknown id is reported as `NotFound` rather than
/// `Forbidden`.
///
/// Policy:
/// - a recorded uploader (`uploaded_by` non-NULL) must equal the current
///   user, otherwise the call fails with `AppError::Forbidden`;
/// - legacy rows with `uploaded_by IS NULL` (pre-migration-0011) stay
///   editable by any signed-in user, since there is no owner to gate on.
///   Once an admin role lands this case can flip to admin-only.
///
/// `Forbidden` (rather than `NotFound`) is intentional on a mismatch:
/// mangas are listable via `GET /mangas`, so their existence is not a
/// secret. This differs on purpose from
/// `repo::collection::require_owner`, which collapses both states to
/// `NotFound` because collections are private and their existence is
/// itself worth hiding from non-owners.
async fn require_can_edit(state: &AppState, manga_id: Uuid, user_id: Uuid) -> AppResult<()> {
    let recorded_owner = repo::manga::uploaded_by(&state.db, manga_id).await?;

    if let Some(owner) = recorded_owner {
        if owner != user_id {
            return Err(AppError::Forbidden);
        }
    }

    // Either the caller is the uploader, or the row has no owner (legacy).
    Ok(())
}
async fn validate_genre_ids(state: &AppState, ids: &[Uuid]) -> AppResult<()> {
if ids.is_empty() {
return Ok(());

View File

@@ -1,4 +1,3 @@
pub mod admin;
pub mod auth;
pub mod authors;
pub mod bookmarks;
@@ -29,5 +28,4 @@ pub fn routes() -> Router<AppState> {
.merge(authors::routes())
.merge(collections::routes())
.merge(history::routes())
.merge(admin::routes())
}

View File

@@ -1,12 +1,10 @@
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
use std::sync::atomic::AtomicBool;
use anyhow::Context;
use async_trait::async_trait;
use axum::extract::{DefaultBodyLimit, FromRequestParts, Request, State};
use axum::extract::DefaultBodyLimit;
use axum::http::{HeaderName, HeaderValue, Method};
use axum::middleware::{self, Next};
use axum::response::Response;
use axum::Router;
use sqlx::postgres::PgPoolOptions;
use sqlx::PgPool;
@@ -14,19 +12,15 @@ use tokio_util::sync::CancellationToken;
use tower_http::cors::{AllowOrigin, CorsLayer};
use tower_http::trace::TraceLayer;
use crate::auth::extractor::CurrentUser;
use crate::auth::rate_limit::AuthRateLimiter;
use crate::error::AppError;
use crate::config::{AuthConfig, Config, CrawlerConfig, UploadConfig};
use crate::config::{AuthConfig, Config, CrawlerConfig, CrawlerModePref, UploadConfig};
use crate::crawler::browser_manager::{self, BrowserManager};
use crate::crawler::content::{self, SyncOutcome};
use crate::crawler::daemon::{self, ChapterDispatcher, DaemonConfig, MetadataPass};
use crate::crawler::jobs::JobPayload;
use crate::crawler::pipeline::{self, MetadataStats};
use crate::crawler::rate_limit::HostRateLimiters;
use crate::crawler::resync::{RealResyncService, ResyncService};
use crate::crawler::safety::DownloadAllowlist;
use crate::crawler::session;
use crate::crawler::source::{target as target_source, DiscoverMode};
use crate::repo;
use crate::storage::{LocalStorage, Storage};
@@ -36,34 +30,6 @@ pub struct AppState {
pub storage: Arc<dyn Storage>,
pub auth: AuthConfig,
pub upload: UploadConfig,
/// Shared rate limiter guarding the `/auth/*` mutation endpoints.
/// One instance per AppState so tests stay isolated across the
/// same process.
pub auth_limiter: Arc<AuthRateLimiter>,
/// Admin-triggered force resync. `None` when the crawler daemon
/// is disabled (`CRAWLER_DAEMON=false`); admin handlers gate on
/// `.is_some()` and return 503 otherwise. Set by [`build`] from the
/// same wiring that builds the daemon's chapter dispatcher, so a
/// force resync uses the daemon's BrowserManager + rate limiters.
pub resync: Option<Arc<dyn ResyncService>>,
/// Crawler observability + control handle (live status, coordinated
/// browser restart, runtime session, manual run). `None` when the
/// daemon is disabled; admin handlers gate on `.is_some()` → 503.
pub crawler: Option<Arc<CrawlerControl>>,
}
/// Shared handle the admin crawler endpoints use to observe and control
/// the running daemon. Bundled so the handlers take one optional field on
/// `AppState` rather than many.
///
/// Built once in `spawn_crawler_daemon` and stored (behind an `Arc`) in
/// `AppState::crawler`.
pub struct CrawlerControl {
/// The same browser manager the daemon's chapter dispatcher and the
/// resync service hold clones of.
pub browser_manager: Arc<BrowserManager>,
/// Runtime session controller; the browser's on-launch hook reads the
/// current session value from it on every (re)launch.
pub session: Arc<crate::crawler::session_control::SessionController>,
/// Live status surface, created with the configured chapter worker count.
pub status: crate::crawler::status::StatusHandle,
/// Used by the "run metadata pass now" endpoint; `None` when no
/// `CRAWLER_START_URL` is configured (cron disabled).
pub metadata_pass: Option<Arc<dyn MetadataPass>>,
/// Drain budget for a manually-triggered coordinated browser restart.
/// Populated from the crawler config's `job_timeout`.
pub drain_deadline: std::time::Duration,
}
/// Bundle returned by [`build`]. The router is what `axum::serve` consumes;
@@ -89,63 +55,35 @@ pub async fn build(config: Config) -> anyhow::Result<AppHandle> {
.await?;
sqlx::migrate!("./migrations").run(&db).await?;
if let Some((username, password)) = config.admin_bootstrap.as_ref() {
repo::user::bootstrap_admin(&db, username, password)
.await
.context("bootstrap_admin from ADMIN_USERNAME/ADMIN_PASSWORD env")?;
tracing::info!(admin_username = %username, "admin bootstrap ensured");
}
let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(config.storage_dir.clone()));
let (daemon, resync, crawler) = if config.crawler.daemon_enabled {
let spawned = spawn_crawler_daemon(db.clone(), Arc::clone(&storage), &config.crawler).await?;
(Some(spawned.handle), Some(spawned.resync), Some(spawned.crawler))
let daemon = if config.crawler.daemon_enabled {
Some(spawn_crawler_daemon(db.clone(), Arc::clone(&storage), &config.crawler).await?)
} else {
tracing::info!("crawler daemon disabled (CRAWLER_DAEMON=false)");
(None, None, None)
None
};
let auth_limiter = Arc::new(AuthRateLimiter::new(config.auth.rate_limit));
let state = AppState {
db,
storage,
auth: config.auth.clone(),
upload: config.upload.clone(),
auth_limiter,
resync,
crawler,
};
let router = router(state).layer(cors_layer(&config.cors_allowed_origins));
Ok(AppHandle { router, daemon })
}
/// Bundle returned by [`spawn_crawler_daemon`]. The handle owns the
/// daemon's tasks; `resync` is the operator-trigger service shared with
/// `AppState` so admin endpoints can call into the same browser /
/// rate-limit machinery.
struct SpawnedDaemon {
/// Owns the daemon's tasks; `build` returns it as `AppHandle::daemon`.
handle: daemon::DaemonHandle,
/// Force-resync service; `build` stores it in `AppState::resync`.
resync: Arc<dyn ResyncService>,
/// Observability / control handle; `build` stores it in
/// `AppState::crawler`.
crawler: Arc<CrawlerControl>,
}
async fn spawn_crawler_daemon(
db: PgPool,
storage: Arc<dyn Storage>,
cfg: &CrawlerConfig,
) -> anyhow::Result<SpawnedDaemon> {
// Reqwest client with a shared cookie jar so CDN image fetches include
// PHPSESSID. The same `Arc<Jar>` is held by the SessionController, so a
// runtime session refresh rewrites it in place. Initial value: a
// persisted runtime session (survives restart) takes precedence over
// CRAWLER_PHPSESSID env.
) -> anyhow::Result<daemon::DaemonHandle> {
// Reqwest client with cookie jar pre-seeded so CDN image fetches
// include PHPSESSID. Same shape as bin/crawler.rs main().
let cookie_jar = Arc::new(reqwest::cookie::Jar::default());
let initial_sid = crate::crawler::session_control::SessionController::load_persisted(&db)
.await
.or_else(|| cfg.phpsessid.clone());
if let (Some(sid), Some(domain), Some(start_url)) =
(&initial_sid, &cfg.cookie_domain, &cfg.start_url)
(&cfg.phpsessid, &cfg.cookie_domain, &cfg.start_url)
{
let cookie_str = format!("PHPSESSID={sid}; Domain={domain}; Path=/");
let seed_url = reqwest::Url::parse(start_url)
@@ -155,7 +93,7 @@ async fn spawn_crawler_daemon(
let mut http_builder = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(30))
.no_proxy()
.cookie_provider(Arc::clone(&cookie_jar));
.cookie_provider(cookie_jar);
if let Some(ua) = &cfg.user_agent {
http_builder = http_builder.user_agent(ua);
}
@@ -171,73 +109,29 @@ async fn spawn_crawler_daemon(
}
let rate = Arc::new(rate);
let tor = crate::crawler::tor::TorController::from_parts(
cfg.tor_control_url.as_deref(),
cfg.tor_control_password.as_deref(),
cfg.tor_control_cookie_path.as_deref(),
)
.context("build TorController from CRAWLER_TOR_CONTROL_* env")?
.map(Arc::new);
if let Some(t) = &tor {
tracing::info!(?t, "TOR control configured; transient pages will trigger NEWNYM");
}
let tor_recircuit_max = cfg.tor_recircuit_max_attempts;
// Session controller + sticky session-expired flag. Created before the
// browser so the on_launch hook can read the *current* session value
// (rather than a value captured at startup), and so a runtime refresh
// updates the cookie everywhere.
let session_expired = Arc::new(AtomicBool::new(false));
let session_controller = crate::crawler::session_control::SessionController::new(
initial_sid,
Arc::clone(&cookie_jar),
cfg.cookie_domain.clone(),
cfg.start_url.clone(),
db.clone(),
Arc::clone(&session_expired),
);
// Live status surface, sized to the worker count.
let status = crate::crawler::status::StatusHandle::new(cfg.chapter_workers);
// Browser manager. on_launch re-injects PHPSESSID on every fresh
// chromium spawn so an idle teardown followed by re-launch stays
// authenticated without operator action.
let mut launch_opts = cfg.browser.clone();
if let Some(proxy) = &cfg.proxy {
let chromium_proxy = crate::crawler::url_utils::chromium_proxy_arg(proxy);
launch_opts.extra_args.push(format!("--proxy-server={chromium_proxy}"));
launch_opts.extra_args.push(format!("--proxy-server={proxy}"));
}
let on_launch = match (&cfg.cookie_domain, &cfg.start_url) {
(Some(domain), Some(start_url)) => {
let on_launch = match (&cfg.phpsessid, &cfg.cookie_domain, &cfg.start_url) {
(Some(sid), Some(domain), Some(start_url)) => {
let sid = sid.clone();
let domain = domain.clone();
let start_url = start_url.clone();
let tor_for_launch = tor.as_ref().map(Arc::clone);
let sc = Arc::clone(&session_controller);
let on_launch: browser_manager::OnLaunch = Arc::new(move |browser| {
let sid = sid.clone();
let domain = domain.clone();
let start_url = start_url.clone();
let tor_for_launch = tor_for_launch.as_ref().map(Arc::clone);
let sc = Arc::clone(&sc);
Box::pin(async move {
// Read the *current* session each launch so a runtime
// refresh is picked up on the next (re)launch. No session
// configured → run unauthenticated (metadata needs no auth).
let Some(sid) = sc.current().await else {
tracing::info!("on_launch: no session set — skipping inject + probe");
return Ok(());
};
session::inject_phpsessid(&browser, &sid, &domain)
.await
.context("on_launch: inject_phpsessid")?;
session::verify_session_with_recircuit(
&browser,
&start_url,
tor_for_launch.as_deref(),
tor_recircuit_max,
)
.await
.context("on_launch: verify_session")?;
session::verify_session(&browser, &start_url)
.await
.context("on_launch: verify_session")?;
Ok(())
})
});
@@ -247,6 +141,8 @@ async fn spawn_crawler_daemon(
};
let browser_manager = BrowserManager::new(launch_opts, cfg.idle_timeout, on_launch);
let session_expired = Arc::new(AtomicBool::new(false));
let metadata_pass: Option<Arc<dyn MetadataPass>> = cfg.start_url.as_ref().map(|url| {
let m: Arc<dyn MetadataPass> = Arc::new(RealMetadataPass {
browser_manager: Arc::clone(&browser_manager),
@@ -255,40 +151,18 @@ async fn spawn_crawler_daemon(
http: http.clone(),
rate: Arc::clone(&rate),
start_url: url.clone(),
manga_limit: cfg.manga_limit,
download_allowlist: cfg.download_allowlist.clone(),
max_image_bytes: cfg.max_image_bytes,
metadata_max_consecutive_failures: cfg.metadata_max_consecutive_failures,
status: status.clone(),
tor: tor.as_ref().map(Arc::clone),
mode_pref: cfg.mode,
incremental_stop_after: cfg.incremental_stop_after,
});
m
});
let dispatcher: Arc<dyn ChapterDispatcher> = Arc::new(RealChapterDispatcher {
browser_manager: Arc::clone(&browser_manager),
db: db.clone(),
storage: Arc::clone(&storage),
http: http.clone(),
rate: Arc::clone(&rate),
download_allowlist: cfg.download_allowlist.clone(),
max_image_bytes: cfg.max_image_bytes,
transient_failures: Arc::new(AtomicU32::new(0)),
restart_threshold: cfg.browser_restart_threshold,
drain_deadline: cfg.job_timeout,
status: status.clone(),
tor: tor.as_ref().map(Arc::clone),
});
let resync: Arc<dyn ResyncService> = Arc::new(RealResyncService {
browser_manager: Arc::clone(&browser_manager),
db: db.clone(),
storage: Arc::clone(&storage),
http,
rate: Arc::clone(&rate),
download_allowlist: cfg.download_allowlist.clone(),
max_image_bytes: cfg.max_image_bytes,
tor: tor.as_ref().map(Arc::clone),
});
// Shared cancellation: daemon shutdown cancels the BrowserManager's
@@ -314,32 +188,18 @@ async fn spawn_crawler_daemon(
db,
cancel,
DaemonConfig {
metadata_pass: metadata_pass.clone(),
metadata_pass,
dispatcher,
chapter_workers: cfg.chapter_workers,
daily_at: cfg.daily_at,
tz: cfg.tz,
retention_days: cfg.retention_days,
session_expired,
status: status.clone(),
job_timeout: cfg.job_timeout,
extra_tasks: vec![reaper_task, shutdown_task],
},
);
let crawler = Arc::new(CrawlerControl {
browser_manager: Arc::clone(&browser_manager),
session: session_controller,
status,
metadata_pass,
drain_deadline: cfg.job_timeout,
});
Ok(SpawnedDaemon {
handle: daemon_handle,
resync,
crawler,
})
Ok(daemon_handle)
}
// Real impls of the daemon traits, owning the browser manager + I/O. Kept
@@ -354,71 +214,76 @@ struct RealMetadataPass {
http: reqwest::Client,
rate: Arc<HostRateLimiters>,
start_url: String,
manga_limit: usize,
download_allowlist: DownloadAllowlist,
max_image_bytes: usize,
metadata_max_consecutive_failures: u32,
status: crate::crawler::status::StatusHandle,
tor: Option<Arc<crate::crawler::tor::TorController>>,
mode_pref: CrawlerModePref,
incremental_stop_after: usize,
}
#[async_trait]
impl MetadataPass for RealMetadataPass {
async fn run(&self) -> anyhow::Result<MetadataStats> {
let result = pipeline::run_metadata_pass(
let mode = resolve_mode(
&self.db,
target_source::SOURCE_ID,
self.mode_pref,
self.incremental_stop_after,
)
.await?;
pipeline::run_metadata_pass(
&self.browser_manager,
&self.db,
self.storage.as_ref(),
&self.http,
&self.rate,
&self.start_url,
self.manga_limit,
0,
false,
&self.download_allowlist,
self.max_image_bytes,
self.metadata_max_consecutive_failures,
Some(&self.status),
self.tor.as_deref(),
)
.await;
if let Err(e) = &result {
if crate::crawler::nav::anyhow_looks_browser_dead(e) {
self.browser_manager.invalidate().await;
}
}
// Cover backfill follows the metadata pass even when the pass
// errored — the early-stop walk can complete its work and bail
// late, and a transient browser failure shouldn't cancel the
// residual cover backlog. The backfill has its own per-call cap
// so a runaway error stream can't monopolise the tick. It sets the
// CoverBackfill{index,total} phase + current_cover per entry.
match pipeline::backfill_missing_covers(
&self.browser_manager,
&self.db,
self.storage.as_ref(),
&self.http,
&self.rate,
pipeline::COVER_BACKFILL_DEFAULT_MAX,
&self.download_allowlist,
self.max_image_bytes,
Some(&self.status),
self.tor.as_deref(),
mode,
)
.await
{
Ok(stats) => {
if stats.considered > 0 {
tracing::info!(?stats, "cover backfill complete");
}
}
/// Pick the active mode for this tick. `Explicit` short-circuits the
/// DB lookup. `Auto` reads `seed_completed_at`: missing → Backfill
/// (initial seed for this source), present → Incremental with the
/// configured threshold.
///
/// A DB error during the Auto lookup propagates as `Err` rather than
/// silently degrading to Backfill — the daemon's `run_tick` catches
/// the error, logs, and skips the tick. That's safer than running a
/// full re-backfill (including a drop pass against stale-looking rows)
/// when the DB is flaky.
async fn resolve_mode(
db: &PgPool,
source_id: &str,
pref: CrawlerModePref,
incremental_stop_after: usize,
) -> anyhow::Result<DiscoverMode> {
match pref {
CrawlerModePref::Explicit(m) => {
tracing::info!(?m, "crawler mode: explicit (CRAWLER_MODE override)");
Ok(m)
}
CrawlerModePref::Auto => {
let seeded = repo::crawler::seed_completed_at(db, source_id)
.await
.context("seed_completed_at lookup for mode auto-detection")?;
match seeded {
Some(at) => {
tracing::info!(
seed_completed_at = %at.to_rfc3339(),
"crawler mode: auto → incremental (seed previously completed)"
);
Ok(DiscoverMode::Incremental {
stop_after_unchanged: incremental_stop_after,
})
}
}
Err(e) => {
tracing::warn!(error = ?e, "cover backfill failed");
if crate::crawler::nav::anyhow_looks_browser_dead(&e) {
self.browser_manager.invalidate().await;
None => {
tracing::info!("crawler mode: auto → backfill (no seed marker for source)");
Ok(DiscoverMode::Backfill)
}
}
}
result
}
}
@@ -428,19 +293,6 @@ struct RealChapterDispatcher {
storage: Arc<dyn Storage>,
http: reqwest::Client,
rate: Arc<HostRateLimiters>,
download_allowlist: DownloadAllowlist,
max_image_bytes: usize,
/// Consecutive transient chapter failures; resets on any success.
/// Drives the automatic coordinated browser restart.
transient_failures: Arc<std::sync::atomic::AtomicU32>,
/// Consecutive-failure count that triggers an auto restart.
restart_threshold: u32,
/// How long a coordinated restart waits for in-flight leases to drain.
drain_deadline: std::time::Duration,
/// Live status surface — the dispatcher registers each chapter it
/// crawls (with a realtime page count) here.
status: crate::crawler::status::StatusHandle,
tor: Option<Arc<crate::crawler::tor::TorController>>,
}
#[async_trait]
@@ -452,26 +304,24 @@ impl ChapterDispatcher for RealChapterDispatcher {
chapter_id,
source_chapter_key: _,
} => {
let row = repo::chapter::dispatch_target(&self.db, chapter_id)
.await
.context("look up chapter for dispatch")?;
let Some((manga_id, source_url, manga_title, chapter_number)) = row else {
// Look up manga_id + source_url for this chapter.
let row: Option<(uuid::Uuid, String)> = sqlx::query_as(
"SELECT c.manga_id, cs.source_url \
FROM chapters c \
JOIN chapter_sources cs ON cs.chapter_id = c.id \
WHERE c.id = $1 \
LIMIT 1",
)
.bind(chapter_id)
.fetch_optional(&self.db)
.await
.context("look up chapter for dispatch")?;
let Some((manga_id, source_url)) = row else {
// Chapter (or its source row) is gone — ack done.
return Ok(SyncOutcome::Skipped);
};
// Register the chapter as crawling now (live status). The
// guard removes it on every exit path — success, panic, or
// the worker's outer-timeout drop.
let _active = self.status.begin_chapter(crate::crawler::status::ActiveChapter {
manga_id,
manga_title,
chapter_id,
chapter_number,
pages_done: 0,
pages_total: None,
});
let lease = self.browser_manager.acquire().await?;
let result = content::sync_chapter_content(
let outcome = content::sync_chapter_content(
&lease,
&self.db,
self.storage.as_ref(),
@@ -481,48 +331,14 @@ impl ChapterDispatcher for RealChapterDispatcher {
manga_id,
&source_url,
false,
&self.download_allowlist,
self.max_image_bytes,
self.tor.as_deref(),
Some(&self.status),
)
.await;
.await?;
drop(lease);
match result {
Ok(outcome) => {
// Any successful dispatch (including a clean Skipped)
// means the browser is healthy — reset the streak.
self.transient_failures.store(0, Ordering::Release);
Ok(outcome)
}
Err(e) => {
let streak = self.transient_failures.fetch_add(1, Ordering::AcqRel) + 1;
if crate::crawler::nav::anyhow_looks_browser_dead(&e) {
// Hard browser-dead: lazy invalidate (next acquire
// relaunches). Reset the streak — we're recovering.
self.browser_manager.invalidate().await;
self.transient_failures.store(0, Ordering::Release);
} else if self.restart_threshold > 0 && streak >= self.restart_threshold {
// Persistent transients that TOR recircuit couldn't
// fix — proactively restart Chromium.
tracing::warn!(
streak,
threshold = self.restart_threshold,
"auto browser restart: consecutive transient chapter failures"
);
let _ = self
.browser_manager
.coordinated_restart(self.drain_deadline)
.await;
self.transient_failures.store(0, Ordering::Release);
}
Err(e)
}
}
Ok(outcome)
}
// Other payload kinds aren't dispatched by this daemon yet —
// SyncManga / SyncChapterList are handled inline by the cron's
// metadata pass.
// metadata-driven jobs (Discover/SyncManga/SyncChapterList)
// are handled inline by the cron's metadata pass.
_ => Ok(SyncOutcome::Skipped),
}
}
@@ -534,62 +350,11 @@ pub fn router(state: AppState) -> Router {
let max_request_bytes = state.upload.max_request_bytes;
Router::new()
.nest("/api/v1", crate::api::routes())
.layer(middleware::from_fn_with_state(
state.clone(),
private_mode_guard,
))
.layer(DefaultBodyLimit::max(max_request_bytes))
.with_state(state)
.layer(TraceLayer::new_for_http())
}
/// Reports whether `path` stays reachable without authentication when
/// `PRIVATE_MODE=true`.
///
/// The match is exact (no prefix or trailing-slash tolerance). The
/// exempted paths are:
/// - `/health` — reserved for load-balancer probes;
/// - `/auth/config` — lets the frontend decide whether to render the
///   login form or its anonymous alternatives;
/// - `/auth/login` and `/auth/logout` — needed for the auth flow itself;
/// - `/auth/register` — exempt so the handler can return its informative
///   `registration_disabled` 403 (the same code public-mode deployments
///   use when `ALLOW_SELF_REGISTER=false`); the handler itself blocks the
///   request in private mode, so no account is created through it.
///
/// Every other path demands a valid session cookie or bearer token.
fn is_public_in_private_mode(path: &str) -> bool {
    const OPEN_PATHS: [&str; 5] = [
        "/api/v1/health",
        "/api/v1/auth/config",
        "/api/v1/auth/login",
        "/api/v1/auth/logout",
        "/api/v1/auth/register",
    ];

    OPEN_PATHS.iter().any(|open| *open == path)
}
/// Middleware enforcing the site-wide login requirement of
/// `PRIVATE_MODE=true`.
///
/// When the flag is off the request is forwarded untouched, so public
/// deployments pay no extra DB hit. When it is on, every path that is
/// not allow-listed by [`is_public_in_private_mode`] must authenticate
/// through [`CurrentUser`] — the very extractor the handlers use (session
/// cookie first, then bearer token) — so the gate and the per-handler
/// checks cannot drift apart. Any extraction failure is reported as
/// `AppError::Unauthenticated`.
async fn private_mode_guard(
    State(state): State<AppState>,
    req: Request,
    next: Next,
) -> Result<Response, AppError> {
    // Short-circuit keeps the allow-list lookup off the public-mode path.
    let needs_auth = state.auth.private_mode && !is_public_in_private_mode(req.uri().path());
    if !needs_auth {
        return Ok(next.run(req).await);
    }

    // The extractor works on request parts; split, check, then reassemble.
    let (mut parts, body) = req.into_parts();
    CurrentUser::from_request_parts(&mut parts, &state)
        .await
        .map_err(|_| AppError::Unauthenticated)?;

    Ok(next.run(Request::from_parts(parts, body)).await)
}
pub(crate) fn cors_layer(allowed_origins: &[String]) -> CorsLayer {
if allowed_origins.is_empty() {
// Same-origin only — no CORS headers emitted.

View File

@@ -1,19 +1,11 @@
//! Auth extractors.
//! `CurrentUser` axum extractor.
//!
//! Three extractors are available, in increasing strictness:
//! Resolves a request to a logged-in user by trying, in order:
//! 1. a `mangalord_session` cookie (session lookup by `sha256(value)`);
//! 2. an `Authorization: Bearer <token>` header (api_token lookup).
//!
//! - [`CurrentUser`] — accepts either a session cookie or an
//! `Authorization: Bearer <token>` header. Used by ordinary
//! authenticated endpoints where bot tokens are first-class clients.
//! - [`CurrentSessionUser`] — accepts only the session cookie. Used as
//! the substrate for admin extraction so bot tokens cannot authenticate
//! as the admin (see [`RequireAdmin`]).
//! - [`RequireAdmin`] — composes over [`CurrentSessionUser`] and
//! additionally requires `user.is_admin`. Returns 403 for
//! authenticated-but-not-admin, 401 otherwise.
//!
//! All lookups go by `sha256(raw_token)` — the raw value is never stored
//! in the database.
//! Both paths look up by hash, never by raw value. Failure to resolve
//! either way returns 401 via `AppError::Unauthenticated`.
use axum::async_trait;
use axum::extract::FromRequestParts;
@@ -69,54 +61,3 @@ impl FromRequestParts<AppState> for CurrentUser {
Err(AppError::Unauthenticated)
}
}
/// Extractor that authenticates by session cookie alone.
///
/// Bot/API bearer tokens are deliberately NOT honoured here: this type is
/// the foundation [`RequireAdmin`] builds on, and it exists to keep admin
/// authority out of bearer-token reach.
pub struct CurrentSessionUser(pub User);

#[async_trait]
impl FromRequestParts<AppState> for CurrentSessionUser {
    type Rejection = AppError;

    /// Resolves the session cookie to a user.
    ///
    /// Fails with `AppError::Unauthenticated` when the cookie is absent,
    /// when no active session matches its hash, or when the session's
    /// user no longer exists. Errors from the repo lookups propagate
    /// through `?`.
    async fn from_request_parts(
        parts: &mut Parts,
        state: &AppState,
    ) -> Result<Self, Self::Rejection> {
        let jar = CookieJar::from_headers(&parts.headers);
        // Sessions are looked up by the hash of the cookie value.
        let token_hash = jar
            .get(SESSION_COOKIE_NAME)
            .map(|cookie| hash_token(cookie.value()))
            .ok_or(AppError::Unauthenticated)?;

        let Some(session) = repo::session::find_active(&state.db, &token_hash).await? else {
            return Err(AppError::Unauthenticated);
        };

        match repo::user::find_by_id(&state.db, session.user_id).await? {
            Some(user) => Ok(CurrentSessionUser(user)),
            None => Err(AppError::Unauthenticated),
        }
    }
}
/// Extractor that admits administrators only.
///
/// Layered on [`CurrentSessionUser`], so a bot token is turned away at
/// the authentication step (401) and never reaches the role check (403).
/// Because the user row is loaded on every request, a demotion takes
/// effect on the very next call without any session purge.
pub struct RequireAdmin(pub User);

#[async_trait]
impl FromRequestParts<AppState> for RequireAdmin {
    type Rejection = AppError;

    /// Authenticates via the session cookie, then requires
    /// `user.is_admin`; a signed-in non-admin gets `AppError::Forbidden`.
    async fn from_request_parts(
        parts: &mut Parts,
        state: &AppState,
    ) -> Result<Self, Self::Rejection> {
        let candidate = CurrentSessionUser::from_request_parts(parts, state)
            .await?
            .0;

        if candidate.is_admin {
            Ok(RequireAdmin(candidate))
        } else {
            Err(AppError::Forbidden)
        }
    }
}

View File

@@ -7,5 +7,4 @@
pub mod extractor;
pub mod password;
pub mod rate_limit;
pub mod token;

View File

@@ -1,179 +0,0 @@
//! Per-process token-bucket rate limiter for the auth endpoints.
//!
//! Protects `/auth/login`, `/auth/register`, and `/auth/me/password`
//! from credential stuffing / password spraying / username probing.
//!
//! The current deploy puts SvelteKit's hooks.server.ts proxy in front
//! of axum without forwarding the original client IP (no
//! `X-Forwarded-For`), so per-IP buckets would all collapse to the
//! proxy container's address. Until the proxy learns to forward the
//! peer address, a single global bucket gives equivalent protection
//! against mass-attack patterns and trades a small DoS surface
//! (legitimate users sharing the limit) for simplicity.
//!
//! Each `AppState` carries its own [`AuthRateLimiter`] instance, so
//! tests run in isolated buckets and won't bleed across `#[sqlx::test]`
//! cases that share a process.
use std::sync::Mutex;
use std::time::Instant;
/// Tunable limits. `per_sec == 0` disables the limiter — used by the
/// test harness and by anyone who wants to opt out via env config.
#[derive(Clone, Copy, Debug)]
pub struct RateLimitConfig {
    /// Sustained refill rate in tokens per second (one token = one
    /// request). `0` switches the limiter off entirely.
    pub per_sec: u32,
    /// Bucket capacity: how many requests may pass back-to-back before
    /// the refill rate becomes the bound. An enabled limiter treats `0`
    /// as `1` (see [`AuthRateLimiter`]).
    pub burst: u32,
}

impl Default for RateLimitConfig {
    /// Disabled by default. The production config loader (`from_env`)
    /// overrides this with a real limit; the test harness keeps the
    /// default so existing tests don't flake against shared buckets.
    fn default() -> Self {
        Self {
            per_sec: 0,
            burst: 0,
        }
    }
}

/// Production default for the sustained rate: 5 requests/sec.
/// Together with [`PRODUCTION_BURST`] this is tight enough to make
/// brute force impractical, loose enough that a real user mistyping
/// their password three times in a row doesn't hit it.
pub const PRODUCTION_PER_SEC: u32 = 5;
/// Production default for the bucket capacity: a 10-request burst.
pub const PRODUCTION_BURST: u32 = 10;

/// Mutable limiter state, always accessed under the limiter's mutex.
struct Bucket {
    /// Tokens currently available; fractional while refilling.
    tokens: f64,
    /// Instant at which `tokens` was last brought up to date.
    last_refill: Instant,
}

/// Outcome of [`AuthRateLimiter::try_acquire`]. When `Denied`, the
/// caller can use `retry_after_secs` for a `Retry-After: N` header
/// (RFC 6585 §4) so well-behaved clients back off correctly rather
/// than retrying in a tight loop.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AcquireResult {
    Allowed,
    Denied { retry_after_secs: u64 },
}

/// Single-bucket token-bucket limiter. `try_acquire` is cheap (one
/// mutex acquire, no allocations) so the auth path doesn't pay a real
/// cost for the check.
///
/// An enabled limiter (`per_sec > 0`) always has a capacity of at
/// least one token. Without that floor, `burst == 0` would deny every
/// request forever while still advertising `Retry-After: 1`.
pub struct AuthRateLimiter {
    cfg: RateLimitConfig,
    bucket: Mutex<Bucket>,
}

impl AuthRateLimiter {
    /// Builds a limiter whose bucket starts full, so the first `burst`
    /// requests pass immediately.
    pub fn new(cfg: RateLimitConfig) -> Self {
        Self {
            cfg,
            bucket: Mutex::new(Bucket {
                tokens: Self::capacity(cfg),
                last_refill: Instant::now(),
            }),
        }
    }

    /// Effective bucket capacity: the configured burst, floored at one
    /// token. The value is never consulted while the limiter is
    /// disabled.
    fn capacity(cfg: RateLimitConfig) -> f64 {
        f64::from(cfg.burst.max(1))
    }

    /// Consume one token if available. Returns `Denied` with a
    /// rounded-up seconds-until-refill so the caller can emit a
    /// `Retry-After` header.
    pub fn try_acquire(&self) -> AcquireResult {
        if self.cfg.per_sec == 0 {
            return AcquireResult::Allowed;
        }
        let rate = f64::from(self.cfg.per_sec);

        // The bucket is two plain numbers with no invariant a panic
        // could break, so a poisoned lock is safe to keep using. Taking
        // the auth endpoints down over it would be the worse outcome.
        let mut bucket = self
            .bucket
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());

        // Read the clock only once the lock is held. Sampling it before
        // locking lets a racing caller store an older instant into
        // `last_refill`, after which the same interval is credited twice.
        let now = Instant::now();
        let elapsed = now
            .saturating_duration_since(bucket.last_refill)
            .as_secs_f64();
        bucket.tokens = (bucket.tokens + elapsed * rate).min(Self::capacity(self.cfg));
        bucket.last_refill = now;

        if bucket.tokens >= 1.0 {
            bucket.tokens -= 1.0;
            AcquireResult::Allowed
        } else {
            // ceil((1 - tokens) / per_sec), minimum 1 — a `Retry-After: 0`
            // would tell clients to retry immediately, which is what we're
            // actively trying to discourage.
            let deficit = 1.0 - bucket.tokens;
            let wait_secs = (deficit / rate).ceil() as u64;
            AcquireResult::Denied {
                retry_after_secs: wait_secs.max(1),
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `per_sec == 0` means "limiter off": nothing is ever denied.
    #[test]
    fn disabled_limiter_always_allows() {
        let rl = AuthRateLimiter::new(RateLimitConfig {
            per_sec: 0,
            burst: 0,
        });
        for _ in 0..1000 {
            assert_eq!(rl.try_acquire(), AcquireResult::Allowed);
        }
    }

    #[test]
    fn burst_lets_through_initial_window_then_blocks() {
        // 1 token/sec refill, burst 3 → the three back-to-back calls
        // drain the bucket and the fourth blocks, because far less than
        // a second has passed.
        let rl = AuthRateLimiter::new(RateLimitConfig {
            per_sec: 1,
            burst: 3,
        });
        for _ in 0..3 {
            assert_eq!(rl.try_acquire(), AcquireResult::Allowed);
        }
        match rl.try_acquire() {
            AcquireResult::Denied { retry_after_secs } => {
                // Bucket is at ~0 tokens, refill rate 1/sec → ~1s wait.
                assert!(
                    retry_after_secs >= 1,
                    "retry_after must be at least 1s, got {retry_after_secs}"
                );
            }
            AcquireResult::Allowed => panic!("fourth request must be denied"),
        }
    }

    #[test]
    fn tokens_refill_over_time() {
        // 10/sec → one token every 100ms; sleeping 150ms leaves margin
        // for at least one token to be back.
        let rl = AuthRateLimiter::new(RateLimitConfig {
            per_sec: 10,
            burst: 1,
        });
        assert_eq!(rl.try_acquire(), AcquireResult::Allowed);
        assert!(matches!(rl.try_acquire(), AcquireResult::Denied { .. }));
        std::thread::sleep(std::time::Duration::from_millis(150));
        assert_eq!(
            rl.try_acquire(),
            AcquireResult::Allowed,
            "token should have refilled"
        );
    }

    #[test]
    fn retry_after_scales_inversely_with_refill_rate() {
        // 1/sec → wait ~1s after burst exhausted.
        let slow = AuthRateLimiter::new(RateLimitConfig {
            per_sec: 1,
            burst: 1,
        });
        assert_eq!(slow.try_acquire(), AcquireResult::Allowed);
        match slow.try_acquire() {
            AcquireResult::Denied { retry_after_secs } => assert_eq!(retry_after_secs, 1),
            _ => panic!("expected Denied"),
        }

        // 10/sec → the real wait is ~0.1s, but the hint is clamped to a
        // minimum of 1s so clients are never told to retry immediately.
        let fast = AuthRateLimiter::new(RateLimitConfig {
            per_sec: 10,
            burst: 1,
        });
        assert_eq!(fast.try_acquire(), AcquireResult::Allowed);
        match fast.try_acquire() {
            AcquireResult::Denied { retry_after_secs } => assert_eq!(retry_after_secs, 1),
            _ => panic!("expected Denied"),
        }
    }
}

View File

@@ -31,6 +31,7 @@ use mangalord::crawler::content::{self, SyncOutcome};
use mangalord::crawler::pipeline;
use mangalord::crawler::rate_limit::HostRateLimiters;
use mangalord::crawler::session;
use mangalord::crawler::source::DiscoverMode;
use mangalord::storage::{LocalStorage, Storage};
use sqlx::postgres::PgPoolOptions;
use sqlx::PgPool;
@@ -62,6 +63,8 @@ async fn main() -> anyhow::Result<()> {
let cdn_rate_ms = env_u64("CRAWLER_CDN_RATE_MS", rate_ms);
let limit = env_u64("CRAWLER_LIMIT", 0) as usize;
let skip_chapters = env_bool("CRAWLER_SKIP_CHAPTERS", false);
let incremental_stop_after = env_u64("CRAWLER_INCREMENTAL_STOP_AFTER", 20).max(1) as usize;
let mode = parse_crawler_mode(incremental_stop_after)?;
let skip_chapter_content = env_bool("CRAWLER_SKIP_CHAPTER_CONTENT", false);
let chapter_workers = env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize;
let force_refetch_chapters = env_bool("CRAWLER_FORCE_REFETCH_CHAPTERS", false);
@@ -78,21 +81,6 @@ async fn main() -> anyhow::Result<()> {
let proxy_url = std::env::var("CRAWLER_PROXY")
.ok()
.filter(|s| !s.trim().is_empty());
let tor_control_url = std::env::var("CRAWLER_TOR_CONTROL_URL")
.ok()
.filter(|s| !s.trim().is_empty());
let tor_control_password = std::env::var("CRAWLER_TOR_CONTROL_PASSWORD")
.ok()
.filter(|s| !s.trim().is_empty());
let tor_control_cookie_path = std::env::var("CRAWLER_TOR_CONTROL_COOKIE_PATH")
.ok()
.filter(|s| !s.trim().is_empty())
.map(std::path::PathBuf::from);
let tor_recircuit_max_attempts: u32 = std::env::var("CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS")
.ok()
.and_then(|s| s.parse().ok())
.unwrap_or(3)
.max(1);
let keep_browser_open = env_bool("CRAWLER_KEEP_BROWSER_OPEN", false);
let db = PgPoolOptions::new()
@@ -127,8 +115,7 @@ async fn main() -> anyhow::Result<()> {
let mut options = LaunchOptions::from_env();
if let Some(proxy) = &proxy_url {
let chromium_proxy = mangalord::crawler::url_utils::chromium_proxy_arg(proxy);
options.extra_args.push(format!("--proxy-server={chromium_proxy}"));
options.extra_args.push(format!("--proxy-server={proxy}"));
}
let keep_open = match (keep_browser_open, options.mode) {
(true, BrowserMode::Headed) => true,
@@ -156,21 +143,11 @@ async fn main() -> anyhow::Result<()> {
user_agent = ?user_agent,
proxy = ?proxy_url,
keep_open,
?mode,
storage_dir = %storage_dir.display(),
"starting crawler"
);
let tor = mangalord::crawler::tor::TorController::from_parts(
tor_control_url.as_deref(),
tor_control_password.as_deref(),
tor_control_cookie_path.as_deref(),
)
.context("build TorController from CRAWLER_TOR_CONTROL_* env")?
.map(Arc::new);
if let Some(t) = &tor {
tracing::info!(?t, "TOR control configured");
}
// BrowserManager with idle_timeout = ZERO so the CLI keeps Chromium
// alive for the entire run — same lifecycle as the old direct
// `browser::launch()` flow. on_launch re-injects PHPSESSID + runs the
@@ -180,24 +157,17 @@ async fn main() -> anyhow::Result<()> {
let sid = sid.clone();
let domain = domain.clone();
let start_url_clone = start_url.clone();
let tor_for_launch = tor.as_ref().map(Arc::clone);
Arc::new(move |browser| {
let sid = sid.clone();
let domain = domain.clone();
let start_url = start_url_clone.clone();
let tor_for_launch = tor_for_launch.as_ref().map(Arc::clone);
Box::pin(async move {
session::inject_phpsessid(&browser, &sid, &domain)
.await
.context("inject_phpsessid")?;
session::verify_session_with_recircuit(
&browser,
&start_url,
tor_for_launch.as_deref(),
tor_recircuit_max_attempts,
)
.await
.context("verify_session")?;
session::verify_session(&browser, &start_url)
.await
.context("verify_session")?;
Ok(())
})
})
@@ -221,7 +191,7 @@ async fn main() -> anyhow::Result<()> {
skip_chapter_content || !session_ready,
chapter_workers,
force_refetch_chapters,
tor.clone(),
mode,
)
.await;
@@ -251,7 +221,7 @@ async fn run(
skip_chapter_content: bool,
chapter_workers: usize,
force_refetch_chapters: bool,
tor: Option<Arc<mangalord::crawler::tor::TorController>>,
mode: DiscoverMode,
) -> anyhow::Result<()> {
let mut rate = HostRateLimiters::new(Duration::from_millis(rate_ms));
if let Some(host) = cdn_host {
@@ -259,39 +229,6 @@ async fn run(
}
let rate = Arc::new(rate);
// SSRF defence: only download from the catalog host + CDN host
// (plus optional CRAWLER_DOWNLOAD_ALLOWLIST extras), and cap
// single-image downloads at CRAWLER_MAX_IMAGE_BYTES bytes.
// CRAWLER_ALLOW_ANY_HOST=true short-circuits the host check for
// sharded-CDN sources; private-IP and scheme guards still apply.
let allowlist = if env_bool("CRAWLER_ALLOW_ANY_HOST", false) {
mangalord::crawler::safety::DownloadAllowlist::allow_any()
} else {
let mut allow = mangalord::crawler::safety::DownloadAllowlist::new();
if let Ok(parsed) = reqwest::Url::parse(start_url) {
if let Some(h) = parsed.host_str() {
allow = allow.allow(h);
}
}
if let Some(host) = cdn_host {
allow = allow.allow(host);
}
if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") {
for piece in extras.split(',') {
let trimmed = piece.trim();
if !trimmed.is_empty() {
allow = allow.allow(trimmed);
}
}
}
allow
};
let max_image_bytes: usize = std::env::var("CRAWLER_MAX_IMAGE_BYTES")
.ok()
.and_then(|s| s.parse().ok())
.unwrap_or(mangalord::crawler::safety::DEFAULT_MAX_IMAGE_BYTES);
let allowlist = Arc::new(allowlist);
let stats = pipeline::run_metadata_pass(
manager.as_ref(),
db,
@@ -301,14 +238,7 @@ async fn run(
start_url,
limit,
skip_chapters,
allowlist.as_ref(),
max_image_bytes,
// Circuit-breaker disabled for the operator-driven CLI: a manual
// sweep should push through transient failures, not self-abort.
0,
// No live status surface for the one-shot CLI.
None,
tor.as_deref(),
mode,
)
.await?;
tracing::info!(?stats, "metadata pass complete");
@@ -323,9 +253,6 @@ async fn run(
"target",
chapter_workers,
force_refetch_chapters,
Arc::clone(&allowlist),
max_image_bytes,
tor.clone(),
)
.await?;
}
@@ -349,9 +276,6 @@ async fn sync_bookmarked_chapter_content(
source_id: &str,
workers: usize,
force_refetch: bool,
allowlist: Arc<mangalord::crawler::safety::DownloadAllowlist>,
max_image_bytes: usize,
tor: Option<Arc<mangalord::crawler::tor::TorController>>,
) -> anyhow::Result<()> {
let pending: Vec<(Uuid, Uuid, String)> = sqlx::query_as(
r#"
@@ -388,8 +312,6 @@ async fn sync_bookmarked_chapter_content(
let storage = Arc::clone(&storage);
let rate = Arc::clone(&rate);
let manager = Arc::clone(&manager);
let allowlist = Arc::clone(&allowlist);
let tor = tor.clone();
let stats = &stats;
async move {
if session_expired.load(std::sync::atomic::Ordering::Relaxed) {
@@ -414,11 +336,6 @@ async fn sync_bookmarked_chapter_content(
manga_id,
&source_url,
force_refetch,
allowlist.as_ref(),
max_image_bytes,
tor.as_deref(),
// CLI one-shot — no live status surface.
None,
)
.await;
drop(lease);
@@ -480,6 +397,38 @@ fn resolve_start_url() -> anyhow::Result<String> {
})
}
/// Reads `CRAWLER_MODE` from the environment and resolves it to the
/// CLI's [`DiscoverMode`]. The default is `backfill`, since this binary
/// is operator-driven (manual reseeds, force-refetches) and the
/// auto-detect logic lives in the daemon. `auto` is rejected because
/// the CLI has no DB state to consult before the run.
///
/// `incremental_stop_after` becomes the `stop_after_unchanged`
/// threshold when incremental mode is selected.
fn parse_crawler_mode(incremental_stop_after: usize) -> anyhow::Result<DiscoverMode> {
    // An unset (or non-unicode) variable is handled as "no value".
    let raw_mode = std::env::var("CRAWLER_MODE").ok();
    parse_crawler_mode_str(raw_mode.as_deref(), incremental_stop_after)
}
/// Environment-free core of [`parse_crawler_mode`], so it can be tested
/// without mutating env vars.
///
/// `raw` is trimmed and lower-cased before matching. `None`, an empty
/// string and `backfill` select [`DiscoverMode::Backfill`];
/// `incremental` selects [`DiscoverMode::Incremental`] with
/// `incremental_stop_after` as its threshold. `auto` and any other
/// value produce an error.
fn parse_crawler_mode_str(
    raw: Option<&str>,
    incremental_stop_after: usize,
) -> anyhow::Result<DiscoverMode> {
    let normalized = raw.map(|value| value.trim().to_ascii_lowercase());
    // Treat "unset" exactly like an empty value.
    match normalized.as_deref().unwrap_or("") {
        "" | "backfill" => Ok(DiscoverMode::Backfill),
        "incremental" => Ok(DiscoverMode::Incremental {
            stop_after_unchanged: incremental_stop_after,
        }),
        "auto" => Err(anyhow!(
            "CRAWLER_MODE=auto isn't supported by the CLI (use backfill or incremental); \
             the daemon does auto-detection"
        )),
        other => Err(anyhow!(
            "CRAWLER_MODE must be one of: backfill, incremental (got {other:?})"
        )),
    }
}
fn env_u64(name: &str, default: u64) -> u64 {
std::env::var(name)
.ok()
@@ -495,3 +444,55 @@ fn env_bool(name: &str, default: bool) -> bool {
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn cli_mode_defaults_to_backfill_when_unset_or_blank() {
        // Unset, empty and whitespace-only must all behave the same:
        // the parser trims before matching.
        for raw in [None, Some(""), Some("   ")] {
            let mode = parse_crawler_mode_str(raw, 20).unwrap();
            assert!(matches!(mode, DiscoverMode::Backfill), "raw = {raw:?}");
        }
    }

    #[test]
    fn cli_mode_recognizes_backfill_and_incremental() {
        let backfill = parse_crawler_mode_str(Some("backfill"), 20).unwrap();
        assert!(matches!(backfill, DiscoverMode::Backfill));

        // The threshold argument must be carried into the variant.
        let incremental = parse_crawler_mode_str(Some("incremental"), 9).unwrap();
        assert!(matches!(
            incremental,
            DiscoverMode::Incremental { stop_after_unchanged: 9 }
        ));
    }

    #[test]
    fn cli_mode_rejects_auto_explicitly() {
        let err = parse_crawler_mode_str(Some("auto"), 20).unwrap_err();
        let msg = format!("{err}");
        assert!(
            msg.contains("daemon"),
            "rejection should point operator at the daemon: {msg}"
        );
    }

    #[test]
    fn cli_mode_rejects_unknown_value() {
        let err = parse_crawler_mode_str(Some("garbage"), 20).unwrap_err();
        let msg = format!("{err}");
        assert!(msg.contains("backfill"));
        assert!(msg.contains("incremental"));
    }

    #[test]
    fn cli_mode_is_case_insensitive_and_trims() {
        let mixed = parse_crawler_mode_str(Some(" Incremental "), 4).unwrap();
        assert!(matches!(
            mixed,
            DiscoverMode::Incremental { stop_after_unchanged: 4 }
        ));

        // Same normalisation on the default-mode keyword.
        let upper = parse_crawler_mode_str(Some("BACKFILL"), 4).unwrap();
        assert!(matches!(upper, DiscoverMode::Backfill));
    }
}

View File

@@ -5,28 +5,22 @@ use chrono::NaiveTime;
use chrono_tz::Tz;
use crate::crawler::browser::LaunchOptions;
use crate::crawler::safety::{DownloadAllowlist, DEFAULT_MAX_IMAGE_BYTES};
use crate::crawler::source::DiscoverMode;
/// What `CRAWLER_MODE` was set to. `Auto` is the daemon's default —
/// pick Backfill until `seed_completed_at` is written, then flip to
/// Incremental. `Explicit` forces a single mode regardless.
#[derive(Clone, Copy, Debug)]
pub enum CrawlerModePref {
    /// No fixed mode requested: `CRAWLER_MODE` was unset, blank, or
    /// `auto`.
    Auto,
    /// A specific mode was requested (`backfill` or `incremental`) and
    /// is carried here as the [`DiscoverMode`] to use.
    Explicit(DiscoverMode),
}
/// Authentication-related settings.
#[derive(Clone, Debug)]
pub struct AuthConfig {
    /// Defaults to `true`.
    /// NOTE(review): presumably controls the `Secure` attribute of the
    /// session cookie — confirm where the cookie is built.
    pub cookie_secure: bool,
    /// Defaults to `None`.
    /// NOTE(review): presumably the `Domain` attribute for the session
    /// cookie, left off when `None` — confirm where the cookie is built.
    pub cookie_domain: Option<String>,
    /// Defaults to 30; read from `SESSION_TTL_DAYS` by `from_env`.
    /// NOTE(review): presumably the session lifetime in days — confirm
    /// against the session-creation code.
    pub session_ttl_days: i64,
    /// Limits for the auth-endpoint rate limiter. The `Default` value
    /// is the disabled config; `from_env` fills it from
    /// `AUTH_RATE_PER_SEC` / `AUTH_RATE_BURST`.
    pub rate_limit: crate::auth::rate_limit::RateLimitConfig,
    /// When `false`, `POST /auth/register` returns 403
    /// `registration_disabled` and the frontend hides its register
    /// affordance. Admins can still mint accounts via
    /// `POST /admin/users`. Defaults to `true` (open registration)
    /// for backward compatibility.
    pub allow_self_register: bool,
    /// When `true`, every API path except a small allowlist
    /// (`/health`, `/auth/config`, `/auth/login`, `/auth/logout`)
    /// requires a valid session cookie or bearer token — anonymous
    /// reads are rejected with 401. Self-registration is also
    /// force-disabled regardless of [`Self::allow_self_register`]
    /// so a private instance is locked down with a single switch.
    /// Defaults to `false` (current public behaviour).
    pub private_mode: bool,
}
impl Default for AuthConfig {
@@ -35,13 +29,6 @@ impl Default for AuthConfig {
cookie_secure: true,
cookie_domain: None,
session_ttl_days: 30,
// Disabled by default so the test harness inherits a
// non-throttling limiter. Production `from_env` overrides
// to the [`PRODUCTION_PER_SEC`]/[`PRODUCTION_BURST`]
// defaults.
rate_limit: crate::auth::rate_limit::RateLimitConfig::default(),
allow_self_register: true,
private_mode: false,
}
}
}
@@ -75,13 +62,6 @@ pub struct Config {
pub upload: UploadConfig,
pub cors_allowed_origins: Vec<String>,
pub crawler: CrawlerConfig,
/// `(username, password)` for the admin user provisioned at startup
/// when both `ADMIN_USERNAME` and `ADMIN_PASSWORD` are set. `None`
/// skips the bootstrap entirely. See `repo::user::bootstrap_admin`
/// for the create-vs-promote semantics — notably the password here
/// is used only when creating a new row, never to overwrite an
/// existing one.
pub admin_bootstrap: Option<(String, String)>,
}
/// All crawler-daemon knobs read from env. Mirrors the env vars the
@@ -106,45 +86,13 @@ pub struct CrawlerConfig {
pub cookie_domain: Option<String>,
pub user_agent: Option<String>,
pub proxy: Option<String>,
/// `tcp://host:port`, `host:port`, or bare `host` (default port
/// 9051). When `None`, TOR-recircuit-on-transient is disabled and
/// the crawler behaves identically to pre-TOR releases.
pub tor_control_url: Option<String>,
/// HashedControlPassword auth. Used only when
/// `tor_control_cookie_path` is `None`.
pub tor_control_password: Option<String>,
/// Cookie-file auth path (e.g.
/// `/var/lib/tor/control_auth_cookie`). Takes precedence over
/// password when both are set.
pub tor_control_cookie_path: Option<PathBuf>,
/// Maximum NEWNYM-and-retry cycles per recircuit-eligible failure.
/// Defaults to 3.
pub tor_recircuit_max_attempts: u32,
pub browser: LaunchOptions,
/// Hosts the crawler is allowed to download images / covers from.
/// Always seeded with the host of `start_url` and (when set) the
/// configured `cdn_host`. Additional hosts can be added via
/// `CRAWLER_DOWNLOAD_ALLOWLIST` (comma-separated).
pub download_allowlist: DownloadAllowlist,
/// Hard upper bound on a single image download. Defaults to 32 MiB.
pub max_image_bytes: usize,
/// Max manga detail fetches per metadata pass. `0` means no cap
/// (full sweep up to the source's own bound). Sourced from
/// `CRAWLER_LIMIT`, mirroring the CLI binary.
pub manga_limit: usize,
/// Hard upper bound on a single chapter-content job dispatch. A job
/// exceeding this is acked failed (exponential backoff) instead of
/// wedging a worker. Defaults to 600s. `CRAWLER_JOB_TIMEOUT_SECS`.
pub job_timeout: Duration,
/// Consecutive `fetch_manga` failures that abort a metadata pass
/// (circuit-breaker for a source outage). The pass does NOT mark a
/// clean exit, so the next tick does a recovery sweep. Defaults to
/// 10. `CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES`.
pub metadata_max_consecutive_failures: u32,
/// Consecutive transient chapter failures (after TOR recircuit is
/// exhausted) that trigger an automatic coordinated browser restart.
/// Defaults to 3. `CRAWLER_BROWSER_RESTART_THRESHOLD`.
pub browser_restart_threshold: u32,
/// Mode preference for the metadata pass. Daemon default is `Auto`
/// (Backfill until `seed_completed_at` is written, then Incremental).
pub mode: CrawlerModePref,
/// `stop_after_unchanged` threshold supplied to Incremental in both
/// `Auto` (post-seed) and `Explicit(Incremental)` modes.
pub incremental_stop_after: usize,
}
impl Default for CrawlerConfig {
@@ -164,17 +112,9 @@ impl Default for CrawlerConfig {
cookie_domain: None,
user_agent: None,
proxy: None,
tor_control_url: None,
tor_control_password: None,
tor_control_cookie_path: None,
tor_recircuit_max_attempts: 3,
browser: LaunchOptions::headless(),
download_allowlist: DownloadAllowlist::new(),
max_image_bytes: DEFAULT_MAX_IMAGE_BYTES,
manga_limit: 0,
job_timeout: Duration::from_secs(600),
metadata_max_consecutive_failures: 10,
browser_restart_threshold: 3,
mode: CrawlerModePref::Auto,
incremental_stop_after: 20,
}
}
}
@@ -195,18 +135,6 @@ impl Config {
.ok()
.filter(|s| !s.is_empty()),
session_ttl_days: env_i64("SESSION_TTL_DAYS", 30),
rate_limit: crate::auth::rate_limit::RateLimitConfig {
per_sec: env_u64(
"AUTH_RATE_PER_SEC",
crate::auth::rate_limit::PRODUCTION_PER_SEC.into(),
) as u32,
burst: env_u64(
"AUTH_RATE_BURST",
crate::auth::rate_limit::PRODUCTION_BURST.into(),
) as u32,
},
allow_self_register: env_bool("ALLOW_SELF_REGISTER", true),
private_mode: env_bool("PRIVATE_MODE", false),
},
upload: UploadConfig {
max_request_bytes: env_usize("MAX_REQUEST_BYTES", 200 * 1024 * 1024),
@@ -222,21 +150,10 @@ impl Config {
})
.unwrap_or_default(),
crawler: CrawlerConfig::from_env()?,
admin_bootstrap: admin_bootstrap_from_env(),
})
}
}
/// Reads the startup-admin credentials from the environment.
///
/// Yields `Some((username, password))` only if `ADMIN_USERNAME` and
/// `ADMIN_PASSWORD` are both present and non-empty. A half-configured
/// pair counts as "no bootstrap" instead of a hard error, so an
/// operator can comment out one of the variables without crashing the
/// server.
fn admin_bootstrap_from_env() -> Option<(String, String)> {
    // Unset, non-unicode and empty values are all treated as missing.
    let non_empty = |key: &str| match std::env::var(key) {
        Ok(value) if !value.is_empty() => Some(value),
        _ => None,
    };

    match (non_empty("ADMIN_USERNAME"), non_empty("ADMIN_PASSWORD")) {
        (Some(username), Some(password)) => Some((username, password)),
        _ => None,
    }
}
impl CrawlerConfig {
pub fn from_env() -> anyhow::Result<Self> {
// Parse CRAWLER_DAILY_AT (HH:MM, 24h). Invalid → fail fast.
@@ -252,14 +169,9 @@ impl CrawlerConfig {
.parse()
.map_err(|e| anyhow::anyhow!("CRAWLER_TZ must be a valid IANA TZ (got {raw:?}): {e}"))?,
};
let start_url = std::env::var("CRAWLER_START_URL")
.ok()
.filter(|s| !s.trim().is_empty());
let cdn_host = std::env::var("CRAWLER_CDN_HOST")
.ok()
.filter(|s| !s.trim().is_empty());
let download_allowlist =
build_download_allowlist(start_url.as_deref(), cdn_host.as_deref());
let incremental_stop_after =
env_u64("CRAWLER_INCREMENTAL_STOP_AFTER", 20).max(1) as usize;
let mode = parse_mode_env(incremental_stop_after)?;
Ok(Self {
daemon_enabled: env_bool("CRAWLER_DAEMON", true),
daily_at,
@@ -267,9 +179,13 @@ impl CrawlerConfig {
idle_timeout: Duration::from_secs(env_u64("CRAWLER_IDLE_TIMEOUT_S", 600)),
chapter_workers: env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize,
retention_days: env_u64("CRAWLER_JOB_RETENTION_DAYS", 7) as u32,
start_url,
start_url: std::env::var("CRAWLER_START_URL")
.ok()
.filter(|s| !s.trim().is_empty()),
rate_ms: env_u64("CRAWLER_RATE_MS", 1000),
cdn_host,
cdn_host: std::env::var("CRAWLER_CDN_HOST")
.ok()
.filter(|s| !s.trim().is_empty()),
cdn_rate_ms: env_u64("CRAWLER_CDN_RATE_MS", env_u64("CRAWLER_RATE_MS", 1000)),
phpsessid: std::env::var("CRAWLER_PHPSESSID")
.ok()
@@ -283,71 +199,37 @@ impl CrawlerConfig {
proxy: std::env::var("CRAWLER_PROXY")
.ok()
.filter(|s| !s.trim().is_empty()),
tor_control_url: std::env::var("CRAWLER_TOR_CONTROL_URL")
.ok()
.filter(|s| !s.trim().is_empty()),
tor_control_password: std::env::var("CRAWLER_TOR_CONTROL_PASSWORD")
.ok()
.filter(|s| !s.trim().is_empty()),
tor_control_cookie_path: std::env::var("CRAWLER_TOR_CONTROL_COOKIE_PATH")
.ok()
.filter(|s| !s.trim().is_empty())
.map(PathBuf::from),
tor_recircuit_max_attempts: env_u64("CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS", 3)
.max(1) as u32,
browser: LaunchOptions::from_env(),
download_allowlist,
max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES),
manga_limit: env_usize("CRAWLER_LIMIT", 0),
job_timeout: Duration::from_secs(env_u64("CRAWLER_JOB_TIMEOUT_SECS", 600).max(1)),
metadata_max_consecutive_failures: env_u64(
"CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES",
10,
) as u32,
browser_restart_threshold: env_u64("CRAWLER_BROWSER_RESTART_THRESHOLD", 3).max(1)
as u32,
mode,
incremental_stop_after,
})
}
}
/// Build the download allowlist from env. Always includes
/// `CRAWLER_START_URL`'s host (so the crawler can fetch covers from
/// the catalog itself) and `CRAWLER_CDN_HOST` when set. Additional
/// hosts can be supplied via `CRAWLER_DOWNLOAD_ALLOWLIST` (comma-
/// separated). Empty by default — meaning the crawler refuses to
/// download anything when no source is configured, which is the safe
/// fail-closed posture.
///
/// `CRAWLER_ALLOW_ANY_HOST=true` short-circuits the host enumeration
/// for operators whose sources shard across numbered CDN subdomains.
/// Scheme + private-IP defenses still apply.
fn build_download_allowlist(
start_url: Option<&str>,
cdn_host: Option<&str>,
) -> DownloadAllowlist {
if env_bool("CRAWLER_ALLOW_ANY_HOST", false) {
return DownloadAllowlist::allow_any();
/// Resolves the daemon's mode preference from `CRAWLER_MODE`. An unset
/// or blank variable means `Auto`; the accepted keywords are `auto`,
/// `backfill` and `incremental`, matched case-insensitively. Every
/// other value is a hard error, so a typo cannot quietly fall back to
/// the default and hide itself.
fn parse_mode_env(incremental_stop_after: usize) -> anyhow::Result<CrawlerModePref> {
    // An unset (or non-unicode) variable is handled as "no value".
    let raw_mode = std::env::var("CRAWLER_MODE").ok();
    parse_mode_str(raw_mode.as_deref(), incremental_stop_after)
}
/// Pure variant of [`parse_mode_env`] — testable without env-var
/// mutation. Takes the raw value (or `None` if unset).
pub(crate) fn parse_mode_str(
raw: Option<&str>,
incremental_stop_after: usize,
) -> anyhow::Result<CrawlerModePref> {
match raw.map(|s| s.trim().to_ascii_lowercase()).as_deref() {
None | Some("") | Some("auto") => Ok(CrawlerModePref::Auto),
Some("backfill") => Ok(CrawlerModePref::Explicit(DiscoverMode::Backfill)),
Some("incremental") => Ok(CrawlerModePref::Explicit(DiscoverMode::Incremental {
stop_after_unchanged: incremental_stop_after,
})),
Some(other) => Err(anyhow::anyhow!(
"CRAWLER_MODE must be one of: auto, backfill, incremental (got {other:?})"
)),
}
let mut allow = DownloadAllowlist::new();
if let Some(url) = start_url {
if let Ok(parsed) = reqwest::Url::parse(url) {
if let Some(h) = parsed.host_str() {
allow = allow.allow(h);
}
}
}
if let Some(host) = cdn_host {
allow = allow.allow(host);
}
if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") {
for piece in extras.split(',') {
let trimmed = piece.trim();
if !trimmed.is_empty() {
allow = allow.allow(trimmed);
}
}
}
allow
}
fn env_u64(name: &str, default: u64) -> u64 {
@@ -382,88 +264,59 @@ fn env_usize(name: &str, default: usize) -> usize {
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Mutex;
// Serialise env-touching tests so concurrent cargo-test threads don't
// race on the process-global env. Re-acquire on poison since a
// panicking test still leaves the env in a consistent state for us
// (we set/unset within each guard region).
static ENV_GUARD: Mutex<()> = Mutex::new(());
#[test]
fn crawler_limit_env_populates_manga_limit() {
let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner());
std::env::set_var("CRAWLER_LIMIT", "96");
let cfg = CrawlerConfig::from_env().expect("from_env");
std::env::remove_var("CRAWLER_LIMIT");
assert_eq!(cfg.manga_limit, 96);
fn parse_mode_str_defaults_to_auto_when_unset_or_blank() {
let none = parse_mode_str(None, 20).unwrap();
assert!(matches!(none, CrawlerModePref::Auto));
let blank = parse_mode_str(Some(""), 20).unwrap();
assert!(matches!(blank, CrawlerModePref::Auto));
let whitespace = parse_mode_str(Some(" "), 20).unwrap();
assert!(matches!(whitespace, CrawlerModePref::Auto));
}
#[test]
fn crawler_limit_unset_defaults_to_zero() {
let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner());
std::env::remove_var("CRAWLER_LIMIT");
let cfg = CrawlerConfig::from_env().expect("from_env");
assert_eq!(cfg.manga_limit, 0);
fn parse_mode_str_recognizes_each_keyword() {
let auto = parse_mode_str(Some("auto"), 20).unwrap();
assert!(matches!(auto, CrawlerModePref::Auto));
let backfill = parse_mode_str(Some("backfill"), 20).unwrap();
assert!(matches!(
backfill,
CrawlerModePref::Explicit(DiscoverMode::Backfill)
));
let incremental = parse_mode_str(Some("incremental"), 7).unwrap();
assert!(matches!(
incremental,
CrawlerModePref::Explicit(DiscoverMode::Incremental {
stop_after_unchanged: 7
})
));
}
#[test]
fn reliability_knobs_default_when_unset() {
let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner());
std::env::remove_var("CRAWLER_JOB_TIMEOUT_SECS");
std::env::remove_var("CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES");
std::env::remove_var("CRAWLER_BROWSER_RESTART_THRESHOLD");
let cfg = CrawlerConfig::from_env().expect("from_env");
assert_eq!(cfg.job_timeout, Duration::from_secs(600));
assert_eq!(cfg.metadata_max_consecutive_failures, 10);
assert_eq!(cfg.browser_restart_threshold, 3);
fn parse_mode_str_is_case_insensitive_and_trims_whitespace() {
let mixed = parse_mode_str(Some(" Incremental "), 5).unwrap();
assert!(matches!(
mixed,
CrawlerModePref::Explicit(DiscoverMode::Incremental {
stop_after_unchanged: 5
})
));
let upper = parse_mode_str(Some("BACKFILL"), 5).unwrap();
assert!(matches!(
upper,
CrawlerModePref::Explicit(DiscoverMode::Backfill)
));
}
#[test]
fn reliability_knobs_parse_from_env() {
let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner());
std::env::set_var("CRAWLER_JOB_TIMEOUT_SECS", "120");
std::env::set_var("CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES", "5");
std::env::set_var("CRAWLER_BROWSER_RESTART_THRESHOLD", "7");
let cfg = CrawlerConfig::from_env().expect("from_env");
std::env::remove_var("CRAWLER_JOB_TIMEOUT_SECS");
std::env::remove_var("CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES");
std::env::remove_var("CRAWLER_BROWSER_RESTART_THRESHOLD");
assert_eq!(cfg.job_timeout, Duration::from_secs(120));
assert_eq!(cfg.metadata_max_consecutive_failures, 5);
assert_eq!(cfg.browser_restart_threshold, 7);
}
#[test]
fn private_mode_env_parses_true() {
let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner());
std::env::set_var("PRIVATE_MODE", "true");
std::env::set_var("DATABASE_URL", "postgres://test");
let cfg = Config::from_env().expect("from_env");
std::env::remove_var("PRIVATE_MODE");
std::env::remove_var("DATABASE_URL");
assert!(cfg.auth.private_mode);
}
#[test]
fn private_mode_env_parses_false() {
let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner());
std::env::set_var("PRIVATE_MODE", "false");
std::env::set_var("DATABASE_URL", "postgres://test");
let cfg = Config::from_env().expect("from_env");
std::env::remove_var("PRIVATE_MODE");
std::env::remove_var("DATABASE_URL");
assert!(!cfg.auth.private_mode);
}
#[test]
fn private_mode_defaults_to_false() {
let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner());
std::env::remove_var("PRIVATE_MODE");
std::env::set_var("DATABASE_URL", "postgres://test");
let cfg = Config::from_env().expect("from_env");
std::env::remove_var("DATABASE_URL");
assert!(!cfg.auth.private_mode);
fn parse_mode_str_hard_errors_on_unknown_value() {
let err = parse_mode_str(Some("backfil"), 20).unwrap_err();
let msg = format!("{err}");
assert!(msg.contains("backfill"), "error should list valid values: {msg}");
assert!(msg.contains("auto"));
assert!(msg.contains("incremental"));
}
}

View File

@@ -1,17 +1,10 @@
//! Chromium launcher and lifecycle.
//!
//! By default uses `chromiumoxide`'s `fetcher` feature — first call
//! downloads a known-good revision into a cache dir and reuses it
//! forever after. Set `CRAWLER_CHROMIUM_BINARY` to skip the fetcher
//! and use a system-installed Chromium instead; required on platforms
//! where the upstream snapshot bucket has no usable build (notably
//! `Linux_arm64` / Raspberry Pi). Debian's package is at
//! `/usr/bin/chromium` or `/usr/bin/chromium-headless-shell`; Ubuntu
//! ships it as `chromium-browser` at a different path — don't paste
//! the wrong one.
//!
//! `BrowserMode` toggles headed vs headless; the headed path needs a
//! display (real `$DISPLAY` or `xvfb-run`).
//! Uses `chromiumoxide`'s `fetcher` feature so we don't depend on a
//! system Chrome install — first call downloads a known-good revision
//! into a cache dir and reuses it forever after. `BrowserMode` toggles
//! headed vs headless; the headed path needs a display (real `$DISPLAY`
//! or `xvfb-run`).
//!
//! Extra Chromium command-line flags can be supplied through
//! [`LaunchOptions::extra_args`] in code, or via the
@@ -128,85 +121,54 @@ impl Handle {
}
/// Closes the browser and awaits the driver task. If other Arcs to
/// the browser are still alive we can't issue a clean CDP `close`,
/// so we abort the driver task instead — otherwise `handler.next()`
/// keeps polling forever and `Handle::close` hangs (chromiumoxide's
/// handler stream doesn't end on its own when the underlying WS
/// dies). Chromium itself is reaped by kill-on-drop once the last
/// `Arc<Browser>` is dropped.
/// the browser are still alive we fall back to drop-kills-Chromium
/// semantics and just join the driver — this is the rare case where
/// shutdown raced an outstanding worker; the OS-level kill is the
/// safety net.
pub async fn close(self) -> anyhow::Result<()> {
close_or_abort(self.browser, self.driver, |mut owned| async move {
let _ = owned.close().await;
let _ = owned.wait().await;
})
.await;
match Arc::try_unwrap(self.browser) {
Ok(mut owned) => {
let _ = owned.close().await;
let _ = owned.wait().await;
}
Err(shared) => {
tracing::warn!(
strong_count = Arc::strong_count(&shared),
"Handle::close while Arc<Browser> still shared — relying on kill-on-drop"
);
drop(shared);
}
}
let _ = self.driver.await;
Ok(())
}
}
/// Shared shutdown routine behind [`Handle::close`], kept generic so it
/// can be exercised in tests without a real Chromium process.
///
/// * Sole owner of `arc`: `on_owned` receives the unwrapped value and the
///   driver task is then joined normally.
/// * `arc` still shared: `on_owned` is skipped; the driver task is
///   aborted and then joined, so this function returns promptly.
async fn close_or_abort<T, F, Fut>(arc: Arc<T>, driver: JoinHandle<()>, on_owned: F)
where
    T: Send + 'static,
    F: FnOnce(T) -> Fut + Send,
    Fut: std::future::Future<Output = ()> + Send,
{
    let still_shared = match Arc::try_unwrap(arc) {
        Ok(exclusive) => {
            on_owned(exclusive).await;
            let _ = driver.await;
            return;
        }
        Err(still_shared) => still_shared,
    };

    tracing::warn!(
        strong_count = Arc::strong_count(&still_shared),
        "Handle::close while Arc still shared — aborting driver, relying on kill-on-drop"
    );
    drop(still_shared);
    driver.abort();
    // Joining an aborted task yields a `JoinError`; that is expected here.
    let _ = driver.await;
}
/// Launches Chromium. If `CRAWLER_CHROMIUM_BINARY` is set, uses that
/// path directly. Otherwise downloads via the `fetcher` feature on
/// first run and hits the cache after that. The fetcher cache dir is
/// Launches Chromium. Downloads it on first run via the `fetcher`
/// feature; subsequent runs hit the cache. The cache dir is
/// `$CRAWLER_CHROMIUM_DIR` if set, else `$HOME/.cache/mangalord/chromium`,
/// else `./.chromium-cache` as a last-resort repo-local fallback.
pub async fn launch(options: LaunchOptions) -> anyhow::Result<Handle> {
let executable = match system_chromium_path_from_env() {
Some(path) => {
tracing::info!(path = %path.display(), "using system chromium (CRAWLER_CHROMIUM_BINARY)");
path
}
None => {
let cache = cache_dir()?;
tokio::fs::create_dir_all(&cache)
.await
.with_context(|| format!("create cache dir {}", cache.display()))?;
let cache = cache_dir()?;
tokio::fs::create_dir_all(&cache)
.await
.with_context(|| format!("create cache dir {}", cache.display()))?;
let fetcher = BrowserFetcher::new(
BrowserFetcherOptions::builder()
.with_path(&cache)
.build()
.map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?,
);
tracing::info!(path = %cache.display(), "ensuring chromium revision is present");
let info = fetcher
.fetch()
.await
.context("download chromium via fetcher")?;
tracing::info!(executable = %info.executable_path.display(), "chromium ready");
info.executable_path
}
};
let fetcher = BrowserFetcher::new(
BrowserFetcherOptions::builder()
.with_path(&cache)
.build()
.map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?,
);
tracing::info!(path = %cache.display(), "ensuring chromium revision is present");
let info = fetcher
.fetch()
.await
.context("download chromium via fetcher")?;
tracing::info!(executable = %info.executable_path.display(), "chromium ready");
let mut builder = BrowserConfig::builder()
.chrome_executable(executable)
.chrome_executable(info.executable_path)
// Linux containers / CI commonly lack the user namespaces
// Chromium's sandbox wants. Disable it; the crawler runs in its
// own container anyway.
@@ -263,24 +225,6 @@ fn cache_dir() -> anyhow::Result<PathBuf> {
Ok(PathBuf::from("./.chromium-cache"))
}
/// Environment-facing entry point: looks up `CRAWLER_CHROMIUM_BINARY` and
/// hands the raw value to [`system_chromium_path_from_value`], which holds
/// the actual decision logic. Keeping the env read in this thin shim lets
/// tests cover that logic without mutating process state.
fn system_chromium_path_from_env() -> Option<PathBuf> {
    let raw = std::env::var_os("CRAWLER_CHROMIUM_BINARY");
    system_chromium_path_from_value(raw.as_deref())
}
/// Pure decision logic for the system-Chromium override.
///
/// Yields `Some(path)` only for a value that is both present and
/// non-empty. A set-but-blank variable (what compose's `${VAR:-}` produces
/// when the operator leaves it unfilled) is treated exactly like an unset
/// one, so chromiumoxide is never handed an empty executable path.
pub(crate) fn system_chromium_path_from_value(
    raw: Option<&std::ffi::OsStr>,
) -> Option<PathBuf> {
    match raw {
        Some(value) if !value.is_empty() => Some(PathBuf::from(value)),
        _ => None,
    }
}
#[cfg(test)]
mod tests {
use super::*;
@@ -308,33 +252,6 @@ mod tests {
assert!(parse_args(" \t\n").is_empty());
}
#[test]
fn system_chromium_path_returns_some_when_value_set() {
    // A non-empty value must come back verbatim as a path.
    let configured = std::ffi::OsString::from("/usr/bin/chromium-headless-shell");
    let resolved = system_chromium_path_from_value(Some(configured.as_os_str()));
    let expected = Some(PathBuf::from("/usr/bin/chromium-headless-shell"));
    assert_eq!(resolved, expected);
}
#[test]
fn system_chromium_path_returns_none_when_unset() {
// `None` models the variable being absent from the environment.
assert_eq!(system_chromium_path_from_value(None), None);
}
#[test]
fn system_chromium_path_treats_empty_as_unset() {
    // An exported-but-empty variable (the result of compose's `${VAR:-}`
    // when the operator left it blank) must behave like "unset", so the
    // launcher uses the fetcher instead of an empty executable path.
    let blank = std::ffi::OsString::from("");
    let resolved = system_chromium_path_from_value(Some(blank.as_os_str()));
    assert_eq!(resolved, None);
}
#[test]
fn default_launch_options_are_headless() {
// Headless is the production-safe default — no display required,
@@ -344,54 +261,4 @@ mod tests {
assert_eq!(LaunchOptions::headless().mode, BrowserMode::Headless);
assert_eq!(LaunchOptions::headed().mode, BrowserMode::Headed);
}
// Regression: if another Arc<Browser> outlives `Handle::close`, the
// old code awaited the driver task forever because the chromiumoxide
// handler stream doesn't return None on its own. Aborting the driver
// unblocks shutdown even when kill-on-drop can't fire yet.
#[tokio::test]
async fn close_or_abort_returns_when_arc_is_shared() {
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
let arc = Arc::new(());
let _keepalive = Arc::clone(&arc); // forces try_unwrap to fail
// Stand-in driver that never finishes on its own; only an abort ends it.
let driver = tokio::spawn(std::future::pending::<()>());
// Records whether the owned-path callback was invoked.
let on_owned_ran = Arc::new(AtomicBool::new(false));
let flag = Arc::clone(&on_owned_ran);
let fut = close_or_abort(arc, driver, move |_| {
let flag = Arc::clone(&flag);
async move { flag.store(true, Ordering::Release) }
});
// The timeout turns a hang into a test failure instead of a stuck run.
tokio::time::timeout(Duration::from_secs(2), fut)
.await
.expect("close_or_abort must not hang when driver is pending and Arc is shared");
assert!(
!on_owned_ran.load(Ordering::Acquire),
"on_owned must not run when the Arc is still shared"
);
}
// Counterpart to the shared-Arc case: with no other clone alive,
// `close_or_abort` must take the owned path and invoke `on_owned`.
#[tokio::test]
async fn close_or_abort_runs_on_owned_when_arc_is_unique() {
use std::sync::atomic::{AtomicBool, Ordering};
// No extra clone is kept, so `Arc::try_unwrap` succeeds.
let arc = Arc::new(());
let driver = tokio::spawn(async {}); // completes immediately
// Records whether the owned-path callback was invoked.
let on_owned_ran = Arc::new(AtomicBool::new(false));
let flag = Arc::clone(&on_owned_ran);
close_or_abort(arc, driver, move |_| {
let flag = Arc::clone(&flag);
async move { flag.store(true, Ordering::Release) }
})
.await;
assert!(
on_owned_ran.load(Ordering::Acquire),
"on_owned must run when the Arc is unique"
);
}
}

View File

@@ -13,7 +13,7 @@
//! until [`BrowserManager::shutdown`].
use std::ops::Deref;
use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::Duration;
@@ -71,42 +71,12 @@ impl ActiveTracker {
}
}
/// Lifecycle gate for a coordinated browser restart. `acquire()` parks
/// while not [`RestartPhase::Healthy`] so no new navigation starts mid-
/// restart; long-lived lease holders (the metadata pass) cooperate by
/// checking [`BrowserManager::is_restart_pending`] at safe boundaries.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum RestartPhase {
/// Normal operation — acquires proceed.
Healthy,
/// Restart requested; new acquires park, waiting for in-flight leases
/// to drain.
Draining,
/// Chromium is being closed + relaunched.
Restarting,
}
// `u8` encodings of `RestartPhase`, as stored in `BrowserManager::phase`
// (an `AtomicU8`). `BrowserManager::phase` decodes any value other than
// `PHASE_DRAINING` / `PHASE_RESTARTING` as `Healthy`, so keep these in sync
// with both `phase()` and `set_phase()` when adding a variant.
const PHASE_HEALTHY: u8 = 0;
const PHASE_DRAINING: u8 = 1;
const PHASE_RESTARTING: u8 = 2;
/// Owns the lazily launched, shared Chromium instance and hands out
/// leases to it via `acquire`.
pub struct BrowserManager {
/// Cached browser state (`handle` / `shared`); locked with `.lock().await`
/// for every launch, close and lease hand-out.
inner: Mutex<Inner>,
/// Tracker for outstanding leases; its count and idle signal are what a
/// coordinated restart waits on while draining.
active: Arc<ActiveTracker>,
/// Launch options, cloned for every Chromium launch.
launch_opts: LaunchOptions,
/// Idle timeout exposed through `idle_timeout()`.
idle_timeout: Duration,
/// Hook run against each freshly launched browser before it is published.
on_launch: OnLaunch,
/// Coarse lifecycle phase (one of the `PHASE_*` constants).
phase: AtomicU8,
/// Woken when the phase returns to `Healthy` so parked acquires resume.
resume: Notify,
/// Serialises coordinated restarts so concurrent requests collapse into
/// a single relaunch.
restart_lock: Mutex<()>,
/// Result of the most recent relaunch, so a caller that coalesced into
/// an in-progress restart reports that restart's real outcome instead
/// of a blind success.
last_restart_ok: AtomicBool,
}
struct Inner {
@@ -129,72 +99,28 @@ impl BrowserManager {
launch_opts,
idle_timeout,
on_launch,
phase: AtomicU8::new(PHASE_HEALTHY),
resume: Notify::new(),
restart_lock: Mutex::new(()),
last_restart_ok: AtomicBool::new(true),
})
}
/// Current restart phase.
pub fn phase(&self) -> RestartPhase {
match self.phase.load(Ordering::Acquire) {
PHASE_DRAINING => RestartPhase::Draining,
PHASE_RESTARTING => RestartPhase::Restarting,
_ => RestartPhase::Healthy,
}
}
/// Publish a new lifecycle phase by storing its `PHASE_*` encoding.
fn set_phase(&self, phase: RestartPhase) {
    self.phase.store(
        match phase {
            RestartPhase::Healthy => PHASE_HEALTHY,
            RestartPhase::Draining => PHASE_DRAINING,
            RestartPhase::Restarting => PHASE_RESTARTING,
        },
        Ordering::Release,
    );
}
/// `true` while a coordinated restart is draining or relaunching, i.e.
/// whenever the phase is anything other than `Healthy`. Long-lived lease
/// holders poll this at safe boundaries and give up their lease so the
/// drain can finish promptly.
pub fn is_restart_pending(&self) -> bool {
    !matches!(self.phase(), RestartPhase::Healthy)
}
/// Launch Chromium and publish it into `guard`.
///
/// The `on_launch` hook runs against the freshly launched browser
/// *before* the handle is stored, so a failing hook (e.g. a session
/// probe) never leaves a half-initialised browser cached in `guard`.
///
/// # Errors
/// Returns an error if the launch itself fails or if the `on_launch`
/// hook fails. In the latter case the just-launched browser is closed
/// before returning and `guard` is left untouched.
async fn launch_into(&self, guard: &mut Inner) -> anyhow::Result<()> {
    let handle = browser::launch(self.launch_opts.clone())
        .await
        .context("BrowserManager: launch chromium")?;
    let shared = handle.shared();
    if let Err(e) = (self.on_launch)(Arc::clone(&shared)).await {
        // Release our own reference before closing: `Handle::close` can
        // only take its clean-close path when it holds the last `Arc` to
        // the browser. Keeping `shared` alive here would force the
        // "still shared" fallback on every hook failure.
        drop(shared);
        let _ = handle.close().await;
        return Err(e.context("BrowserManager: on_launch hook failed"));
    }
    guard.handle = Some(handle);
    guard.shared = Some(shared);
    Ok(())
}
/// Acquire a shared browser lease. The first acquire after a teardown
/// launches a fresh Chromium (and runs `on_launch`); subsequent acquires
/// while a process is alive just bump the counter and clone the `Arc`.
pub async fn acquire(&self) -> anyhow::Result<BrowserLease> {
// Park while a coordinated restart is draining/relaunching so no new
// navigation starts against a browser that's about to be torn down.
// The short sleep fallback guarantees liveness even if a `resume`
// notification is missed (classic Notify lost-wakeup).
while self.phase() != RestartPhase::Healthy {
tokio::select! {
_ = self.resume.notified() => {}
_ = tokio::time::sleep(Duration::from_millis(100)) => {}
}
}
let mut guard = self.inner.lock().await;
if guard.handle.is_none() {
self.launch_into(&mut guard).await?;
let handle = browser::launch(self.launch_opts.clone())
.await
.context("BrowserManager: launch chromium")?;
let shared = handle.shared();
// Run the on-launch hook before publishing the handle so a session
// probe failure doesn't leave a half-initialized browser behind.
if let Err(e) = (self.on_launch)(Arc::clone(&shared)).await {
// Close the just-launched browser since we won't be using it.
let _ = handle.close().await;
return Err(e.context("BrowserManager: on_launch hook failed"));
}
guard.handle = Some(handle);
guard.shared = Some(shared);
}
let browser = guard
.shared
@@ -208,51 +134,6 @@ impl BrowserManager {
})
}
/// Coordinated restart: block new acquires, wait for in-flight leases
/// to drain (up to `drain_deadline`, then force), close + relaunch
/// Chromium (re-running `on_launch` → re-inject session + probe), then
/// resume parked acquirers. Concurrent calls collapse into one
/// relaunch. The phase is always returned to `Healthy` — even if the
/// relaunch errors — so a failed restart never permanently wedges
/// acquisition (the next acquire retries the launch lazily).
///
/// Returns `Ok(())` when the relaunch succeeded and the relaunch error
/// otherwise. A caller that coalesced into an already running restart
/// gets that restart's recorded outcome instead.
///
/// NOTE(review): the reset to `Healthy` is straight-line code, not a drop
/// guard. If this future is dropped during the drain or the relaunch, the
/// phase appears to stay non-`Healthy` — confirm callers never cancel it.
pub async fn coordinated_restart(&self, drain_deadline: Duration) -> anyhow::Result<()> {
// Dedup: if a restart is already running, wait for it and report
// that restart's real outcome (not a blind success).
let _restart_guard = match self.restart_lock.try_lock() {
Ok(g) => g,
Err(_) => {
// `let _ =` drops the guard immediately; the lock is taken only to
// wait until the in-progress restart has released it.
let _ = self.restart_lock.lock().await;
return if self.last_restart_ok.load(Ordering::Acquire) {
Ok(())
} else {
Err(anyhow::anyhow!("a concurrent coordinated browser restart failed"))
};
}
};
// From here on new `acquire()` calls park until the phase is `Healthy`.
self.set_phase(RestartPhase::Draining);
await_drain(&self.active, drain_deadline).await;
self.set_phase(RestartPhase::Restarting);
// The `inner` lock is scoped to this block, so it is released before
// parked acquirers are woken below.
let relaunch = {
let mut guard = self.inner.lock().await;
// Drop the manager's own browser reference before closing the handle.
guard.shared = None;
if let Some(handle) = guard.handle.take() {
let _ = handle.close().await;
}
self.launch_into(&mut guard).await
};
// Record the outcome before releasing waiters so coalesced callers
// read this restart's result.
self.last_restart_ok.store(relaunch.is_ok(), Ordering::Release);
self.set_phase(RestartPhase::Healthy);
self.resume.notify_waiters();
match &relaunch {
Ok(()) => tracing::info!("BrowserManager: coordinated restart complete"),
Err(e) => tracing::error!(error = ?e, "BrowserManager: coordinated restart relaunch failed"),
}
relaunch.context("coordinated_restart: relaunch")
}
/// Forcefully close the cached browser regardless of active count.
/// Used on daemon shutdown. After this returns the next acquire will
/// re-launch from scratch.
@@ -264,28 +145,6 @@ impl BrowserManager {
}
}
/// Discard the cached browser so the next `acquire` launches a fresh
/// Chromium.
///
/// Behaves like `shutdown`; only the intent differs. `shutdown` runs once
/// at daemon teardown, whereas `invalidate` is the recovery hook callers
/// fire after a CDP / connection / navigation failure that suggests the
/// underlying process has died. It is safe to call while other workers
/// still hold leases: their outstanding CDP operations will return
/// channel-closed errors and those workers will then re-acquire
/// (re-launching Chromium).
///
/// Idempotent: with nothing cached this only clears the (already empty)
/// state and returns.
pub async fn invalidate(&self) {
    let mut state = self.inner.lock().await;
    state.shared = None;
    let Some(stale) = state.handle.take() else {
        return;
    };
    let _ = stale.close().await;
    tracing::warn!("BrowserManager: handle invalidated — next acquire will relaunch");
}
fn idle_timeout(&self) -> Duration {
self.idle_timeout
}
@@ -295,29 +154,6 @@ impl BrowserManager {
}
}
/// Block until no leases are active or `deadline` has elapsed, whichever
/// comes first.
///
/// Each iteration waits on the tracker's idle signal but also wakes after
/// a short nap (at most 250 ms, never longer than the time remaining), so
/// a missed signal cannot strand the drain. When the deadline runs out
/// with leases still held, a warning is logged and the function returns so
/// the caller can force the restart. Kept as a free function so the
/// timing logic can be tested without launching Chromium.
async fn await_drain(active: &Arc<ActiveTracker>, deadline: Duration) {
    let started = tokio::time::Instant::now();
    while active.current() > 0 {
        match deadline.checked_sub(started.elapsed()) {
            None => {
                tracing::warn!(
                    active = active.current(),
                    "coordinated_restart: drain deadline exceeded — forcing relaunch"
                );
                return;
            }
            Some(left) => {
                let nap = left.min(Duration::from_millis(250));
                tokio::select! {
                    _ = active.idle_signal().notified() => {}
                    _ = tokio::time::sleep(nap) => {}
                }
            }
        }
    }
}
/// Background reaper. Returns immediately when `idle_timeout == 0`.
/// Otherwise spawns a task that:
/// 1. Waits on `idle_signal` (woken when active hits zero).
@@ -395,80 +231,6 @@ mod tests {
assert_send_sync(&h);
}
/// `invalidate` can be exercised without launching Chromium because it
/// does nothing beyond clearing state when no handle is cached — which is
/// exactly the idempotent path this test pins down.
#[tokio::test]
async fn invalidate_is_a_noop_when_no_handle_cached() {
    let manager = BrowserManager::new(
        crate::crawler::browser::LaunchOptions::default(),
        Duration::ZERO,
        noop_on_launch(),
    );
    // Both calls must complete; a second call would hang or panic if the
    // first had left torn state behind.
    for _ in 0..2 {
        manager.invalidate().await;
    }
}
#[tokio::test]
async fn await_drain_returns_immediately_when_already_idle() {
    // Fresh tracker: nothing is leased, so there is nothing to wait for.
    let tracker = ActiveTracker::new();
    let began = tokio::time::Instant::now();
    await_drain(&tracker, Duration::from_secs(5)).await;
    let waited = began.elapsed();
    assert!(waited < Duration::from_millis(200), "no wait when idle");
}
// A held lease that is released mid-wait must end the drain soon after
// the release, well before the deadline.
#[tokio::test]
async fn await_drain_completes_when_lease_released() {
let active = ActiveTracker::new();
active.acquire();
// Background task releases the single lease after 100 ms.
let bg = {
let a = Arc::clone(&active);
tokio::spawn(async move {
tokio::time::sleep(Duration::from_millis(100)).await;
a.release();
})
};
// Generous deadline; should return shortly after the release, not
// at the deadline.
let start = tokio::time::Instant::now();
await_drain(&active, Duration::from_secs(5)).await;
assert!(start.elapsed() < Duration::from_secs(2), "drained on release");
assert_eq!(active.current(), 0);
bg.await.unwrap();
}
// A lease that is never released must not block forever: the drain gives
// up once the deadline has passed and leaves the count untouched.
#[tokio::test]
async fn await_drain_force_returns_after_deadline_when_stuck() {
let active = ActiveTracker::new();
active.acquire(); // never released
let start = tokio::time::Instant::now();
await_drain(&active, Duration::from_millis(300)).await;
let elapsed = start.elapsed();
// Lower bound is looser than the 300 ms deadline to tolerate timer slack.
assert!(elapsed >= Duration::from_millis(250), "waited ~deadline: {elapsed:?}");
assert!(elapsed < Duration::from_secs(2), "but not forever: {elapsed:?}");
assert_eq!(active.current(), 1, "still held — caller force-restarts");
}
#[test]
fn phase_transitions_reflect_is_restart_pending() {
    let manager = BrowserManager::new(
        crate::crawler::browser::LaunchOptions::default(),
        Duration::ZERO,
        noop_on_launch(),
    );
    // A new manager starts out healthy with no restart pending.
    assert_eq!(manager.phase(), RestartPhase::Healthy);
    assert!(!manager.is_restart_pending());

    // Walk the phases in restart order: both non-healthy phases count as
    // pending, and returning to `Healthy` clears the flag again.
    manager.set_phase(RestartPhase::Draining);
    let pending_while_draining = manager.is_restart_pending();
    manager.set_phase(RestartPhase::Restarting);
    let pending_while_restarting = manager.is_restart_pending();
    manager.set_phase(RestartPhase::Healthy);
    let pending_when_healthy = manager.is_restart_pending();

    assert!(pending_while_draining);
    assert!(pending_while_restarting);
    assert!(!pending_when_healthy);
}
#[tokio::test]
async fn active_tracker_signals_idle_only_on_zero_transition() {
let tracker = ActiveTracker::new();

View File

@@ -18,8 +18,7 @@ use uuid::Uuid;
use crate::crawler::detect::PageError;
use crate::crawler::rate_limit::HostRateLimiters;
use crate::crawler::safety::{fetch_bytes_capped, looks_like_image, DownloadAllowlist};
use crate::crawler::session::{self, ChapterProbe};
use crate::crawler::session;
use crate::storage::Storage;
/// Parse the chapter page DOM and return the page images in `pageN`
@@ -73,125 +72,11 @@ pub enum SyncOutcome {
SessionExpired,
}
/// Per-chapter max fetch attempts when TOR is configured. `N = 3` means
/// up to 3 total page fetches with 2 NEWNYM signals between them. When
/// TOR is not configured the effective budget collapses to 1 (single
/// attempt, no retry, no recircuit — bit-for-bit pre-TOR behavior).
const CHAPTER_RECIRCUIT_MAX_ATTEMPTS: u32 = 3;
/// Outcome of [`fetch_chapter_html_with_recircuit`]. `Ok` carries the
/// final reader HTML; the other two map to `sync_chapter_content`'s
/// existing failure modes. A fetch error is not represented here — the
/// loop propagates it as `Err` instead.
#[derive(Debug)]
enum ChapterFetchOutcome {
/// The chapter probe classified the page as `ChapterProbe::Ok`; holds
/// that page's HTML.
Ok(String),
/// `ChapterProbe::Unauthenticated` on the final allowed attempt (the
/// very first one when `max_attempts` is 1). Caller returns
/// `SyncOutcome::SessionExpired`.
SessionExpired,
/// `ChapterProbe::Transient` on the final allowed attempt (the very
/// first one when `max_attempts` is 1). Caller bails so the dispatcher
/// does exponential backoff.
PersistentTransient,
}
/// Single rate-limited Chromium navigation to the chapter URL, returning
/// the page HTML. Extracted from `sync_chapter_content` so the recircuit
/// loop can call it once per attempt.
///
/// The tab opened here is closed on every path — including navigation or
/// content-read failures — so repeated failing attempts don't pile up
/// open pages in the shared browser.
///
/// # Errors
/// Fails if the rate limiter rejects the URL, the page cannot be opened,
/// navigation does not complete, or the HTML cannot be read. A timeout
/// while waiting for the reader marker is deliberately *not* an error.
async fn fetch_chapter_html_once(
    browser: &chromiumoxide::Browser,
    rate: &HostRateLimiters,
    source_url: &str,
) -> anyhow::Result<String> {
    rate.wait_for(source_url).await?;
    let page = browser
        .new_page(source_url)
        .await
        .with_context(|| format!("open chapter page {source_url}"))?;

    // Run the fallible steps in an inner block so the page is closed
    // afterwards regardless of where (or whether) they fail.
    let html: anyhow::Result<String> = async {
        crate::crawler::nav::wait_for_nav(&page)
            .await
            .context("wait for chapter nav")?;
        // Best-effort wait for the reader marker — same partial-render
        // race that bit the chapter-list parser can hit here. Timeout is
        // not an error; the chapter probe + parser sentinels still catch
        // real failures.
        let _ = crate::crawler::nav::wait_for_selector(
            &page,
            "a#pic_container",
            crate::crawler::nav::SELECTOR_TIMEOUT,
        )
        .await;
        page.content().await.context("read chapter html")
    }
    .await;

    // Closing is best-effort; a close failure must not mask the result.
    page.close().await.ok();
    html
}
/// Fetch-and-classify retry loop, generic over its IO so it can be driven
/// by fakes.
///
/// Each iteration calls `fetch`, classifies the HTML with
/// [`session::classify_chapter_probe`], and either returns or calls
/// `recircuit` (a no-op when TOR isn't configured) before trying again.
///
/// `max_attempts` is the **total** number of fetches including the first,
/// matching [`crate::crawler::detect::retry_on_transient`] and
/// [`run_session_probe_loop`]: `3` means up to 3 fetches with up to 2
/// NEWNYM calls between them, and `1` is the original single-shot
/// behavior with no recircuit. `Unauthenticated` and `Transient` draw on
/// the same budget, so Transient → Unauth → Ok counts as 3 attempts.
///
/// Returns `SessionExpired` / `PersistentTransient` when the final
/// allowed attempt is still `Unauthenticated` / `Transient`. A `fetch`
/// error is propagated immediately without retrying.
async fn fetch_chapter_html_with_recircuit<F, Fut, R, RFut>(
    mut fetch: F,
    mut recircuit: R,
    max_attempts: u32,
    source_url_for_msg: &str,
) -> anyhow::Result<ChapterFetchOutcome>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<String>>,
    R: FnMut() -> RFut,
    RFut: std::future::Future<Output = ()>,
{
    debug_assert!(max_attempts >= 1, "max_attempts must be at least 1");
    let mut attempt = 0u32;
    loop {
        attempt += 1;
        let html = fetch().await?;
        let budget_spent = attempt >= max_attempts;
        match session::classify_chapter_probe(&html) {
            ChapterProbe::Ok => return Ok(ChapterFetchOutcome::Ok(html)),
            ChapterProbe::Unauthenticated if budget_spent => {
                return Ok(ChapterFetchOutcome::SessionExpired);
            }
            ChapterProbe::Transient if budget_spent => {
                return Ok(ChapterFetchOutcome::PersistentTransient);
            }
            ChapterProbe::Unauthenticated => tracing::warn!(
                attempt,
                max = max_attempts,
                url = source_url_for_msg,
                "chapter probe Unauthenticated; signaling TOR NEWNYM and retrying"
            ),
            ChapterProbe::Transient => tracing::warn!(
                attempt,
                max = max_attempts,
                url = source_url_for_msg,
                "chapter probe Transient; signaling TOR NEWNYM and retrying"
            ),
        }
        // Only the retryable, budget-remaining cases fall through to here.
        recircuit().await;
    }
}
/// Fetch one chapter's images and persist them. Each image is streamed to
/// storage as it's fetched (peak memory ≈ one image, not the whole
/// chapter); the page rows + `page_count` are then written in one short
/// transaction. On any failure the chapter stays at `page_count = 0` (no
/// partial rows) and the blobs already written are deleted best-effort by
/// [`cleanup_orphans`], so a retry starts clean.
/// Fetch all images for one chapter and persist them atomically. On
/// any error after the first storage put, the DB transaction rolls
/// back so the chapter stays at `page_count = 0` and is retried on the
/// next run. Bytes already written to storage become orphans; a future
/// reaper sweeps them.
#[allow(clippy::too_many_arguments)]
pub async fn sync_chapter_content(
browser: &chromiumoxide::Browser,
@@ -203,13 +88,6 @@ pub async fn sync_chapter_content(
manga_id: Uuid,
source_url: &str,
force_refetch: bool,
allowlist: &DownloadAllowlist,
max_image_bytes: usize,
tor: Option<&crate::crawler::tor::TorController>,
// Optional live-status sink for the realtime page counter. The daemon
// dispatcher passes the shared handle (the chapter has already been
// registered via `begin_chapter`); the CLI / admin resync pass `None`.
progress: Option<&crate::crawler::status::StatusHandle>,
) -> anyhow::Result<SyncOutcome> {
// Skip if already fetched, unless caller explicitly forces.
if !force_refetch {
@@ -224,37 +102,23 @@ pub async fn sync_chapter_content(
}
}
// Fetch + classify. With TOR configured, allow up to
// CHAPTER_RECIRCUIT_MAX_ATTEMPTS total page fetches with NEWNYM
// between each. Without TOR, collapse to 1 attempt (no retry, no
// recircuit) — matches the pre-TOR single-shot behavior bit-for-bit.
let max_attempts = if tor.is_some() { CHAPTER_RECIRCUIT_MAX_ATTEMPTS } else { 1 };
let html = match fetch_chapter_html_with_recircuit(
|| fetch_chapter_html_once(browser, rate, source_url),
|| async {
if let Some(t) = tor {
if let Err(e) = t.new_identity().await {
tracing::warn!(error = %e, "TOR NEWNYM failed; continuing with same circuit");
}
}
},
max_attempts,
source_url,
)
.await?
{
ChapterFetchOutcome::Ok(html) => html,
ChapterFetchOutcome::SessionExpired => return Ok(SyncOutcome::SessionExpired),
ChapterFetchOutcome::PersistentTransient => {
// Surface as a typed Err so the dispatcher path runs
// ack_failed with exponential backoff (rather than the
// session-expired sticky flag).
anyhow::bail!(
"chapter page at {source_url} returned a transient response after \
{max_attempts} attempt(s); will retry"
);
}
};
// Nav to chapter page (rate-limited per host).
rate.wait_for(source_url).await?;
let page = browser
.new_page(source_url)
.await
.with_context(|| format!("open chapter page {source_url}"))?;
page.wait_for_navigation().await.context("wait for chapter nav")?;
// Session probe: avatar present == still logged in. Missing means
// PHPSESSID expired; bail the entire crawler run.
if page.find_element("#avatar_menu").await.is_err() {
page.close().await.ok();
return Ok(SyncOutcome::SessionExpired);
}
let html = page.content().await.context("read chapter html")?;
page.close().await.ok();
let images = parse_chapter_pages(&html)
.with_context(|| format!("parse chapter pages at {source_url}"))?;
@@ -265,127 +129,45 @@ pub async fn sync_chapter_content(
// Resolve image URLs against the chapter URL (they may be relative).
let base = reqwest::Url::parse(source_url).context("parse chapter URL")?;
// Stream each image straight to storage as it's fetched, capping peak
// memory at a single image rather than the whole chapter. Track the
// keys written so they can be rolled back if a later page (or the
// final DB commit) fails — preserving the all-or-nothing guarantee
// without holding a DB transaction open across the network puts
// (which matters once `Storage` is backed by S3).
let total = images.len();
// Publish the now-known page total so the dashboard shows "0/N".
if let Some(p) = progress {
p.set_chapter_pages(chapter_id, 0, Some(total));
}
let mut written_keys: Vec<String> = Vec::with_capacity(total);
let mut stored: Vec<StoredPage> = Vec::with_capacity(total);
// Fetch every image bytes-first into memory before writing
// anything. Lets us bail the whole chapter cleanly if any image
// fails — DB stays at page_count=0, no partial rows persisted.
let mut fetched: Vec<(i32, Vec<u8>, &'static str)> = Vec::with_capacity(images.len());
for img in &images {
match download_and_store_page(
storage,
http,
rate,
&base,
source_url,
manga_id,
chapter_id,
img,
allowlist,
max_image_bytes,
)
.await
{
Ok(page) => {
written_keys.push(page.storage_key.clone());
stored.push(page);
// Live page counter: push the climbing count to subscribers.
if let Some(p) = progress {
p.set_chapter_pages(chapter_id, stored.len(), Some(total));
}
}
Err(e) => {
cleanup_orphans(storage, &written_keys).await;
return Err(e);
}
}
let url = base.join(&img.url).with_context(|| {
format!("join image URL {} onto {source_url}", img.url)
})?;
rate.wait_for(url.as_str()).await?;
let resp = http
.get(url.clone())
// Source CDNs commonly check Referer. Set it to the
// chapter page — matches what the browser would send.
.header(reqwest::header::REFERER, source_url)
.send()
.await
.with_context(|| format!("GET {url}"))?
.error_for_status()
.with_context(|| format!("non-2xx for {url}"))?;
let bytes = resp.bytes().await.context("read image body")?.to_vec();
let ext = infer::get(&bytes).map(|k| k.extension()).unwrap_or("bin");
fetched.push((img.page_number, bytes, ext));
}
// Short transaction: page rows + page_count only, no network I/O. On
// failure, roll back the stored bytes so the chapter stays at
// page_count=0 and is retried cleanly next run.
if let Err(e) = persist_pages(db, chapter_id, &stored).await {
cleanup_orphans(storage, &written_keys).await;
return Err(e);
}
Ok(SyncOutcome::Fetched { pages: stored.len() })
}
/// A page image that has been written to storage and is awaiting its DB
/// row. Carries everything `persist_pages` needs.
pub(crate) struct StoredPage {
/// Page number within the chapter, copied from the parsed chapter image.
page_number: i32,
/// Storage key the image bytes were written under.
storage_key: String,
/// Content-type string recorded in the page row's `content_type` column.
content_type: String,
}
/// Download a single page image, validate it's really an image, and write
/// it to storage. Does not touch the DB — persistence is batched into one
/// short transaction afterward.
///
/// The image URL is resolved against `base` (it may be relative), the
/// per-host rate limiter is honoured, and the download goes through
/// `fetch_bytes_capped` with `source_url` passed as its third argument
/// plus the given `allowlist` and `max_image_bytes`.
///
/// Returns a [`StoredPage`] carrying the storage key and the detected
/// MIME type (e.g. `image/jpeg`, not the extension-derived `image/jpg`).
///
/// # Errors
/// Fails if the URL cannot be joined, rate limiting or the download
/// fails, the bytes are not recognised as an image, or the storage write
/// fails.
#[allow(clippy::too_many_arguments)]
async fn download_and_store_page(
    storage: &dyn Storage,
    http: &reqwest::Client,
    rate: &HostRateLimiters,
    base: &reqwest::Url,
    source_url: &str,
    manga_id: Uuid,
    chapter_id: Uuid,
    img: &ChapterImage,
    allowlist: &DownloadAllowlist,
    max_image_bytes: usize,
) -> anyhow::Result<StoredPage> {
    let url = base
        .join(&img.url)
        .with_context(|| format!("join image URL {} onto {source_url}", img.url))?;
    rate.wait_for(url.as_str()).await?;
    let bytes =
        fetch_bytes_capped(http, url.as_str(), Some(source_url), allowlist, max_image_bytes)
            .await?;
    // Reject any non-image response: the only valid output of an image URL
    // is an image. `infer` returns None on truncated bytes too, which also
    // wants to be a failure not a silent `.bin` extension.
    if !looks_like_image(&bytes) {
        anyhow::bail!(
            "image URL {url} returned non-image bytes \
(first 16: {:?}); refusing to store as binary blob",
            &bytes.get(..16.min(bytes.len()))
        );
    }
    // Detect the type once and surface a mismatch as an error instead of
    // panicking, should `looks_like_image` and `infer` ever disagree.
    let kind = infer::get(&bytes).ok_or_else(|| {
        anyhow::anyhow!("image URL {url} passed the image check but its type could not be inferred")
    })?;
    let ext = kind.extension();
    let key = format!(
        "mangas/{manga_id}/chapters/{chapter_id}/pages/{:04}.{ext}",
        img.page_number
    );
    storage
        .put(&key, &bytes)
        .await
        .with_context(|| format!("put {key}"))?;
    Ok(StoredPage {
        page_number: img.page_number,
        storage_key: key,
        // Use the real MIME type: the extension is not always a valid
        // subtype ("jpg" vs the registered "image/jpeg").
        content_type: kind.mime_type().to_string(),
    })
}
/// Persist the page rows + chapter `page_count` in one short transaction.
/// `(chapter_id, page_number)` is unique so re-runs are idempotent.
pub(crate) async fn persist_pages(
db: &PgPool,
chapter_id: Uuid,
stored: &[StoredPage],
) -> anyhow::Result<()> {
// Atomic write: storage puts + page row inserts + page_count
// update, all in one transaction. If anything fails, rollback +
// the chapter is retried next run. Storage orphans the bytes; a
// reaper sweeps them later.
let mut tx = db.begin().await.context("open chapter sync tx")?;
for page in stored {
for (page_number, bytes, ext) in &fetched {
let key = format!(
"mangas/{manga_id}/chapters/{chapter_id}/pages/{:04}.{ext}",
page_number
);
storage
.put(&key, bytes)
.await
.with_context(|| format!("put {key}"))?;
// (chapter_id, page_number) is unique — re-runs idempotent.
sqlx::query(
"INSERT INTO pages (chapter_id, page_number, storage_key, content_type)
VALUES ($1, $2, $3, $4)
@@ -394,41 +176,26 @@ pub(crate) async fn persist_pages(
content_type = EXCLUDED.content_type",
)
.bind(chapter_id)
.bind(page.page_number)
.bind(&page.storage_key)
.bind(&page.content_type)
.bind(page_number)
.bind(&key)
.bind(format!("image/{ext}"))
.execute(&mut *tx)
.await
.with_context(|| format!("insert page row {}", page.page_number))?;
.with_context(|| format!("insert page row {page_number}"))?;
}
sqlx::query("UPDATE chapters SET page_count = $1 WHERE id = $2")
.bind(stored.len() as i32)
.bind(fetched.len() as i32)
.bind(chapter_id)
.execute(&mut *tx)
.await
.context("update page_count")?;
tx.commit().await.context("commit chapter sync")?;
Ok(())
Ok(SyncOutcome::Fetched { pages: fetched.len() })
}
/// Remove page blobs that were written before a chapter sync failed.
///
/// Each key in `keys` is deleted from `storage` in order. The function is
/// deliberately best-effort: a failed delete is reported at `warn` level
/// and the loop moves on to the next key, so the caller never sees an
/// error from cleanup.
pub(crate) async fn cleanup_orphans(storage: &dyn Storage, keys: &[String]) {
    for key in keys.iter() {
        let outcome = storage.delete(key).await;
        match outcome {
            Ok(_) => continue,
            Err(e) => {
                // Log and keep going: one stubborn blob must not stop the
                // remaining keys from being cleaned up.
                tracing::warn!(
                    %key,
                    error = ?e,
                    "failed to delete orphaned page blob after chapter sync failure"
                );
            }
        }
    }
}
// Suppress unused-import warning for `session::registrable_domain`
// until the bin/crawler wiring lands in this branch and uses it
// through this module.
// Suppress unused-import warning for `session` until the bin/crawler
// wiring lands in this branch and uses it through this module.
#[allow(dead_code)]
fn _keep_session_in_scope() {
let _ = session::registrable_domain;
@@ -437,90 +204,6 @@ fn _keep_session_in_scope() {
#[cfg(test)]
mod tests {
use super::*;
use crate::storage::LocalStorage;
#[tokio::test]
async fn cleanup_orphans_deletes_written_keys() {
let dir = tempfile::tempdir().unwrap();
let storage = LocalStorage::new(dir.path());
let keys = vec![
"mangas/m/chapters/c/pages/0001.jpg".to_string(),
"mangas/m/chapters/c/pages/0002.jpg".to_string(),
];
for k in &keys {
storage.put(k, b"\xff\xd8\xff\xe0 jpeg-ish").await.unwrap();
assert!(storage.exists(k).await.unwrap());
}
cleanup_orphans(&storage, &keys).await;
for k in &keys {
assert!(!storage.exists(k).await.unwrap(), "{k} should be deleted");
}
}
#[tokio::test]
async fn cleanup_orphans_tolerates_missing_keys() {
// A key that was never written (e.g. the put itself failed) must
// not make cleanup error — it's best-effort.
let dir = tempfile::tempdir().unwrap();
let storage = LocalStorage::new(dir.path());
cleanup_orphans(&storage, &["never/written.jpg".to_string()]).await;
}
#[sqlx::test(migrations = "./migrations")]
async fn persist_pages_inserts_rows_and_sets_page_count(pool: PgPool) {
let manga_id = Uuid::new_v4();
let chapter_id = Uuid::new_v4();
sqlx::query("INSERT INTO mangas (id, title) VALUES ($1, 'T')")
.bind(manga_id)
.execute(&pool)
.await
.unwrap();
sqlx::query("INSERT INTO chapters (id, manga_id, number) VALUES ($1, $2, 1)")
.bind(chapter_id)
.bind(manga_id)
.execute(&pool)
.await
.unwrap();
let stored = vec![
StoredPage {
page_number: 1,
storage_key: "k/0001.jpg".into(),
content_type: "image/jpeg".into(),
},
StoredPage {
page_number: 2,
storage_key: "k/0002.jpg".into(),
content_type: "image/jpeg".into(),
},
];
persist_pages(&pool, chapter_id, &stored).await.unwrap();
let page_count: i32 =
sqlx::query_scalar("SELECT page_count FROM chapters WHERE id = $1")
.bind(chapter_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(page_count, 2);
let rows: i64 =
sqlx::query_scalar("SELECT COUNT(*) FROM pages WHERE chapter_id = $1")
.bind(chapter_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(rows, 2);
// Idempotent re-run (force refetch path): same rows, page_count stable.
persist_pages(&pool, chapter_id, &stored).await.unwrap();
let rows2: i64 =
sqlx::query_scalar("SELECT COUNT(*) FROM pages WHERE chapter_id = $1")
.bind(chapter_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(rows2, 2, "re-run is idempotent via ON CONFLICT");
}
#[test]
fn parse_chapter_pages_skips_loader_and_sorts_by_id() {
@@ -582,214 +265,4 @@ mod tests {
let err = parse_chapter_pages(html).expect_err("expected Transient");
assert!(err.is_transient(), "got non-transient: {err}");
}
// --- fetch_chapter_html_with_recircuit -------------------------------
const OK_HTML: &str = r#"<html><body><a id="pic_container"><img id="page1" src="x"/></a></body></html>"#;
const UNAUTH_HTML: &str = r#"<html><body><header><div id="logo">x</div></header><main>please log in</main></body></html>"#;
const TRANSIENT_HTML: &str = "<html><body><p>we're sorry, the request file are not found.</p></body></html>";
#[tokio::test]
async fn recircuit_loop_ok_first_attempt() {
let mut recircuits = 0u32;
let mut fetches = 0u32;
let outcome = fetch_chapter_html_with_recircuit(
|| {
fetches += 1;
async { Ok(OK_HTML.to_string()) }
},
|| {
recircuits += 1;
async {}
},
3,
"https://example/c",
)
.await
.expect("ok");
assert!(matches!(outcome, ChapterFetchOutcome::Ok(_)));
assert_eq!(fetches, 1);
assert_eq!(recircuits, 0);
}
#[tokio::test]
async fn recircuit_loop_unauth_with_single_attempt_returns_session_expired() {
// max_attempts=1 = TOR disabled, fail-fast on first Unauthenticated.
let mut recircuits = 0u32;
let mut fetches = 0u32;
let outcome = fetch_chapter_html_with_recircuit(
|| {
fetches += 1;
async { Ok(UNAUTH_HTML.to_string()) }
},
|| {
recircuits += 1;
async {}
},
1,
"https://example/c",
)
.await
.expect("ok-result");
assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired));
assert_eq!(fetches, 1);
assert_eq!(recircuits, 0, "no recircuit when budget is 1 (TOR disabled)");
}
#[tokio::test]
async fn recircuit_loop_unauth_then_ok_within_budget() {
// max_attempts=3 = up to 3 fetches with 2 recircuits between.
let mut recircuits = 0u32;
let mut fetch_n = 0u32;
let outcome = fetch_chapter_html_with_recircuit(
|| {
fetch_n += 1;
let n = fetch_n;
async move {
if n == 1 {
Ok(UNAUTH_HTML.to_string())
} else {
Ok(OK_HTML.to_string())
}
}
},
|| {
recircuits += 1;
async {}
},
3,
"https://example/c",
)
.await
.expect("ok");
assert!(matches!(outcome, ChapterFetchOutcome::Ok(_)));
assert_eq!(fetch_n, 2);
assert_eq!(recircuits, 1);
}
#[tokio::test]
async fn recircuit_loop_unauth_exhausts_budget_returns_session_expired() {
let mut recircuits = 0u32;
let mut fetch_n = 0u32;
let outcome = fetch_chapter_html_with_recircuit(
|| {
fetch_n += 1;
async { Ok(UNAUTH_HTML.to_string()) }
},
|| {
recircuits += 1;
async {}
},
3,
"https://example/c",
)
.await
.expect("ok-result");
assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired));
assert_eq!(fetch_n, 3, "max_attempts=3 → 3 fetches total");
assert_eq!(recircuits, 2, "2 recircuits between 3 fetches");
}
#[tokio::test]
async fn recircuit_loop_transient_then_ok_within_budget() {
let mut recircuits = 0u32;
let mut fetch_n = 0u32;
let outcome = fetch_chapter_html_with_recircuit(
|| {
fetch_n += 1;
let n = fetch_n;
async move {
if n < 3 {
Ok(TRANSIENT_HTML.to_string())
} else {
Ok(OK_HTML.to_string())
}
}
},
|| {
recircuits += 1;
async {}
},
3,
"https://example/c",
)
.await
.expect("ok");
assert!(matches!(outcome, ChapterFetchOutcome::Ok(_)));
assert_eq!(fetch_n, 3);
assert_eq!(recircuits, 2);
}
#[tokio::test]
async fn recircuit_loop_transient_exhausts_budget_returns_persistent() {
let mut recircuits = 0u32;
let mut fetch_n = 0u32;
let outcome = fetch_chapter_html_with_recircuit(
|| {
fetch_n += 1;
async { Ok(TRANSIENT_HTML.to_string()) }
},
|| {
recircuits += 1;
async {}
},
3,
"https://example/c",
)
.await
.expect("ok-result");
assert!(matches!(outcome, ChapterFetchOutcome::PersistentTransient));
assert_eq!(fetch_n, 3, "max_attempts=3 → 3 fetches total");
assert_eq!(recircuits, 2, "2 recircuits between 3 fetches");
}
#[tokio::test]
async fn recircuit_loop_mixed_transient_then_unauth_then_ok_shares_budget() {
// Audit-prompted regression: outcomes share the attempt counter.
// Sequence: Transient (attempt 1) → Unauth (attempt 2) → Ok (3).
let mut recircuits = 0u32;
let mut fetch_n = 0u32;
let outcome = fetch_chapter_html_with_recircuit(
|| {
fetch_n += 1;
let n = fetch_n;
async move {
match n {
1 => Ok(TRANSIENT_HTML.to_string()),
2 => Ok(UNAUTH_HTML.to_string()),
_ => Ok(OK_HTML.to_string()),
}
}
},
|| {
recircuits += 1;
async {}
},
3,
"https://example/c",
)
.await
.expect("ok");
assert!(matches!(outcome, ChapterFetchOutcome::Ok(_)));
assert_eq!(fetch_n, 3);
assert_eq!(recircuits, 2);
}
#[tokio::test]
async fn recircuit_loop_propagates_fetch_errors() {
let mut fetch_n = 0u32;
let err = fetch_chapter_html_with_recircuit(
|| {
fetch_n += 1;
async { Err(anyhow::anyhow!("nav timeout")) }
},
|| async {},
3,
"https://example/c",
)
.await
.expect_err("fetch error bubbles");
assert_eq!(fetch_n, 1);
assert!(format!("{err:#}").contains("nav timeout"));
}
}

View File

@@ -48,7 +48,6 @@ use tokio_util::sync::CancellationToken;
use crate::crawler::content::SyncOutcome;
use crate::crawler::jobs::{self, JobPayload, Lease, KIND_SYNC_CHAPTER_CONTENT};
use crate::crawler::pipeline;
use crate::crawler::status::{Phase, StatusHandle};
/// Fixed `pg_try_advisory_lock` key. ASCII "MANGALRD" interpreted as a
/// big-endian i64. Hardcoded so every replica agrees on the lock identity
@@ -57,15 +56,6 @@ pub const CRON_LOCK_KEY: i64 = 0x4D414E47414C5244;
const STATE_KEY_LAST_TICK: &str = "last_metadata_tick_at";
/// Lease window handed to `jobs::lease`. Kept short, but continuously
/// extended by the per-job heartbeat (see [`WorkerContext::process_lease`])
/// so a long-but-healthy job never lapses and gets stolen.
const LEASE_DURATION: Duration = Duration::from_secs(60);
/// How often the heartbeat renews the lease while a job runs. A third of
/// the lease window leaves two missed beats of slack before expiry.
const LEASE_HEARTBEAT: Duration = Duration::from_secs(20);
#[async_trait]
pub trait MetadataPass: Send + Sync {
async fn run(&self) -> anyhow::Result<pipeline::MetadataStats>;
@@ -87,13 +77,6 @@ pub struct DaemonConfig {
pub tz: Tz,
pub retention_days: u32,
pub session_expired: Arc<AtomicBool>,
/// Live status surface updated by the cron + workers.
pub status: StatusHandle,
/// Hard upper bound on a single job's dispatch. A job that exceeds it
/// is acked failed (exponential backoff) rather than wedging a worker
/// forever. Must exceed [`LEASE_HEARTBEAT`] and the realistic
/// single-job runtime.
pub job_timeout: Duration,
/// Tasks that should run alongside the cron + workers and be cancelled
/// on shutdown. Used to hand the daemon ownership of the browser
/// manager's idle reaper.
@@ -140,8 +123,6 @@ pub fn spawn(pool: PgPool, cancel: CancellationToken, cfg: DaemonConfig) -> Daem
tz,
retention_days,
session_expired,
status,
job_timeout,
extra_tasks,
} = cfg;
@@ -153,7 +134,6 @@ pub fn spawn(pool: PgPool, cancel: CancellationToken, cfg: DaemonConfig) -> Daem
tz,
retention_days,
metadata,
status: status.clone(),
};
join.spawn(async move { ctx.run().await });
} else {
@@ -166,8 +146,6 @@ pub fn spawn(pool: PgPool, cancel: CancellationToken, cfg: DaemonConfig) -> Daem
cancel: cancel.clone(),
dispatcher: Arc::clone(&dispatcher),
session_expired: Arc::clone(&session_expired),
status: status.clone(),
job_timeout,
id: worker_id,
};
join.spawn(async move { ctx.run().await });
@@ -191,7 +169,6 @@ struct CronContext {
tz: Tz,
retention_days: u32,
metadata: Arc<dyn MetadataPass>,
status: StatusHandle,
}
impl CronContext {
@@ -219,11 +196,6 @@ impl CronContext {
// (NTP step, suspend/resume) don't strand us on a stale instant.
let next = next_fire(Utc::now(), self.daily_at, self.tz);
let wait = (next - Utc::now()).to_std().unwrap_or(Duration::ZERO);
self.status
.set_phase(Phase::Idle {
next_fire: Some(next),
})
.await;
tracing::info!(
next_fire_utc = %next.to_rfc3339(),
wait_seconds = wait.as_secs(),
@@ -261,41 +233,23 @@ impl CronContext {
return;
}
// Panic-isolate the tick body the same way `process_lease` does
// for worker dispatch. Without this, a panic in metadata.run
// (or any of the follow-on steps) would kill the cron task and
// no future tick would ever run — workers would keep going but
// no new metadata work would be scheduled until daemon restart.
// The advisory unlock below runs unconditionally so a panicked
// tick doesn't leave the lock held for another replica.
let metadata = &self.metadata;
let pool = &self.pool;
let retention_days = self.retention_days;
let status = &self.status;
let body = async move {
match metadata.run().await {
Ok(stats) => {
status.record_pass(&stats, Utc::now()).await;
tracing::info!(?stats, "cron: metadata pass done");
}
Err(e) => tracing::error!(?e, "cron: metadata pass failed"),
}
match pipeline::enqueue_bookmarked_pending(pool).await {
Ok(summary) => {
tracing::info!(?summary, "cron: enqueued bookmarked-pending");
}
Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"),
}
match jobs::reap_done(pool, retention_days).await {
Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"),
Err(e) => tracing::error!(?e, "cron: done-job reaper failed"),
}
if let Err(e) = write_last_tick(pool, Utc::now()).await {
tracing::warn!(?e, "cron: persist last_metadata_tick_at failed");
}
};
if let Err(_panic) = AssertUnwindSafe(body).catch_unwind().await {
tracing::error!("cron: tick body panicked — continuing");
match self.metadata.run().await {
Ok(stats) => tracing::info!(?stats, "cron: metadata pass done"),
Err(e) => tracing::error!(?e, "cron: metadata pass failed"),
}
match pipeline::enqueue_bookmarked_pending(&self.pool).await {
Ok(summary) => tracing::info!(?summary, "cron: enqueued bookmarked-pending"),
Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"),
}
match jobs::reap_done(&self.pool, self.retention_days).await {
Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"),
Err(e) => tracing::error!(?e, "cron: done-job reaper failed"),
}
if let Err(e) = write_last_tick(&self.pool, Utc::now()).await {
tracing::warn!(?e, "cron: persist last_metadata_tick_at failed");
}
let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
@@ -315,8 +269,6 @@ struct WorkerContext {
cancel: CancellationToken,
dispatcher: Arc<dyn ChapterDispatcher>,
session_expired: Arc<AtomicBool>,
status: StatusHandle,
job_timeout: Duration,
id: usize,
}
@@ -337,7 +289,7 @@ impl WorkerContext {
&self.pool,
Some(KIND_SYNC_CHAPTER_CONTENT),
1,
LEASE_DURATION,
Duration::from_secs(60),
)
.await
{
@@ -365,69 +317,23 @@ impl WorkerContext {
// (because a force-refetch race or a job that was re-enqueued
// after a previous one finished), ack done without re-fetching.
if let JobPayload::SyncChapterContent { chapter_id, .. } = &lease.payload {
let page_count = crate::repo::chapter::page_count(&self.pool, *chapter_id)
.await
.ok()
.flatten();
let page_count: Option<i32> = sqlx::query_scalar(
"SELECT page_count FROM chapters WHERE id = $1",
)
.bind(chapter_id)
.fetch_optional(&self.pool)
.await
.ok()
.flatten();
if matches!(page_count, Some(n) if n > 0) {
let _ = jobs::ack_done(&self.pool, lease.id).await;
return;
}
}
// Heartbeat: keep the lease fresh while the (potentially long)
// dispatch runs, so a slow-but-healthy job is never re-leased and
// never inflates `attempts` toward `max_attempts`. Stops itself
// once the job is no longer ours (renew returns false).
let heartbeat = {
let hb_pool = self.pool.clone();
let hb_id = lease.id;
tokio::spawn(async move {
loop {
tokio::time::sleep(LEASE_HEARTBEAT).await;
match jobs::renew(&hb_pool, hb_id, LEASE_DURATION).await {
Ok(true) => {}
Ok(false) => break,
Err(e) => {
tracing::warn!(lease_id = %hb_id, ?e, "heartbeat renew failed");
}
}
}
})
};
// The "currently crawling" chapter (with its live page count) is
// registered by the dispatcher itself (RealChapterDispatcher) so it
// carries the manga/chapter identity + page progress and is removed
// via an RAII guard on every exit path.
// Outer timeout: a dispatch that exceeds `job_timeout` is acked
// failed (exponential backoff) rather than wedging the worker.
let dispatch = AssertUnwindSafe(self.dispatcher.dispatch(lease.payload.clone()))
.catch_unwind();
let outcome = tokio::time::timeout(self.job_timeout, dispatch).await;
heartbeat.abort();
let outcome = match outcome {
Ok(o) => o,
Err(_elapsed) => {
tracing::warn!(
worker = self.id,
lease_id = %lease.id,
timeout_secs = self.job_timeout.as_secs(),
"worker: dispatch timed out — ack failed"
);
let _ = jobs::ack_failed(
&self.pool,
lease.id,
"dispatch timed out",
lease.attempts,
lease.max_attempts,
)
.await;
return;
}
};
let outcome = AssertUnwindSafe(self.dispatcher.dispatch(lease.payload.clone()))
.catch_unwind()
.await;
match outcome {
Ok(Ok(SyncOutcome::Fetched { .. } | SyncOutcome::Skipped)) => {
let _ = jobs::ack_done(&self.pool, lease.id).await;
@@ -439,8 +345,6 @@ impl WorkerContext {
"session expired — workers will idle until restart"
);
self.session_expired.store(true, Ordering::Release);
// Push the session-expired flip to live status subscribers.
self.status.poke();
let _ = jobs::release(&self.pool, lease.id).await;
}
Ok(Err(e)) => {
@@ -726,19 +630,4 @@ mod tests {
let prev = previous_fire(now, at, Tz::UTC);
assert_eq!(prev, dt_utc(2026, 5, 24, 23, 30));
}
/// Documents the panic-isolation pattern `run_tick` now relies on:
/// `AssertUnwindSafe(...).catch_unwind().await` must yield `Err(_)`
/// when the wrapped future panics, so the surrounding loop (or in
/// our case, the unconditional advisory-unlock that follows) keeps
/// running. The shape of this test mirrors the production callsite.
#[tokio::test]
async fn assert_unwind_safe_catches_a_panicking_future() {
let result = AssertUnwindSafe(async {
panic!("boom");
})
.catch_unwind()
.await;
assert!(result.is_err(), "panicking future must yield Err");
}
}

View File

@@ -80,36 +80,13 @@ pub fn has_logo_sentinel(doc: &scraper::Html) -> bool {
/// caller can fall back on the job system's retry/backoff once the
/// inline budget is exhausted.
pub async fn retry_on_transient<F, Fut, T>(
    op: F,
    max_attempts: u32,
    delay: Duration,
) -> Result<T, PageError>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, PageError>>,
{
    // Plain retry is the hook-aware variant driven with a hook that does
    // nothing, so both entry points share one retry loop.
    let no_op_hook = || async {};
    retry_on_transient_with_hook(op, max_attempts, delay, no_op_hook).await
}
/// Like [`retry_on_transient`] but invokes `on_retry` between a
/// transient failure and the subsequent sleep+retry. The hook does
/// **not** fire on the first attempt, after a non-transient error, or
/// after the final attempt (no retry follows). Hook failures are not
/// propagated — return `()` from the future and log inside if needed.
///
/// Wire the TOR controller's `new_identity` here to rotate circuits
/// between page-fetch retries; see [`crate::crawler::tor`].
pub async fn retry_on_transient_with_hook<F, Fut, T, H, HFut>(
mut op: F,
max_attempts: u32,
delay: Duration,
mut on_retry: H,
) -> Result<T, PageError>
where
F: FnMut() -> Fut,
Fut: Future<Output = Result<T, PageError>>,
H: FnMut() -> HFut,
HFut: Future<Output = ()>,
{
debug_assert!(max_attempts >= 1, "max_attempts must be at least 1");
let mut attempt = 0u32;
@@ -124,9 +101,8 @@ where
attempt,
max_attempts,
error = %e,
"transient error; running on-retry hook and sleeping before retry"
"transient error; sleeping before retry"
);
on_retry().await;
tokio::time::sleep(delay).await;
}
}
@@ -271,92 +247,4 @@ mod tests {
assert_eq!(result.unwrap(), 7);
assert_eq!(attempt, 1);
}
#[tokio::test]
async fn hook_fires_once_between_transient_and_success() {
let mut attempt = 0u32;
let mut hook_calls = 0u32;
let result: Result<i32, PageError> = retry_on_transient_with_hook(
|| {
attempt += 1;
let n = attempt;
async move {
if n < 2 {
Err(PageError::transient("once"))
} else {
Ok(99)
}
}
},
5,
Duration::from_millis(0),
|| {
hook_calls += 1;
async {}
},
)
.await;
assert_eq!(result.unwrap(), 99);
assert_eq!(attempt, 2);
assert_eq!(hook_calls, 1, "hook fires exactly once between attempts");
}
#[tokio::test]
async fn hook_does_not_fire_when_first_attempt_succeeds() {
let mut hook_calls = 0u32;
let result: Result<i32, PageError> = retry_on_transient_with_hook(
|| async { Ok(1) },
5,
Duration::from_millis(0),
|| {
hook_calls += 1;
async {}
},
)
.await;
assert!(result.is_ok());
assert_eq!(hook_calls, 0);
}
#[tokio::test]
async fn hook_does_not_fire_after_non_transient_error() {
let mut hook_calls = 0u32;
let result: Result<i32, PageError> = retry_on_transient_with_hook(
|| async { Err(PageError::Other(anyhow::anyhow!("permanent"))) },
5,
Duration::from_millis(0),
|| {
hook_calls += 1;
async {}
},
)
.await;
assert!(result.is_err());
assert_eq!(hook_calls, 0, "non-transient must short-circuit before hook");
}
#[tokio::test]
async fn hook_does_not_fire_after_final_failed_attempt() {
// With max_attempts=3 and three persistent transients, the hook
// should run twice (between 1→2 and 2→3) — never a third time,
// because no retry follows attempt 3.
let mut attempt = 0u32;
let mut hook_calls = 0u32;
let result: Result<i32, PageError> = retry_on_transient_with_hook(
|| {
attempt += 1;
async { Err(PageError::transient("always")) }
},
3,
Duration::from_millis(0),
|| {
hook_calls += 1;
async {}
},
)
.await;
assert!(result.is_err());
assert_eq!(attempt, 3);
assert_eq!(hook_calls, 2, "hook fires N-1 times for N attempts that all fail transient");
}
}

View File

@@ -1,4 +1,4 @@
//! Persistent job queue and its job kinds.
//! Persistent job queue and the four job kinds.
//!
//! Backed by Postgres (the `crawler_jobs` table). Workers lease rows
//! with `SELECT ... FOR UPDATE SKIP LOCKED`, heartbeat via
@@ -12,9 +12,16 @@ use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use uuid::Uuid;
use super::source::DiscoverMode;
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum JobPayload {
/// Walk the source index and enqueue `SyncManga` jobs.
Discover {
source_id: String,
mode: DiscoverMode,
},
/// Fetch one manga's detail page, upsert metadata, enqueue
/// `SyncChapterList`.
SyncManga {
@@ -66,33 +73,16 @@ pub struct Lease {
pub max_attempts: i32,
}
/// Deterministic exponential backoff base for `ack_failed` retries.
/// `attempts` is the post-increment value reported by `lease()` (so the
/// first failure has `attempts == 1` and waits 60s, the second 120s,
/// etc.). Capped at 1h to avoid runaway long sleeps that would outlive
/// the daemon process. Jitter is applied separately by [`apply_jitter`].
fn backoff_base(attempts: i32) -> Duration {
/// Exponential backoff for `ack_failed` retries. `attempts` is the
/// post-increment value reported by `lease()` (so the first failure has
/// `attempts == 1` and waits 60s, the second 120s, etc.). Capped at 1h to
/// avoid runaway long sleeps that would outlive the daemon process.
fn backoff_for(attempts: i32) -> Duration {
let shift = attempts.saturating_sub(1).clamp(0, 20) as u32;
let secs = 60u64.saturating_mul(1u64 << shift);
Duration::from_secs(secs.min(3600))
}
/// Apply ±20% jitter to a backoff duration.
///
/// `jitter` is a fraction, normally drawn from `[0.0, 1.0)` (e.g.
/// `rand::random::<f64>()`). It is clamped to `[0.0, 1.0]` and mapped
/// linearly onto a multiplier in `[0.8, 1.2]`; the scaled duration is
/// rounded to whole seconds. Pure, so the bounds stay unit-testable.
/// Spreading retries avoids a thundering herd when a source outage fails
/// many jobs at once.
///
/// A NaN `jitter` is treated as the midpoint (multiplier 1.0, i.e. no
/// jitter). Without that guard NaN would pass through `clamp` unchanged
/// and the final float-to-integer cast would turn it into a zero-second
/// backoff.
fn apply_jitter(base: Duration, jitter: f64) -> Duration {
    let frac = if jitter.is_nan() {
        // Neutral fallback: keep the un-jittered base instead of 0s.
        0.5
    } else {
        jitter.clamp(0.0, 1.0)
    };
    let mult = 0.8 + 0.4 * frac; // [0.8, 1.2]
    Duration::from_secs((base.as_secs_f64() * mult).round() as u64)
}
/// Backoff used by `ack_failed`: the deterministic [`backoff_base`] for
/// this attempt count, spread by a freshly drawn random fraction through
/// [`apply_jitter`].
fn backoff_for(attempts: i32) -> Duration {
    let base = backoff_base(attempts);
    let draw: f64 = rand::random();
    apply_jitter(base, draw)
}
/// Insert a new pending job. For `SyncChapterContent` payloads the
/// partial unique index `crawler_jobs_chapter_content_dedup_idx` blocks
/// a second `(pending|running)` insert per chapter_id, returning
@@ -121,12 +111,6 @@ pub async fn enqueue(pool: &PgPool, payload: &JobPayload) -> sqlx::Result<Enqueu
///
/// `kind_filter` matches against `payload->>'kind'`; `None` means
/// any kind.
///
/// Ties on `scheduled_at` (the common case: a cron batch enqueues
/// everything with the same default `now()`) break by `created_at`, so
/// jobs come off the queue in insertion order. The enqueue paths insert
/// chapter-content jobs in ascending `chapters.number` order, so this
/// tiebreaker is what propagates that intent through to dequeue.
pub async fn lease(
pool: &PgPool,
kind_filter: Option<&str>,
@@ -141,7 +125,7 @@ pub async fn lease(
WHERE (state = 'pending' OR (state = 'running' AND leased_until < now()))
AND scheduled_at <= now()
AND ($1::text IS NULL OR payload->>'kind' = $1)
ORDER BY scheduled_at, created_at
ORDER BY scheduled_at
LIMIT $2
FOR UPDATE SKIP LOCKED
)
@@ -176,65 +160,23 @@ pub async fn lease(
Ok(leases)
}
/// Heartbeat primitive: push `leased_until` forward on a job we still own.
///
/// Sets `leased_until` to `now() + lease_duration` (and bumps
/// `updated_at`) for the row with id `lease_id`, but only while that row
/// is still in state `'running'`.
///
/// Returns `Ok(true)` when a row was updated, meaning the lease is still
/// held. Returns `Ok(false)` when no row matched — the job is no longer
/// `running` (re-leased after a missed heartbeat, or already acked) — in
/// which case the caller's heartbeat loop should stop. Database errors are
/// propagated.
///
/// Renewing periodically while a long-but-healthy job runs keeps
/// `leased_until` from lapsing, which would otherwise let another worker
/// pick up the in-flight job and spuriously inflate `attempts` toward
/// `max_attempts`.
pub async fn renew(
    pool: &PgPool,
    lease_id: Uuid,
    lease_duration: Duration,
) -> sqlx::Result<bool> {
    // Saturate at i64::MAX instead of wrapping if the duration in
    // milliseconds does not fit the bigint bind parameter.
    let lease_ms = i64::try_from(lease_duration.as_millis()).unwrap_or(i64::MAX);
    let outcome = sqlx::query(
        "UPDATE crawler_jobs \
         SET leased_until = now() + ($2::bigint || ' milliseconds')::interval, \
         updated_at = now() \
         WHERE id = $1 AND state = 'running'",
    )
    .bind(lease_id)
    .bind(lease_ms)
    .execute(pool)
    .await?;
    let still_ours = outcome.rows_affected() > 0;
    Ok(still_ours)
}
/// Mark a leased job as successfully completed. The `state = 'running'`
/// predicate guards against a late ack from a worker whose lease expired
/// and was already re-leased by another worker: without it, the late ack
/// would clobber the new lease's `state` and `leased_until`. `rows_affected
/// == 0` means we lost the lease — surfaced as a warn rather than an
/// error because the new lease holder is doing real work; the late ack
/// just has to step aside.
/// Mark a leased job as successfully completed.
pub async fn ack_done(pool: &PgPool, lease_id: Uuid) -> sqlx::Result<()> {
let res = sqlx::query(
sqlx::query(
"UPDATE crawler_jobs \
SET state = 'done', leased_until = NULL, updated_at = now() \
WHERE id = $1 AND state = 'running'",
WHERE id = $1",
)
.bind(lease_id)
.execute(pool)
.await?;
if res.rows_affected() == 0 {
tracing::warn!(
%lease_id,
"ack_done: lease no longer running — likely re-leased by another worker; skipping update"
);
}
Ok(())
}
/// Mark a leased job as failed. If the current attempt count has reached
/// `max_attempts` the job is terminally dead and stops retrying;
/// otherwise it goes back to `pending` with `scheduled_at` pushed into
/// the future by the exponential backoff. See [`ack_done`] for the
/// `state = 'running'` guard rationale.
/// the future by the exponential backoff.
pub async fn ack_failed(
pool: &PgPool,
lease_id: Uuid,
@@ -242,16 +184,16 @@ pub async fn ack_failed(
attempts: i32,
max_attempts: i32,
) -> sqlx::Result<()> {
let res = if attempts >= max_attempts {
if attempts >= max_attempts {
sqlx::query(
"UPDATE crawler_jobs \
SET state = 'dead', last_error = $2, leased_until = NULL, updated_at = now() \
WHERE id = $1 AND state = 'running'",
WHERE id = $1",
)
.bind(lease_id)
.bind(error)
.execute(pool)
.await?
.await?;
} else {
let backoff_ms: i64 = backoff_for(attempts).as_millis().min(i64::MAX as u128) as i64;
sqlx::query(
@@ -259,45 +201,30 @@ pub async fn ack_failed(
SET state = 'pending', last_error = $2, leased_until = NULL, \
scheduled_at = now() + ($3::bigint || ' milliseconds')::interval, \
updated_at = now() \
WHERE id = $1 AND state = 'running'",
WHERE id = $1",
)
.bind(lease_id)
.bind(error)
.bind(backoff_ms)
.execute(pool)
.await?
};
if res.rows_affected() == 0 {
tracing::warn!(
%lease_id,
"ack_failed: lease no longer running — likely re-leased by another worker; skipping update"
);
.await?;
}
Ok(())
}
/// Return a leased job to `pending` without burning a retry attempt.
/// Used on graceful shutdown and on session-expired aborts where the
/// failure isn't the job's fault. See [`ack_done`] for the
/// `state = 'running'` guard rationale — important here because
/// `attempts - 1` would otherwise spuriously decrement the new lease's
/// attempt count.
/// failure isn't the job's fault.
pub async fn release(pool: &PgPool, lease_id: Uuid) -> sqlx::Result<()> {
let res = sqlx::query(
sqlx::query(
"UPDATE crawler_jobs \
SET state = 'pending', leased_until = NULL, \
attempts = GREATEST(0, attempts - 1), updated_at = now() \
WHERE id = $1 AND state = 'running'",
WHERE id = $1",
)
.bind(lease_id)
.execute(pool)
.await?;
if res.rows_affected() == 0 {
tracing::warn!(
%lease_id,
"release: lease no longer running — likely re-leased by another worker; skipping update"
);
}
Ok(())
}
@@ -324,48 +251,19 @@ mod tests {
use super::*;
#[test]
fn backoff_base_grows_exponentially_and_caps_at_one_hour() {
fn backoff_grows_exponentially_and_caps_at_one_hour() {
// attempts == 1 → 60s, doubling each step.
assert_eq!(backoff_base(1), Duration::from_secs(60));
assert_eq!(backoff_base(2), Duration::from_secs(120));
assert_eq!(backoff_base(3), Duration::from_secs(240));
assert_eq!(backoff_base(4), Duration::from_secs(480));
assert_eq!(backoff_base(5), Duration::from_secs(960));
assert_eq!(backoff_base(6), Duration::from_secs(1920));
assert_eq!(backoff_for(1), Duration::from_secs(60));
assert_eq!(backoff_for(2), Duration::from_secs(120));
assert_eq!(backoff_for(3), Duration::from_secs(240));
assert_eq!(backoff_for(4), Duration::from_secs(480));
assert_eq!(backoff_for(5), Duration::from_secs(960));
assert_eq!(backoff_for(6), Duration::from_secs(1920));
// 7th: 60 * 64 = 3840 → capped to 3600.
assert_eq!(backoff_base(7), Duration::from_secs(3600));
assert_eq!(backoff_base(20), Duration::from_secs(3600));
assert_eq!(backoff_for(7), Duration::from_secs(3600));
assert_eq!(backoff_for(20), Duration::from_secs(3600));
// Garbage / zero / negatives stay sane.
assert_eq!(backoff_base(0), Duration::from_secs(60));
assert_eq!(backoff_base(-5), Duration::from_secs(60));
}
#[test]
fn apply_jitter_stays_within_plus_minus_twenty_percent() {
let base = Duration::from_secs(100);
// Lower bound (jitter = 0.0) → 0.8x.
assert_eq!(apply_jitter(base, 0.0), Duration::from_secs(80));
// Midpoint (jitter = 0.5) → 1.0x.
assert_eq!(apply_jitter(base, 0.5), Duration::from_secs(100));
// Upper end (jitter → 1.0) → ~1.2x.
assert_eq!(apply_jitter(base, 1.0), Duration::from_secs(120));
// Out-of-range inputs are clamped, never panic.
assert_eq!(apply_jitter(base, -3.0), Duration::from_secs(80));
assert_eq!(apply_jitter(base, 9.0), Duration::from_secs(120));
}
#[test]
fn backoff_for_random_jitter_stays_in_band() {
// The production wrapper draws its own randomness; assert the
// result for a mid-range attempt always lands within the jitter
// band of the base, across many draws.
let base = backoff_base(3).as_secs_f64(); // 240s
for _ in 0..1000 {
let v = backoff_for(3).as_secs_f64();
assert!(
v >= base * 0.8 - 1.0 && v <= base * 1.2 + 1.0,
"jittered backoff {v} outside band of base {base}"
);
}
assert_eq!(backoff_for(0), Duration::from_secs(60));
assert_eq!(backoff_for(-5), Duration::from_secs(60));
}
}

View File

@@ -20,14 +20,7 @@ pub mod daemon;
pub mod detect;
pub mod diff;
pub mod jobs;
pub mod nav;
pub mod pipeline;
pub mod rate_limit;
pub mod resync;
pub mod safety;
pub mod session;
pub mod session_control;
pub mod source;
pub mod status;
pub mod tor;
pub mod url_utils;

View File

@@ -1,241 +0,0 @@
//! Page navigation helpers — wrap `chromiumoxide` `wait_for_navigation`
//! with a timeout so a hung TLS handshake or a page that never fires
//! `load` cannot wedge a worker (or the cron metadata pass) forever.
//!
//! [`NAV_TIMEOUT`] is the global budget. Callers in the crawler use
//! [`wait_for_nav`] to get back a typed error so transient timeouts can
//! be reported separately from underlying CDP errors.
use std::time::Duration;
use chromiumoxide::error::CdpError;
use chromiumoxide::Page;
use thiserror::Error;
/// Maximum wall-clock time we'll wait for a single page navigation. A
/// healthy Chromium reaches `load` in well under a second on the target
/// site; a 30-second cap is generous enough for slow TLS handshakes on
/// the first request after a fresh process while still catching real
/// hangs before they wedge the daemon.
pub const NAV_TIMEOUT: Duration = Duration::from_secs(30);
/// Failure modes of a bounded wait on a page (navigation or selector).
///
/// `Timeout` is the transient signal callers translate into a
/// retry-friendly error
/// ([`crate::crawler::detect::PageError::Transient`] in the source path,
/// a context'd anyhow elsewhere). `Cdp` carries the underlying
/// chromiumoxide error unchanged.
#[derive(Debug, Error)]
pub enum NavError {
/// The wait did not finish within the budget; the payload is the budget
/// that was exceeded and is rendered with `{:?}` in the message.
#[error("navigation timed out after {0:?}")]
Timeout(Duration),
/// A chromiumoxide error, displayed transparently. `#[from]` lets `?`
/// convert a `CdpError` into this variant.
#[error(transparent)]
Cdp(#[from] CdpError),
}
/// Block until the page's pending navigation finishes, giving up after
/// [`NAV_TIMEOUT`]. Use this instead of a bare
/// `page.wait_for_navigation().await` anywhere in the crawler.
///
/// Returns [`NavError::Timeout`] when the budget elapses and
/// [`NavError::Cdp`] when chromiumoxide itself reports a failure.
pub async fn wait_for_nav(page: &Page) -> Result<(), NavError> {
    let waited = tokio::time::timeout(NAV_TIMEOUT, page.wait_for_navigation()).await;
    // Outer layer: did the budget run out before the future resolved?
    let finished = waited.map_err(|_elapsed| NavError::Timeout(NAV_TIMEOUT))?;
    // Inner layer: the navigation resolved, but possibly with a CDP error.
    finished.map(|_| ()).map_err(NavError::Cdp)
}
/// Poll interval for [`wait_for_selector`]. 100ms is fast enough that a
/// page rendering in 200ms isn't held back noticeably, and slow enough
/// not to spam CDP with `find_element` calls on a page that's actually
/// taking its time.
const SELECTOR_POLL_INTERVAL: Duration = Duration::from_millis(100);
/// Wait until `selector` matches at least one element on `page`, or
/// `timeout` elapses. Used after a navigation to confirm a page-type-
/// specific marker is in the DOM before parsing — replaces the fixed
/// post-nav sleep that previously masked partial-render races.
///
/// chromiumoxide 0.7.0 has no built-in `wait_for_selector`, so we poll
/// `find_element` at [`SELECTOR_POLL_INTERVAL`] until success or budget
/// exhaustion. A failed `find_element` is *not* an error here — it just
/// means "not yet" — we only surface an error once the overall
/// `timeout` is up.
///
/// The deadline also bounds each individual `find_element` probe: a CDP
/// call that never answers would otherwise hold this function past
/// `timeout`, which is exactly the kind of wedge this module exists to
/// prevent.
///
/// Returns `Ok(())` as soon as a probe succeeds, and
/// `Err(NavError::Timeout(timeout))` once the deadline has passed.
pub async fn wait_for_selector(
    page: &Page,
    selector: &str,
    timeout: Duration,
) -> Result<(), NavError> {
    let deadline = tokio::time::Instant::now() + timeout;
    loop {
        match tokio::time::timeout_at(deadline, page.find_element(selector)).await {
            // The marker is in the DOM.
            Ok(Ok(_)) => return Ok(()),
            // Probe answered "no match" (or failed): treat as "not yet".
            Ok(Err(_)) => {}
            // The probe itself outlived the overall budget.
            Err(_elapsed) => return Err(NavError::Timeout(timeout)),
        }
        let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
        if remaining.is_zero() {
            return Err(NavError::Timeout(timeout));
        }
        // Never sleep past the deadline; the last nap is clamped.
        tokio::time::sleep(SELECTOR_POLL_INTERVAL.min(remaining)).await;
    }
}
/// Per-page-type budget for [`wait_for_selector`]. Shorter than
/// [`NAV_TIMEOUT`] because by the time we're waiting on a selector, the
/// page has already responded — we're only absorbing post-load JS
/// finishing its row injection, which on a healthy site takes well
/// under a second.
pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10);
impl NavError {
    /// Reports whether this error suggests the Chromium process is gone
    /// or its CDP connection has dropped. The dispatcher consults this to
    /// decide whether to invalidate the
    /// [`crate::crawler::browser_manager::BrowserManager`] handle so the
    /// next acquire re-launches.
    ///
    /// Every current variant answers `true`: a `Timeout` beyond
    /// [`NAV_TIMEOUT`] is in practice a hung CDP transport or a wedged
    /// page the browser cannot recover from by itself, and a `Cdp` error
    /// at the navigation layer means the chromium-facing channel is the
    /// part that failed.
    pub fn is_likely_browser_dead(&self) -> bool {
        // Kept as an exhaustive match (no wildcard) so that adding a
        // variant forces an explicit decision here.
        match self {
            Self::Timeout(_) | Self::Cdp(_) => true,
        }
    }
}
/// Walk an `anyhow::Error` chain looking for typed evidence that the
/// chromium-facing layer is the failing one. Two markers count:
///
/// 1. A wrapped [`NavError`] flagged by [`NavError::is_likely_browser_dead`].
/// 2. A wrapped [`CdpError`] (via `anyhow::Error::from(CdpError)` at a
/// `Browser::new_page` call site, or any other direct CDP boundary).
///
/// Earlier versions also substring-matched the chain for "connection",
/// "closed", "channel", etc. as a fallback. That was too broad —
/// reqwest TCP-reset errors during CDN image downloads, sqlx
/// connection-pool errors, and similar non-browser failures contain
/// those words and triggered spurious chromium relaunches. The typed
/// downcasts cover every place we hand a chromium error to anyhow,
/// so the fallback is unnecessary.
pub fn anyhow_looks_browser_dead(err: &anyhow::Error) -> bool {
for cause in err.chain() {
if let Some(nav) = cause.downcast_ref::<NavError>() {
if nav.is_likely_browser_dead() {
return true;
}
}
if cause.downcast_ref::<CdpError>().is_some() {
return true;
}
}
false
}
// Unit tests for the navigation helpers. None of them construct a real
// `Page`: the async tests exercise the tokio timing primitives the helpers
// are built on, and the sync tests cover `NavError` and
// `anyhow_looks_browser_dead` directly.
#[cfg(test)]
mod tests {
use super::*;
use std::future::pending;
/// Sanity-check the timeout pattern used by [`wait_for_nav`]: a
/// future that never resolves must yield `Elapsed` within the
/// configured budget. We can't easily stand up a real `Page` in a
/// unit test, so we assert the underlying primitive behaves the way
/// the helper depends on.
// `start_paused = true` makes tokio auto-advance its clock, so the 50ms
// budget elapses without real wall-clock waiting.
#[tokio::test(flavor = "current_thread", start_paused = true)]
async fn timeout_elapses_on_a_future_that_never_resolves() {
let result =
tokio::time::timeout(Duration::from_millis(50), pending::<()>()).await;
assert!(result.is_err(), "expected Elapsed on a hung future");
}
// Pins the user-visible message format, including the `{:?}` rendering of
// the duration payload.
#[test]
fn nav_error_timeout_message_includes_duration() {
let e = NavError::Timeout(Duration::from_secs(30));
assert_eq!(e.to_string(), "navigation timed out after 30s");
}
#[test]
fn timeout_is_treated_as_likely_browser_dead() {
let e = NavError::Timeout(NAV_TIMEOUT);
assert!(e.is_likely_browser_dead());
}
// A `NavError` wrapped in anyhow and then given extra context must still
// be found by walking the chain.
#[test]
fn anyhow_with_nav_timeout_in_chain_is_flagged() {
let inner: Result<(), NavError> = Err(NavError::Timeout(NAV_TIMEOUT));
let outer = inner.unwrap_err();
let wrapped: anyhow::Error =
anyhow::Error::new(outer).context("wait for chapter nav");
assert!(anyhow_looks_browser_dead(&wrapped));
}
#[test]
fn anyhow_with_cdp_error_in_chain_is_flagged() {
// `Browser::new_page` errors get wrapped via
// `anyhow::Error::from(CdpError)` at the navigate / dispatch
// call sites. Walking the chain and downcasting to CdpError is
// what catches that path. Any CdpError variant counts; the
// Serde variant is the easiest to construct in a unit test.
let serde_err: serde_json::Error =
serde_json::from_str::<i32>("not a number").unwrap_err();
let cdp = CdpError::Serde(serde_err);
let wrapped: anyhow::Error =
anyhow::Error::from(cdp).context("open chapter page");
assert!(anyhow_looks_browser_dead(&wrapped));
}
// Negative control: a plain message-only error carries no typed cause.
#[test]
fn anyhow_with_innocuous_parse_error_is_not_flagged() {
let e: anyhow::Error =
anyhow::anyhow!("parse manga detail: chapter row regex did not match");
assert!(!anyhow_looks_browser_dead(&e));
}
#[test]
fn anyhow_with_reqwest_style_connection_message_is_not_flagged() {
// Regression: the earlier substring fallback flagged any error
// whose message contained "connection" or "closed" as browser-
// dead. A TCP reset from a CDN during image download, or a
// sqlx pool-connection error, would burn a chromium relaunch
// even though the browser is fine. Typed downcasts only —
// these untyped strings must pass through.
for msg in [
"error sending request: connection reset by peer",
"PoolTimedOut: timed out waiting for a connection",
"request to https://cdn/x.jpg: connection closed before message completed",
"transport error during image fetch",
] {
let e: anyhow::Error = anyhow::anyhow!("{msg}");
assert!(
!anyhow_looks_browser_dead(&e),
"must not flag non-browser error: {msg}"
);
}
}
/// Same sanity check as [`timeout_elapses_on_a_future_that_never_resolves`],
/// but for the [`wait_for_selector`] polling pattern: the loop must
/// surrender on `Elapsed` rather than spinning past the deadline.
// NOTE: this re-implements the shape of the polling loop inline; it does
// not call `wait_for_selector` itself, so it guards the pattern rather
// than the function.
#[tokio::test(flavor = "current_thread", start_paused = true)]
async fn selector_polling_pattern_surrenders_at_deadline() {
let timeout = Duration::from_millis(300);
let start = tokio::time::Instant::now();
let deadline = start + timeout;
// Simulate find_element forever returning "not found".
let mut polls = 0u32;
let result: Result<(), NavError> = loop {
polls += 1;
if tokio::time::Instant::now() >= deadline {
break Err(NavError::Timeout(timeout));
}
tokio::time::sleep(SELECTOR_POLL_INTERVAL).await;
};
assert!(matches!(result, Err(NavError::Timeout(_))));
// 300ms / 100ms poll interval ≈ 3 iterations plus the final check
// that breaks out. Allow some slack since the first poll happens
// before any sleep.
assert!(polls >= 3, "expected at least 3 poll iterations, got {polls}");
}
}

View File

@@ -2,8 +2,6 @@
//! that fan out chapter-content work. Shared between the daemon (cron tick)
//! and the CLI (`bin/crawler.rs`) so behavior stays in lockstep.
use std::collections::HashSet;
use anyhow::Context;
use sqlx::PgPool;
use uuid::Uuid;
@@ -11,11 +9,9 @@ use uuid::Uuid;
use crate::crawler::browser_manager::BrowserManager;
use crate::crawler::jobs::{self, EnqueueResult, JobPayload};
use crate::crawler::rate_limit::HostRateLimiters;
use crate::crawler::safety::{fetch_bytes_capped, looks_like_image, DownloadAllowlist};
use crate::crawler::source::target::TargetSource;
use crate::crawler::source::{FetchContext, Source, SourceMangaRef};
use crate::crawler::source::{DiscoverMode, FetchContext, Source};
use crate::repo;
use crate::repo::crawler::UpsertStatus;
use crate::storage::Storage;
/// Coarse counters surfaced for logging at the end of a metadata pass.
@@ -27,53 +23,16 @@ pub struct MetadataStats {
pub mangas_failed: usize,
}
/// Stop rule for the per-ref loop, evaluated on the manga that was just
/// processed. All three conditions must hold for the walk to halt:
///
/// * `was_clean` — the previous run exited cleanly, so this is not a
///   recovery sweep and the index tail is known to be caught up;
/// * `status` is `UpsertStatus::Unchanged` — the metadata hash matched
///   what is already stored;
/// * `chapters_new` is exactly `Some(0)` — the chapter sync confirmed
///   there were no new chapters.
///
/// `None` for `chapters_new` (skip_chapters mode, or a chapter-sync error
/// that was logged and swallowed) never stops the walk, because a single
/// piece of evidence cannot prove the tail is unchanged.
///
/// Kept free of I/O so it can be unit-tested without the walker, DB, or
/// browser.
pub(crate) fn should_stop(
    was_clean: bool,
    status: UpsertStatus,
    chapters_new: Option<usize>,
) -> bool {
    // Recovery sweep: never stop early.
    if !was_clean {
        return false;
    }
    // New or Updated metadata means this entry was not caught up.
    if !matches!(status, UpsertStatus::Unchanged) {
        return false;
    }
    // Only a confirmed zero counts; `None` is "unknown", not "none".
    matches!(chapters_new, Some(0))
}
/// Decides if the walk that just ended counts as a clean exit. A `true`
/// result writes the recovery flag back to `completed: true`; a `false`
/// result leaves it `false`, so the next tick treats this run as crashed
/// and performs a recovery sweep.
///
/// The caller-imposed `CRAWLER_LIMIT` cap (`hit_limit`) is deliberately
/// *not* a parameter: a capped walk never reaches the catalog tail, so it
/// can never be a clean exit. Keeping it out of the signature (instead of
/// an inline `&& !hit_limit`) stops a later edit from sneaking it back
/// into the truth table.
pub(crate) fn should_mark_clean_exit(
    walked_to_completion: bool,
    hit_stop_condition: bool,
) -> bool {
    // Explicit truth table: only "neither happened" is unclean.
    match (walked_to_completion, hit_stop_condition) {
        (false, false) => false,
        _ => true,
    }
}
/// Circuit-breaker: abort the walk once `consecutive` `fetch_manga`
/// failures reach `threshold`. A `threshold` of 0 disables the breaker
/// (unbounded — the legacy behaviour). When it fires the caller must NOT
/// mark a clean exit, so the next tick does a recovery sweep over the
/// catalog tail the aborted pass never reached.
///
/// Pure so the rule is unit-testable without the walker.
pub(crate) fn should_abort_pass(consecutive: u32, threshold: u32) -> bool {
threshold > 0 && consecutive >= threshold
/// Decide whether the per-ref loop should stop based on the Incremental
/// streak counter. Pulled out as a pure function so the rule is unit-
/// testable without standing up the walker or DB.
pub(crate) fn should_stop(mode: DiscoverMode, consecutive_unchanged: usize) -> bool {
match mode {
DiscoverMode::Backfill => false,
DiscoverMode::Incremental { stop_after_unchanged } => {
consecutive_unchanged >= stop_after_unchanged
}
}
}
/// Runs the discover → fetch → upsert → cover → chapter-list-diff pipeline
@@ -83,25 +42,15 @@ pub(crate) fn should_abort_pass(consecutive: u32, threshold: u32) -> bool {
/// `limit == 0` means no cap (full sweep up to the source's own bound).
/// `skip_chapters == true` is the "metadata-only" mode (parser doesn't
/// extract chapters, and `sync_manga_chapters` is skipped — otherwise an
/// empty chapter list would soft-drop existing rows). In this mode the
/// stop condition never fires because chapter freshness can't be
/// confirmed, so the walk always runs to end-of-source.
/// empty chapter list would soft-drop existing rows).
///
/// The walk is always newest-first. Steady-state runs stop on the first
/// manga where metadata is `Unchanged` AND chapter sync reports zero
/// new chapters — the source orders by `update_date DESC`, so anything
/// with a fresh chapter or fresh metadata is bumped to the top and will
/// be processed before we hit a fully-caught-up manga.
///
/// A per-source recovery flag stored in `crawler_state`
/// (`last_run_completed:<source_id>`) gates the early stop: it's set to
/// `false` right after `ensure_source` and back to `true` only when the
/// run exits via end-of-walk OR the intentional stop. A crash, panic,
/// or SIGKILL leaves the flag at `false`, so the next tick reads it,
/// recognizes the previous run did not exit cleanly, and walks the
/// full catalog (ignoring the stop condition) to re-cover anything the
/// crashed run missed past its crash point. Once that recovery sweep
/// reaches end-of-walk, steady-state resumes.
/// `mode` controls the walk:
/// - `Backfill` — oldest-first, no early exit. The only mode that runs
/// the end-of-walk drop pass + writes `seed_completed_at`.
/// - `Incremental { stop_after_unchanged }` — newest-first, breaks out
/// after N consecutive Unchanged upserts. Drop pass is skipped (the
/// tail of the index is never visited, so its `last_seen_at` is
/// stale and using it to soft-drop would be unsafe).
#[allow(clippy::too_many_arguments)]
pub async fn run_metadata_pass(
browser_manager: &BrowserManager,
@@ -112,20 +61,13 @@ pub async fn run_metadata_pass(
start_url: &str,
limit: usize,
skip_chapters: bool,
allowlist: &DownloadAllowlist,
max_image_bytes: usize,
max_consecutive_failures: u32,
status: Option<&crate::crawler::status::StatusHandle>,
tor: Option<&crate::crawler::tor::TorController>,
mode: DiscoverMode,
) -> anyhow::Result<MetadataStats> {
let lease = browser_manager
.acquire()
.await
.context("acquire browser lease for metadata pass")?;
let browser_ref: &chromiumoxide::Browser = &lease;
if let Some(s) = status {
s.set_phase(crate::crawler::status::Phase::WalkingList).await;
}
let source = {
let s = TargetSource::new(start_url.to_string());
@@ -138,7 +80,6 @@ pub async fn run_metadata_pass(
let ctx = FetchContext {
browser: browser_ref,
rate,
tor,
};
let source_id = source.id();
@@ -151,41 +92,20 @@ pub async fn run_metadata_pass(
.await
.context("ensure_source")?;
// Read BEFORE flipping to "in-flight" — a `false` here means the
// previous run didn't reach a clean exit, and this run must walk
// the full catalog (recovery sweep) instead of bailing on the
// first caught-up manga.
let was_clean = repo::crawler::last_run_completed_cleanly(db, source_id)
.await
.context("read last_run_completed_cleanly")?;
repo::crawler::mark_run_started(db, source_id)
.await
.context("mark_run_started")?;
let run_started_at = chrono::Utc::now();
let max_refs = (limit > 0).then_some(limit);
tracing::info!(was_clean, ?max_refs, "starting metadata pass");
tracing::info!(?mode, ?max_refs, "starting metadata pass");
let mut walker = source
.discover(&ctx)
.discover(&ctx, mode)
.await
.context("discover failed")?;
let mut stats = MetadataStats::default();
// Run-scoped dedup of `source_manga_key`s already processed this pass.
// A shift in the source index causes the slot-last item of the page
// we just read to reappear at slot 0 of the next page; skipping it
// here prevents redundant fetch_manga + upsert and avoids spuriously
// tripping the stop condition with a re-confirm of an entry we
// already counted.
let mut seen: HashSet<String> = HashSet::new();
let mut consecutive_unchanged: usize = 0;
let mut walked_to_completion = false;
let mut hit_limit = false;
let mut hit_stop_condition = false;
// Circuit-breaker state: consecutive fetch_manga failures. A sustained
// run abort (source outage) leaves the pass un-clean → recovery sweep
// next tick.
let mut consecutive_failures = 0u32;
let mut hit_failure_breaker = false;
let mut hit_incremental_stop = false;
'outer: loop {
let batch = match walker.next_batch(&ctx).await? {
@@ -196,58 +116,19 @@ pub async fn run_metadata_pass(
}
};
for r in batch {
// Cooperative checkpoint: if a coordinated browser restart is
// pending, yield our (long-lived) lease so the drain can
// proceed instead of stalling for the rest of the walk. The
// pass exits un-clean, so the next tick recovery-sweeps the
// tail we didn't reach.
if browser_manager.is_restart_pending() {
tracing::info!(
"metadata pass: browser restart pending — yielding (recovery sweep next tick)"
);
break 'outer;
}
if max_refs.map(|m| stats.discovered >= m).unwrap_or(false) {
hit_limit = true;
tracing::info!(cap = ?max_refs, "max_results reached; halting walk");
break 'outer;
}
// Skip refs we've already *successfully* processed this pass.
// Checking `contains` here (rather than `insert`) keeps the key
// out of `seen` on failure paths below, so a transient fetch or
// upsert error gets a second chance if the ref reappears in
// another batch. Done *before* counting toward
// `stats.discovered` (the skipped ref did no work) and *before*
// touching the stop check (a `continue` here doesn't let a
// re-confirm trip the stop condition). The matching
// `seen.insert(...)` lives just after the successful upsert
// below.
if seen.contains(&r.source_manga_key) {
tracing::debug!(
key = %r.source_manga_key,
"skip already-seen key in this run"
);
continue;
}
stats.discovered += 1;
if let Some(s) = status {
s.set_phase(crate::crawler::status::Phase::FetchingMetadata {
index: stats.discovered,
total: max_refs,
title: r.title.clone(),
})
.await;
}
tracing::info!(
idx = stats.discovered,
key = %r.source_manga_key,
"fetching metadata"
);
let manga = match source.fetch_manga(&ctx, &r).await {
Ok(m) => {
consecutive_failures = 0;
m
}
Ok(m) => m,
Err(e) => {
tracing::warn!(
key = %r.source_manga_key,
@@ -256,63 +137,10 @@ pub async fn run_metadata_pass(
"fetch_manga failed"
);
stats.mangas_failed += 1;
consecutive_failures += 1;
if should_abort_pass(consecutive_failures, max_consecutive_failures) {
hit_failure_breaker = true;
tracing::error!(
consecutive_failures,
threshold = max_consecutive_failures,
"metadata pass: too many consecutive fetch_manga failures; \
aborting (recovery sweep on next tick)"
);
break 'outer;
}
continue;
}
};
// Partial-render guard: an empty chapter list paired with a
// prior count > 0 is overwhelmingly a chromium snapshot
// taken between the #chapter_table wrapper render and its
// rows render. The wait_for_selector wait in `navigate`
// narrows this window but cannot close it for slow renders
// beyond the selector budget. Treat as a transient failure
// here — skip upsert, skip seen.insert — so the next batch
// (or the next tick) retries. Skipped in `skip_chapters`
// mode because the parser is configured to return an empty
// Vec by design there.
if !skip_chapters && manga.chapters.is_empty() {
match repo::crawler::live_chapter_count_for_source_manga(
db, source_id, &r.source_manga_key,
)
.await
{
Ok(prior) if prior > 0 => {
tracing::warn!(
key = %r.source_manga_key,
url = %r.url,
prior_chapter_count = prior,
"fetch_manga returned empty chapters but prior count > 0; treating as partial-render transient and skipping"
);
stats.mangas_failed += 1;
continue;
}
Ok(_) => {}
Err(e) => {
// DB lookup failed — fail safe: skip rather
// than risk a soft-drop on a manga whose prior
// count we couldn't confirm.
tracing::warn!(
key = %r.source_manga_key,
error = ?e,
"live_chapter_count_for_source_manga failed; skipping cautiously"
);
stats.mangas_failed += 1;
continue;
}
}
}
let upsert = match repo::crawler::upsert_manga_from_source(
db, source_id, &r.url, &manga,
)
@@ -330,10 +158,6 @@ pub async fn run_metadata_pass(
}
};
stats.upserted += 1;
// Record success in the dedup set. Cover and chapter-sync
// failures below are non-fatal and don't roll this back —
// metadata is the durable source of truth for the dedup.
seen.insert(r.source_manga_key.clone());
tracing::info!(
key = %manga.source_manga_key,
manga_id = %upsert.manga_id,
@@ -349,14 +173,7 @@ pub async fn run_metadata_pass(
|| matches!(upsert.status, repo::crawler::UpsertStatus::Updated);
if needs_cover {
if let Some(cover_url) = manga.cover_url.as_deref() {
if let Some(s) = status {
s.set_current_cover(Some(crate::crawler::status::CoverTarget {
manga_id: upsert.manga_id,
manga_title: manga.title.clone(),
}))
.await;
}
let cover_result = download_and_store_cover(
match download_and_store_cover(
db,
storage,
http,
@@ -364,14 +181,9 @@ pub async fn run_metadata_pass(
&r.url,
upsert.manga_id,
cover_url,
allowlist,
max_image_bytes,
)
.await;
if let Some(s) = status {
s.set_current_cover(None).await;
}
match cover_result {
.await
{
Ok(()) => stats.covers_fetched += 1,
Err(e) => tracing::warn!(
manga_id = %upsert.manga_id,
@@ -382,13 +194,7 @@ pub async fn run_metadata_pass(
}
}
// Chapter sync. `chapters_new` feeds the stop check below:
// `None` (skip_chapters mode, or a logged-and-swallowed sync
// error) refuses to stop on this manga because we can't
// confirm "no new chapters."
let chapters_new: Option<usize> = if skip_chapters {
None
} else {
if !skip_chapters {
match repo::crawler::sync_manga_chapters(
db,
source_id,
@@ -397,65 +203,79 @@ pub async fn run_metadata_pass(
)
.await
{
Ok(diff) => {
tracing::info!(
manga_id = %upsert.manga_id,
new = diff.new,
refreshed = diff.refreshed,
dropped = diff.dropped,
"chapters synced"
);
Some(diff.new)
}
Err(e) => {
tracing::warn!(
manga_id = %upsert.manga_id,
error = ?e,
"chapter sync failed"
);
None
}
Ok(diff) => tracing::info!(
manga_id = %upsert.manga_id,
new = diff.new,
refreshed = diff.refreshed,
dropped = diff.dropped,
"chapters synced"
),
Err(e) => tracing::warn!(
manga_id = %upsert.manga_id,
error = ?e,
"chapter sync failed"
),
}
};
}
if should_stop(was_clean, upsert.status, chapters_new) {
hit_stop_condition = true;
// Incremental stop: count consecutive Unchanged upserts and
// bail once the threshold is reached. New/Updated resets the
// streak so a fresh entry mid-batch doesn't accidentally trip
// the cutoff.
match upsert.status {
repo::crawler::UpsertStatus::Unchanged => {
consecutive_unchanged += 1;
}
repo::crawler::UpsertStatus::New | repo::crawler::UpsertStatus::Updated => {
consecutive_unchanged = 0;
}
}
if should_stop(mode, consecutive_unchanged) {
hit_incremental_stop = true;
tracing::info!(
key = %manga.source_manga_key,
"stop condition met (Unchanged metadata + 0 new chapters); halting walk"
consecutive_unchanged,
"incremental stop threshold reached; halting walk"
);
break 'outer;
}
}
}
// Recovery-flag write. Only on a clean exit (end-of-walk OR the
// intentional stop). `hit_limit` is a caller-imposed early break
// and does NOT count — the catalog tail wasn't reached, so a future
// tick still needs to walk past where we stopped. The truth table is
// pinned by `should_mark_clean_exit` so a future edit that adds
// `hit_limit` back into the disjunction trips its unit test. Flag-
// write errors are warned and swallowed: the run already did its
// work, and a stale `false` flag just buys a recovery sweep on the
// next tick.
let exited_cleanly = should_mark_clean_exit(walked_to_completion, hit_stop_condition);
if exited_cleanly {
if let Err(e) = repo::crawler::mark_run_completed(db, source_id).await {
tracing::warn!(error = ?e, "mark_run_completed failed");
// Drop pass: only when the walk truly covered everything the source
// surfaces. `last_seen_at` on un-visited rows is stale, so running
// the drop on a partial walk would soft-drop the tail of the index.
let full_walk = walked_to_completion && !hit_limit && !hit_incremental_stop;
let backfill_complete = full_walk && matches!(mode, DiscoverMode::Backfill);
if full_walk {
match repo::crawler::mark_dropped_mangas(db, source_id, run_started_at).await {
Ok(n) => tracing::info!(dropped = n, "marked unseen manga as dropped"),
Err(e) => tracing::warn!(error = ?e, "drop-pass failed"),
}
} else {
tracing::info!(
?mode,
hit_limit,
hit_incremental_stop,
"partial sync — skipping drop pass"
);
}
if backfill_complete {
if let Err(e) = repo::crawler::mark_seed_completed(db, source_id, run_started_at).await {
tracing::warn!(error = ?e, "mark_seed_completed failed");
} else {
tracing::info!(source_id, "seed marked complete");
}
}
tracing::info!(
was_clean,
?mode,
discovered = stats.discovered,
upserted = stats.upserted,
covers_fetched = stats.covers_fetched,
mangas_failed = stats.mangas_failed,
walked_to_completion,
hit_limit,
hit_stop_condition,
hit_failure_breaker,
exited_cleanly,
hit_incremental_stop,
"metadata pass complete"
);
@@ -463,20 +283,8 @@ pub async fn run_metadata_pass(
Ok(stats)
}
/// Quarantine window for chapters whose latest `SyncChapterContent` job is
/// `dead`. The partial dedup index `crawler_jobs_chapter_content_dedup_idx`
/// only blocks `(pending|running)` duplicates, so without this gate a
/// permanently-failing chapter is re-enqueued every cron tick, burns
/// `max_attempts` retries, dies again, and spins forever. With the gate,
/// dead chapters get a week of silence before the next attempt — long
/// enough for a transient site issue to resolve, short enough that
/// permanent failures don't stay permanent if conditions change.
const CHAPTER_DEAD_QUARANTINE_DAYS: i64 = 7;
/// Enqueue a `SyncChapterContent` job for every chapter of *any* bookmarked
/// manga that still has `page_count = 0` and a non-dropped source row.
/// Chapters whose latest job is `dead` within `CHAPTER_DEAD_QUARANTINE_DAYS`
/// are excluded to break the dead-letter spin.
/// Returns `(inserted, skipped)` counts. Dedup index handles repeats.
pub async fn enqueue_bookmarked_pending(pool: &PgPool) -> anyhow::Result<EnqueueSummary> {
let rows: Vec<(String, Uuid, String)> = sqlx::query_as(
@@ -487,18 +295,10 @@ pub async fn enqueue_bookmarked_pending(pool: &PgPool) -> anyhow::Result<Enqueue
JOIN chapter_sources cs ON cs.chapter_id = c.id
WHERE c.page_count = 0
AND cs.dropped_at IS NULL
AND NOT EXISTS (
SELECT 1 FROM crawler_jobs cj
WHERE cj.payload->>'kind' = 'sync_chapter_content'
AND cj.payload->>'chapter_id' = c.id::text
AND cj.state = 'dead'
AND cj.updated_at > now() - ($1::bigint || ' days')::interval
)
GROUP BY cs.source_id, c.id, cs.source_chapter_key, c.manga_id, c.number, c.created_at
ORDER BY c.manga_id, c.number ASC, c.created_at ASC
GROUP BY cs.source_id, c.id, cs.source_chapter_key, c.manga_id, c.created_at
ORDER BY c.manga_id, c.created_at ASC
"#,
)
.bind(CHAPTER_DEAD_QUARANTINE_DAYS)
.fetch_all(pool)
.await
.context("query bookmarked-pending chapters")?;
@@ -527,34 +327,23 @@ pub async fn enqueue_bookmarked_pending(pool: &PgPool) -> anyhow::Result<Enqueue
}
/// Enqueue chapter-content jobs for a *single* manga (the bookmark-create
/// hook). Same dedup semantics as [`enqueue_bookmarked_pending`], including
/// the dead-letter quarantine — a freshly bookmarked manga should not
/// burn retries on chapters that just died on the cron tick.
/// hook). Same dedup semantics as [`enqueue_bookmarked_pending`].
pub async fn enqueue_pending_for_manga(
pool: &PgPool,
manga_id: Uuid,
) -> anyhow::Result<EnqueueSummary> {
let rows: Vec<(String, Uuid, String)> = sqlx::query_as(
r#"
SELECT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
SELECT DISTINCT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
FROM chapters c
JOIN chapter_sources cs ON cs.chapter_id = c.id
WHERE c.manga_id = $1
AND c.page_count = 0
AND cs.dropped_at IS NULL
AND NOT EXISTS (
SELECT 1 FROM crawler_jobs cj
WHERE cj.payload->>'kind' = 'sync_chapter_content'
AND cj.payload->>'chapter_id' = c.id::text
AND cj.state = 'dead'
AND cj.updated_at > now() - ($2::bigint || ' days')::interval
)
GROUP BY cs.source_id, c.id, cs.source_chapter_key, c.number, c.created_at
ORDER BY c.number ASC, c.created_at ASC, cs.source_id
ORDER BY cs.source_id, c.id
"#,
)
.bind(manga_id)
.bind(CHAPTER_DEAD_QUARANTINE_DAYS)
.fetch_all(pool)
.await
.context("query pending chapters for manga")?;
@@ -589,149 +378,11 @@ pub struct EnqueueSummary {
pub failed: usize,
}
/// Counters returned by a cover-backfill call. All fields start at zero
/// via `Default`.
#[derive(Debug, Default, Clone, Copy)]
pub struct CoverBackfillStats {
/// Number of missing-cover entries the call attempted.
pub considered: usize,
/// Number of entries whose cover was downloaded and stored.
pub fetched: usize,
/// Number of entries that failed at any step of the attempt.
pub failed: usize,
}
/// Default per-tick cap for [`backfill_missing_covers`]. The metadata pass
/// already retries covers when its walk reaches the affected manga; this
/// backfill exists to catch the residual case where the early-stop
/// optimisation prevents the walk from reaching mangas whose cover failed
/// on first attempt. A small cap is enough because the backlog only grows
/// from sporadic download failures, not from systematic misses.
pub const COVER_BACKFILL_DEFAULT_MAX: usize = 10;
/// Re-attempt cover downloads for mangas where `cover_image_path IS NULL`
/// but a live `manga_sources` row exists. Refetches the source detail
/// page (which is where the cover URL lives) and downloads the cover.
///
/// Bounded by `max_mangas` per call so a steady stream of failing covers
/// — e.g. a CDN host that's persistently 502 — can't monopolise a cron
/// tick. Orders by `manga_sources.last_seen_at DESC` so the freshest
/// missing-cover mangas are addressed first.
///
/// Failures are logged and counted, not raised: a single bad cover URL
/// must not stall every other backfill behind it.
///
/// Returns the per-call counters. `Err` is returned only when listing the
/// missing covers or acquiring the browser lease fails; per-manga errors
/// are folded into `CoverBackfillStats::failed`. When `max_mangas` is 0
/// or nothing is missing, the function returns before a browser lease is
/// acquired.
#[allow(clippy::too_many_arguments)]
pub async fn backfill_missing_covers(
browser_manager: &BrowserManager,
db: &PgPool,
storage: &dyn Storage,
http: &reqwest::Client,
rate: &HostRateLimiters,
max_mangas: usize,
allowlist: &DownloadAllowlist,
max_image_bytes: usize,
status: Option<&crate::crawler::status::StatusHandle>,
tor: Option<&crate::crawler::tor::TorController>,
) -> anyhow::Result<CoverBackfillStats> {
let mut stats = CoverBackfillStats::default();
// A zero cap disables the backfill entirely.
if max_mangas == 0 {
return Ok(stats);
}
let entries = repo::crawler::list_missing_covers(db, max_mangas as i64)
.await
.context("list_missing_covers")?;
// Nothing to do: avoid taking a browser lease for an empty backlog.
if entries.is_empty() {
return Ok(stats);
}
// One lease is held for the whole loop and released explicitly at the end.
let lease = browser_manager
.acquire()
.await
.context("acquire browser lease for cover backfill")?;
let browser_ref: &chromiumoxide::Browser = &lease;
let ctx = FetchContext { browser: browser_ref, rate, tor };
let total = entries.len();
// `index` is zero-based (it comes straight from `enumerate`).
for (index, entry) in entries.into_iter().enumerate() {
stats.considered += 1;
if let Some(s) = status {
s.set_phase(crate::crawler::status::Phase::CoverBackfill { index, total })
.await;
}
// Metadata-only TargetSource: skip chapter-list parsing so a
// missing-cover refetch doesn't soft-drop chapters on a partial
// render. Cover URL alone is what we need.
let source = TargetSource::new(entry.source_url.clone()).without_chapter_parsing();
// The title is left empty here; only the key and URL are filled in.
let r = SourceMangaRef {
source_manga_key: entry.source_manga_key.clone(),
title: String::new(),
url: entry.source_url.clone(),
};
let manga = match source.fetch_manga(&ctx, &r).await {
Ok(manga) => manga,
Err(e) => {
tracing::warn!(
manga_id = %entry.manga_id,
url = %entry.source_url,
error = ?e,
"cover backfill: fetch_manga failed"
);
stats.failed += 1;
continue;
}
};
// A detail page without a cover URL counts as a failure for this entry.
let Some(cover_url) = manga.cover_url.clone() else {
tracing::warn!(
manga_id = %entry.manga_id,
url = %entry.source_url,
"cover backfill: source returned no cover_url"
);
stats.failed += 1;
continue;
};
// Publish the in-flight cover target for the duration of the download.
if let Some(s) = status {
s.set_current_cover(Some(crate::crawler::status::CoverTarget {
manga_id: entry.manga_id,
manga_title: manga.title.clone(),
}))
.await;
}
let cover_result = download_and_store_cover(
db,
storage,
http,
rate,
&entry.source_url,
entry.manga_id,
&cover_url,
allowlist,
max_image_bytes,
)
.await;
// Clear the target before inspecting the result so it is reset on both
// the success and the failure path.
if let Some(s) = status {
s.set_current_cover(None).await;
}
match cover_result {
Ok(()) => stats.fetched += 1,
Err(e) => {
tracing::warn!(
manga_id = %entry.manga_id,
url = %entry.source_url,
error = ?e,
"cover backfill: download failed"
);
stats.failed += 1;
}
}
}
drop(lease);
Ok(stats)
}
/// Download a cover image and persist its storage path. Local to the
/// pipeline because the CLI still calls it from its inline chapter-content
/// loop; once the worker pool fully replaces that path we can fold this
/// into `pipeline` proper.
#[allow(clippy::too_many_arguments)]
pub(crate) async fn download_and_store_cover(
async fn download_and_store_cover(
db: &PgPool,
storage: &dyn Storage,
http: &reqwest::Client,
@@ -739,8 +390,6 @@ pub(crate) async fn download_and_store_cover(
manga_url: &str,
manga_id: Uuid,
cover_url: &str,
allowlist: &DownloadAllowlist,
max_image_bytes: usize,
) -> anyhow::Result<()> {
let absolute = reqwest::Url::parse(manga_url)
.context("parse manga URL")?
@@ -748,22 +397,17 @@ pub(crate) async fn download_and_store_cover(
.context("join cover URL onto manga URL")?;
rate.wait_for(absolute.as_str()).await?;
let bytes = fetch_bytes_capped(
http,
absolute.as_str(),
Some(manga_url),
allowlist,
max_image_bytes,
)
.await?;
if !looks_like_image(&bytes) {
anyhow::bail!(
"cover URL {absolute} returned non-image bytes; refusing to store as binary blob"
);
}
let ext = infer::get(&bytes)
.map(|k| k.extension())
.expect("looks_like_image asserted infer succeeded");
let resp = http
.get(absolute.clone())
.header(reqwest::header::REFERER, manga_url)
.send()
.await
.with_context(|| format!("GET {absolute}"))?
.error_for_status()
.with_context(|| format!("non-2xx for {absolute}"))?;
let bytes = resp.bytes().await.context("read cover body")?;
let kind = infer::get(&bytes);
let ext = kind.map(|k| k.extension()).unwrap_or("bin");
let key = format!("mangas/{manga_id}/cover.{ext}");
storage
@@ -783,124 +427,41 @@ pub(crate) async fn download_and_store_cover(
Ok(())
}
use crate::crawler::url_utils::origin_of;
/// Reduce `url` to its `scheme://authority` prefix (everything before
/// the first `/` that follows `://`). Returns `None` when the input has
/// no `://` separator. The authority is passed through verbatim — port
/// and case are preserved.
fn origin_of(url: &str) -> Option<String> {
    url.split_once("://").map(|(scheme, remainder)| {
        // Everything up to the first path slash; the whole remainder
        // when the URL has no path component.
        let authority = match remainder.find('/') {
            Some(slash) => &remainder[..slash],
            None => remainder,
        };
        format!("{scheme}://{authority}")
    })
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn stop_condition_fires_on_unchanged_metadata_and_zero_new_chapters() {
// The whole point of the rule: in steady state, a manga whose
// metadata hash matches AND whose chapter list gained no new
// entries proves we've reached the caught-up tail of a
// newest-first index.
assert!(should_stop(true, UpsertStatus::Unchanged, Some(0)));
fn backfill_never_stops_regardless_of_streak() {
assert!(!should_stop(DiscoverMode::Backfill, 0));
assert!(!should_stop(DiscoverMode::Backfill, 100));
assert!(!should_stop(DiscoverMode::Backfill, usize::MAX));
}
#[test]
fn stop_condition_refuses_when_chapters_added() {
// Unchanged metadata + N new chapters means the source bumped
// this manga because of the chapter add; the rest of the index
// is still ahead of us. Don't bail.
assert!(!should_stop(true, UpsertStatus::Unchanged, Some(1)));
assert!(!should_stop(true, UpsertStatus::Unchanged, Some(42)));
fn incremental_stops_when_streak_meets_threshold() {
let mode = DiscoverMode::Incremental {
stop_after_unchanged: 3,
};
assert!(!should_stop(mode, 0));
assert!(!should_stop(mode, 2));
assert!(should_stop(mode, 3), "stops at exactly the threshold");
assert!(should_stop(mode, 100), "stops at anything past threshold");
}
#[test]
fn stop_condition_refuses_when_metadata_changed() {
// Updated or New metadata always continues — even with zero new
// chapters — because the change-of-metadata bump itself is what
// the walk is following.
assert!(!should_stop(true, UpsertStatus::Updated, Some(0)));
assert!(!should_stop(true, UpsertStatus::New, Some(0)));
}
#[test]
fn stop_condition_refuses_when_chapter_count_unknown() {
// skip_chapters mode (CLI metadata-only sweep) or a
// logged-and-swallowed chapter sync error: we can't claim "no
// new chapters" from absence of evidence, so don't stop. The
// operator who runs metadata-only intentionally wants a full
// walk anyway.
assert!(!should_stop(true, UpsertStatus::Unchanged, None));
}
#[test]
fn stop_condition_disabled_in_recovery_mode() {
// was_clean = false means the previous run did not exit cleanly;
// the catalog past its crash point is potentially un-synced. Walk
// to end-of-source no matter what individual mangas report.
assert!(!should_stop(false, UpsertStatus::Unchanged, Some(0)));
assert!(!should_stop(false, UpsertStatus::Unchanged, Some(1)));
assert!(!should_stop(false, UpsertStatus::Updated, Some(0)));
assert!(!should_stop(false, UpsertStatus::New, None));
}
#[test]
fn abort_pass_fires_at_threshold_and_respects_disable() {
// Disabled (0) never fires, no matter how many failures.
assert!(!should_abort_pass(0, 0));
assert!(!should_abort_pass(100, 0));
// Below threshold: keep going.
assert!(!should_abort_pass(9, 10));
// At/above threshold: abort.
assert!(should_abort_pass(10, 10));
assert!(should_abort_pass(11, 10));
}
#[test]
fn clean_exit_when_walked_to_completion() {
// End-of-walk reached the catalog tail — the recovery flag may
// safely flip back to `true`.
assert!(should_mark_clean_exit(true, false));
}
#[test]
fn clean_exit_when_stop_condition_fired() {
// First Unchanged + 0-new-chapter manga is a complete steady-
// state exit: every manga newer than this point was synced, and
// by source-side `update_date DESC` ordering everything past
// this point is at least as caught-up.
assert!(should_mark_clean_exit(false, true));
}
#[test]
fn dirty_exit_when_neither_completion_nor_stop_fired() {
// The walk ended for some other reason — including the
// caller-imposed `hit_limit` cap, which is the regression case
// this test exists for. `should_mark_clean_exit` does not take
// `hit_limit` as a parameter, so a future edit that adds
// `|| hit_limit` to the inline expression in `run_metadata_pass`
// would need to also touch this helper, and would fail this
// assertion when it did.
assert!(!should_mark_clean_exit(false, false));
}
#[test]
fn run_scoped_seen_set_skips_duplicate_source_manga_keys() {
// Pins the per-ref loop contract: `contains` gates whether work
// runs, and `insert` only fires on the success path (after upsert).
// A failed ref that reappears later in the same pass must get a
// second chance — that's why the loop uses contains-then-insert
// instead of insert-and-skip-on-collision.
let mut seen: HashSet<String> = HashSet::new();
// First sighting of a key: not yet seen → loop proceeds.
assert!(!seen.contains("manga-a"), "first sighting is unseen");
// Simulate a failed fetch_manga: do NOT insert. Next sighting must
// still be considered unseen so the loop retries it.
assert!(!seen.contains("manga-a"), "failed key is still retryable");
// Now simulate a successful upsert — insert is called.
seen.insert("manga-a".to_string());
// Subsequent sightings of the same key are skipped.
assert!(seen.contains("manga-a"), "successful key is now seen");
// Distinct keys never collide.
assert!(!seen.contains("manga-b"), "different key independent");
seen.insert("manga-b".to_string());
assert!(seen.contains("manga-b"));
assert!(seen.contains("manga-a"), "first key still recorded");
fn incremental_with_zero_threshold_stops_immediately() {
// A nonsensical config (no Unchanged needed to stop) shouldn't
// panic — it just means the very first ref triggers the bail.
let mode = DiscoverMode::Incremental {
stop_after_unchanged: 0,
};
assert!(should_stop(mode, 0));
}
}

View File

@@ -98,9 +98,15 @@ impl HostRateLimiters {
}
}
// `host_of` was duplicated across session/rate_limit/pipeline; the
// canonical version now lives in `crawler::url_utils`.
use crate::crawler::url_utils::host_of;
/// Extract the lowercased host (no port, no userinfo) from a URL string.
///
/// Returns `None` for inputs without a `scheme://host` shape — those
/// would never have reached the network layer anyway.
///
/// Handles the authority forms a naive "split on the last colon" gets
/// wrong:
/// - bracketed IPv6 literals (`http://[::1]/` → `[::1]`, brackets kept),
///   where the last colon is *inside* the address, not a port separator;
/// - userinfo (`http://user:pw@host/` → `host`);
/// - a query or fragment directly after the host (`http://host?x=1`).
fn host_of(url: &str) -> Option<String> {
    let after_scheme = url.split_once("://")?.1;
    // The authority ends at the first path, query, or fragment delimiter.
    let authority = after_scheme
        .split(|c| matches!(c, '/' | '?' | '#'))
        .next()?;
    // Drop `user:pass@` if present; the host is whatever follows the last `@`.
    let host_with_port = authority.rsplit_once('@').map_or(authority, |(_, h)| h);
    let host = if host_with_port.starts_with('[') {
        // Bracketed IPv6: the host runs through the closing bracket and
        // any port comes after it.
        match host_with_port.find(']') {
            Some(end) => &host_with_port[..=end],
            None => host_with_port,
        }
    } else {
        host_with_port
            .rsplit_once(':')
            .map_or(host_with_port, |(h, _)| h)
    };
    (!host.is_empty()).then(|| host.to_ascii_lowercase())
}
#[cfg(test)]
mod tests {

View File

@@ -1,279 +0,0 @@
//! Admin-triggered resync of a single manga's metadata + cover, or a
//! single chapter's content.
//!
//! The cron tick already retries covers and chapter content on its own
//! schedule. This module exists for the operator-controlled path:
//! "this manga's metadata is stale / its cover never landed / this
//! chapter is broken — pull from source now, not at the next daily
//! tick." Wired into the admin API, never into the queue, so the work
//! happens synchronously with the HTTP request and the admin sees the
//! refreshed row in the response.
//!
//! Shares the daemon's [`BrowserManager`], rate limiter, HTTP client,
//! and TOR controller so a force resync respects the same per-host
//! pacing and recircuit budget the daily crawl uses — admin actions
//! must not let an operator accidentally hammer the source.
use std::sync::Arc;
use anyhow::Context;
use async_trait::async_trait;
use sqlx::PgPool;
use uuid::Uuid;
use crate::crawler::browser_manager::BrowserManager;
use crate::crawler::content::{self, SyncOutcome};
use crate::crawler::pipeline;
use crate::crawler::rate_limit::HostRateLimiters;
use crate::crawler::safety::DownloadAllowlist;
use crate::crawler::source::target::TargetSource;
use crate::crawler::source::{FetchContext, Source, SourceMangaRef};
use crate::crawler::tor::TorController;
use crate::repo;
use crate::repo::crawler::UpsertStatus;
use crate::storage::Storage;
/// Outcome of [`ResyncService::resync_manga`]. Mirrors the bits the
/// admin UI cares about — was the row actually re-upserted, did the
/// cover land — so the response can show "metadata refreshed, cover
/// re-downloaded" or "metadata unchanged" without a second round-trip.
#[derive(Debug, Clone, Copy)]
pub struct MangaResyncOutcome {
    /// Id of the manga row the upsert resolved to.
    pub manga_id: Uuid,
    /// Status reported by the metadata upsert.
    pub metadata_status: UpsertStatus,
    /// `true` only when the cover download-and-store step succeeded;
    /// `false` when the source exposed no cover URL or the download
    /// failed (the failure is logged, not returned as an error).
    pub cover_fetched: bool,
}
/// Outcome of [`ResyncService::resync_chapter`].
///
/// `Fetched` is the success case and carries the number of pages
/// pulled. `Skipped` is a non-error no-op with a human-readable
/// `reason`. A chapter with no source to resync from is *not* reported
/// as `Skipped` — that case surfaces as [`ResyncError::NoChapterSource`].
#[derive(Debug, Clone)]
pub enum ChapterResyncOutcome {
    /// Content was fetched; `pages` is the page count reported by the sync.
    Fetched { chapter_id: Uuid, pages: usize },
    /// Nothing was fetched; `reason` explains why.
    Skipped { chapter_id: Uuid, reason: String },
}
/// Service exposed by the daemon to the admin API. Optional on
/// [`AppState`] — `None` when the crawler daemon is disabled
/// (`CRAWLER_DAEMON=false`), in which case admin handlers return 503.
///
/// Both methods run the work inline and resolve once it has finished.
/// Failures with a stable, mappable shape are reported as
/// [`ResyncError`] wrapped in the `anyhow::Error`.
#[async_trait]
pub trait ResyncService: Send + Sync {
    /// Re-pull metadata (and, where possible, cover and chapter list)
    /// for the manga identified by `manga_id`.
    async fn resync_manga(&self, manga_id: Uuid) -> anyhow::Result<MangaResyncOutcome>;
    /// Re-pull the content of the chapter identified by `chapter_id`.
    async fn resync_chapter(&self, chapter_id: Uuid) -> anyhow::Result<ChapterResyncOutcome>;
}
/// Errors with a stable shape so the API layer can map them to the
/// right HTTP status (404 vs 422 vs 5xx). Anything else surfaces as a
/// generic 500.
#[derive(Debug, thiserror::Error)]
pub enum ResyncError {
    /// No live (non-dropped) `manga_sources` row exists for the manga.
    #[error("manga has no source to resync from")]
    NoMangaSource,
    /// The chapter's dispatch-target lookup returned no row.
    #[error("chapter has no source to resync from")]
    NoChapterSource,
}
/// Production [`ResyncService`]: performs the resync against the real
/// browser, database, storage backend, and HTTP client.
pub struct RealResyncService {
    /// Source of browser leases; one lease is acquired per resync call.
    pub browser_manager: Arc<BrowserManager>,
    /// Postgres pool used for source lookups, upserts, and chapter sync.
    pub db: PgPool,
    /// Storage backend handed to the cover / chapter-content downloaders.
    pub storage: Arc<dyn Storage>,
    /// HTTP client handed to the cover / chapter-content downloaders.
    pub http: reqwest::Client,
    /// Per-host rate limiters, passed to every fetch this service makes.
    pub rate: Arc<HostRateLimiters>,
    /// Host allowlist passed to the image download helpers.
    pub download_allowlist: DownloadAllowlist,
    /// Per-image size cap (bytes) passed to the image download helpers.
    pub max_image_bytes: usize,
    /// Optional TOR controller; `None` is passed through as "no TOR".
    pub tor: Option<Arc<TorController>>,
}
#[async_trait]
impl ResyncService for RealResyncService {
    /// Force-refresh one manga from its freshest live source row.
    ///
    /// Steps, in order: look up the source row, fetch the manga page,
    /// upsert metadata, re-download the cover (best effort), and sync
    /// the chapter list (best effort, gated by the partial-render guard).
    ///
    /// # Errors
    /// - [`ResyncError::NoMangaSource`] when no live `manga_sources` row
    ///   exists for `manga_id`.
    /// - Any failure of the source lookup, browser lease, page fetch, or
    ///   metadata upsert.
    ///
    /// Cover-download and chapter-sync failures are logged and do *not*
    /// fail the call; `cover_fetched` reports the cover result.
    async fn resync_manga(&self, manga_id: Uuid) -> anyhow::Result<MangaResyncOutcome> {
        // Pick the freshest live source row. Multi-source mangas
        // (theoretical — only one Source impl today) get the row whose
        // `last_seen_at` is newest; soft-dropped rows are skipped.
        let row: Option<(String, String, String)> = sqlx::query_as(
            "SELECT source_id, source_manga_key, source_url \
             FROM manga_sources \
             WHERE manga_id = $1 AND dropped_at IS NULL \
             ORDER BY last_seen_at DESC \
             LIMIT 1",
        )
        .bind(manga_id)
        .fetch_optional(&self.db)
        .await
        .context("look up manga_sources for resync")?;
        let Some((_source_id, source_manga_key, source_url)) = row else {
            return Err(ResyncError::NoMangaSource.into());
        };

        let lease = self
            .browser_manager
            .acquire()
            .await
            .context("acquire browser lease for manga resync")?;
        let browser_ref: &chromiumoxide::Browser = &lease;
        let ctx = FetchContext {
            browser: browser_ref,
            rate: &self.rate,
            tor: self.tor.as_deref(),
        };

        // Parse chapters too — a force resync is "make this manga fully
        // current," not just metadata.
        let source = TargetSource::new(source_url.clone());
        let r = SourceMangaRef {
            source_manga_key: source_manga_key.clone(),
            title: String::new(),
            url: source_url.clone(),
        };
        let manga = source
            .fetch_manga(&ctx, &r)
            .await
            .with_context(|| format!("fetch_manga during resync of {manga_id}"))?;

        // Partial-render guard: an empty parsed chapter list is only
        // trustworthy when the manga had no live chapters before.
        // Decided once, here, and consulted again at the chapter-sync
        // step below. The prior count is only queried when it matters
        // (empty fetch), and a failed count query fails CLOSED: not
        // knowing the prior count must never be treated as "there were
        // none", or a transient DB error plus a partial render would
        // soft-drop every chapter.
        let source_id = source.id();
        let chapter_sync_safe = if !manga.chapters.is_empty() {
            true
        } else {
            match repo::crawler::live_chapter_count_for_source_manga(
                &self.db,
                source_id,
                &source_manga_key,
            )
            .await
            {
                Ok(0) => true,
                Ok(_) => {
                    tracing::warn!(
                        %manga_id,
                        source_url = %source_url,
                        "resync_manga: fetch returned empty chapters but prior count > 0; skipping chapter sync to avoid soft-drop"
                    );
                    false
                }
                Err(e) => {
                    tracing::warn!(
                        %manga_id,
                        source_url = %source_url,
                        error = ?e,
                        "resync_manga: could not read prior chapter count; skipping chapter sync to avoid soft-drop"
                    );
                    false
                }
            }
        };

        let upsert = repo::crawler::upsert_manga_from_source(
            &self.db,
            source_id,
            &source_url,
            &manga,
        )
        .await
        .with_context(|| format!("upsert_manga_from_source during resync of {manga_id}"))?;

        // Cover refetch: force-download regardless of UpsertStatus.
        // Admin clicked "resync" because they want the cover too.
        let mut cover_fetched = false;
        if let Some(cover_url) = manga.cover_url.as_deref() {
            match pipeline::download_and_store_cover(
                &self.db,
                self.storage.as_ref(),
                &self.http,
                &self.rate,
                &source_url,
                upsert.manga_id,
                cover_url,
                &self.download_allowlist,
                self.max_image_bytes,
            )
            .await
            {
                Ok(()) => cover_fetched = true,
                Err(e) => tracing::warn!(
                    %manga_id,
                    error = ?e,
                    "resync_manga: cover download failed"
                ),
            }
        }

        // Chapter sync — only when the partial-render guard allowed it.
        if chapter_sync_safe {
            match repo::crawler::sync_manga_chapters(
                &self.db,
                source_id,
                upsert.manga_id,
                &manga.chapters,
            )
            .await
            {
                Ok(diff) => tracing::info!(
                    %manga_id,
                    new = diff.new,
                    refreshed = diff.refreshed,
                    dropped = diff.dropped,
                    "resync_manga: chapters synced"
                ),
                Err(e) => tracing::warn!(
                    %manga_id,
                    error = ?e,
                    "resync_manga: chapter sync failed"
                ),
            }
        }

        drop(lease);
        Ok(MangaResyncOutcome {
            manga_id: upsert.manga_id,
            metadata_status: upsert.status,
            cover_fetched,
        })
    }

    /// Force-refresh one chapter's content from its source.
    ///
    /// # Errors
    /// - [`ResyncError::NoChapterSource`] when the chapter has no
    ///   dispatch target.
    /// - Any failure of the lookup, browser lease, or content sync.
    /// - A plain error when the sync reports an expired source session —
    ///   that needs operator action and cannot be retried automatically.
    async fn resync_chapter(&self, chapter_id: Uuid) -> anyhow::Result<ChapterResyncOutcome> {
        let row = repo::chapter::dispatch_target(&self.db, chapter_id)
            .await
            .context("look up chapter_sources for resync")?;
        let Some((manga_id, source_url, _title, _number)) = row else {
            return Err(ResyncError::NoChapterSource.into());
        };

        let lease = self
            .browser_manager
            .acquire()
            .await
            .context("acquire browser lease for chapter resync")?;
        let result = content::sync_chapter_content(
            &lease,
            &self.db,
            self.storage.as_ref(),
            &self.http,
            &self.rate,
            chapter_id,
            manga_id,
            &source_url,
            // NOTE(review): presumably the "force" flag (re-fetch even
            // when pages already exist) — confirm against
            // `content::sync_chapter_content`.
            true,
            &self.download_allowlist,
            self.max_image_bytes,
            self.tor.as_deref(),
            // Admin resync isn't a daemon worker slot — no live status.
            None,
        )
        .await;
        // Release the browser before interpreting the outcome so an
        // error return below doesn't keep the lease any longer.
        drop(lease);

        match result? {
            SyncOutcome::Fetched { pages } => {
                Ok(ChapterResyncOutcome::Fetched { chapter_id, pages })
            }
            SyncOutcome::Skipped => Ok(ChapterResyncOutcome::Skipped {
                chapter_id,
                reason: "chapter already had pages on disk".to_string(),
            }),
            SyncOutcome::SessionExpired => {
                anyhow::bail!("source session expired — operator must refresh PHPSESSID")
            }
        }
    }
}

View File

@@ -1,558 +0,0 @@
//! Defensive helpers for the image-download paths.
//!
//! Two threats this module addresses:
//!
//! - **SSRF**: a scraped chapter or manga page can embed an absolute
//! `<img src="http://10.0.0.1/...">`. The crawler runs inside the
//! backend container with intra-compose access to `postgres:5432`
//! and possibly other internal services; without a host check the
//! crawler would happily probe them. [`is_safe_url`] rejects
//! anything whose host isn't on the operator-configured allowlist,
//! plus any IP literal in RFC1918 / loopback / link-local / unique-
//! local space (including IPv4-mapped IPv6 like `::ffff:127.0.0.1`)
//! as a second defence for the case where a private IP literal ends up
//! on the allowlist (or the host check is bypassed). Hostnames that
//! merely *resolve* to a private address are not caught — see below.
//!
//! **DNS rebinding is not covered.** A hostname like `cdn.allowed.com`
//! that *resolves* to `127.0.0.1` via hostile DNS bypasses the IP
//! check entirely — `is_safe_url` only inspects URL strings, not
//! resolved IPs. Mitigating that requires a custom reqwest resolver
//! that filters IPs after DNS, which would mean rebuilding reqwest's
//! connector. The allowlist + good operator DNS hygiene is the
//! realistic mitigation today.
//!
//! - **Unbounded download**: `Response::bytes().await` reads the full
//! body before returning. A malicious source serving a 10 GiB image
//! would fill memory and then disk. [`accumulate_capped`] streams
//! the body chunk-by-chunk into a [`bytes::BytesMut`] and bails as
//! soon as the running total exceeds the cap.
//!
//! Both helpers are pure-data: the SSRF check is keyed off a parsed
//! URL string, and the byte accumulator is keyed off a generic stream.
//! Easy to unit-test without a live network or browser.
use std::net::IpAddr;
use anyhow::{bail, Context};
use bytes::BytesMut;
use futures_util::StreamExt;
use reqwest::Url;
/// Default per-image download cap. A page image is generally <2 MiB;
/// 32 MiB leaves headroom for high-resolution covers while still
/// stopping a misbehaving CDN dead. Override via `CRAWLER_MAX_IMAGE_BYTES`.
pub const DEFAULT_MAX_IMAGE_BYTES: usize = 32 * 1024 * 1024;
/// The set of hosts the crawler is allowed to download images from.
///
/// Nothing is allowed by default — keeping the surface area minimal so
/// the only way a URL gets through is if it matches an explicit
/// catalog/CDN entry added with [`DownloadAllowlist::allow`].
///
/// `allow_any` flips the host check off entirely (private-IP and
/// scheme checks still apply). It exists for operators whose sources
/// shard images across numbered CDN subdomains (`cdn1`, `cdn2`, …)
/// where enumerating each host upfront is impractical. Off by default.
#[derive(Clone, Debug, Default)]
pub struct DownloadAllowlist {
    /// Allowed hosts, stored ASCII-lowercased and de-duplicated.
    hosts: Vec<String>,
    /// When `true`, [`DownloadAllowlist::contains`] accepts every host.
    allow_any: bool,
}

impl DownloadAllowlist {
    /// An allowlist that admits no host at all.
    pub fn new() -> Self {
        Self::default()
    }

    /// Bypass the host allowlist. Scheme, localhost, and private-IP
    /// checks in [`is_safe_url`] continue to apply — this only opens
    /// up public hosts that weren't pre-enumerated.
    pub fn allow_any() -> Self {
        Self {
            allow_any: true,
            ..Self::default()
        }
    }

    /// Add a host (case-insensitive match). Sub-domains are *not*
    /// implied: pass `cdn.example.com` and `example.com` separately
    /// if both should be reachable. Empty strings and duplicates are
    /// ignored.
    pub fn allow(mut self, host: impl Into<String>) -> Self {
        let normalized = host.into().to_ascii_lowercase();
        if !normalized.is_empty() && !self.hosts.contains(&normalized) {
            self.hosts.push(normalized);
        }
        self
    }

    /// `true` when this allowlist admits no host whatsoever.
    ///
    /// An `allow_any` list is never empty even though it enumerates no
    /// hosts: it admits every host, so reporting it as "empty" would
    /// contradict the "an empty allowlist rejects everything" contract.
    pub fn is_empty(&self) -> bool {
        !self.allow_any && self.hosts.is_empty()
    }

    /// Whether `host` is admitted, compared ASCII-case-insensitively.
    pub fn contains(&self, host: &str) -> bool {
        // Stored entries are already lowercase, so a case-insensitive
        // comparison is equivalent to lowercasing `host` first, without
        // allocating on every call.
        self.allow_any || self.hosts.iter().any(|h| h.eq_ignore_ascii_case(host))
    }
}
/// Verify a URL is safe for the crawler to fetch.
///
/// Rejects, in this order:
/// - unparseable URLs,
/// - non-http(s) schemes (file://, gopher://, …),
/// - URLs without a host,
/// - the hostname `localhost`, including its fully-qualified
///   trailing-dot form (`localhost.`) and any `*.localhost` subdomain,
/// - any IP *literal* in private / loopback / link-local / unique-local
///   space — checked before, and regardless of, the allowlist,
/// - hosts that aren't on the supplied allowlist.
///
/// Only the URL string is inspected. A hostname whose DNS record points
/// at a private address is NOT detected here.
///
/// An empty allowlist rejects everything (the conservative default —
/// callers must explicitly allow the catalog and CDN hosts).
pub fn is_safe_url(raw_url: &str, allow: &DownloadAllowlist) -> Result<(), UrlSafetyError> {
    let url = Url::parse(raw_url).map_err(|_| UrlSafetyError::Unparseable)?;
    let scheme = url.scheme();
    if scheme != "http" && scheme != "https" {
        return Err(UrlSafetyError::BadScheme(scheme.to_string()));
    }
    let host = url.host_str().ok_or(UrlSafetyError::NoHost)?;
    let lower_host = host.to_ascii_lowercase();

    // `localhost.` is the same name as `localhost` (the trailing dot
    // only marks it fully qualified), and `*.localhost` is reserved for
    // loopback as well. Comparing the raw host alone would let both
    // through whenever the host check is bypassed via `allow_any`.
    let dns_name = lower_host.strip_suffix('.').unwrap_or(&lower_host);
    if dns_name == "localhost" || dns_name.ends_with(".localhost") {
        return Err(UrlSafetyError::Loopback);
    }

    // Reject IP literals in private/loopback ranges regardless of the
    // allowlist — if someone puts an IP literal on the allowlist they
    // almost certainly didn't mean a private range.
    // reqwest::Url normalises IPv6 literals as `[::1]` (brackets
    // included) in `host_str()`. Strip the brackets before parsing.
    let ip_candidate = lower_host
        .strip_prefix('[')
        .and_then(|s| s.strip_suffix(']'))
        .unwrap_or(&lower_host);
    if let Ok(ip) = ip_candidate.parse::<IpAddr>() {
        if is_private_ip(&ip) {
            return Err(UrlSafetyError::PrivateIp(ip));
        }
    }

    // The allowlist is matched against the host exactly as it appears
    // in the URL (lowercased), so a trailing-dot variant of an allowed
    // host stays rejected unless it was listed explicitly.
    if !allow.contains(&lower_host) {
        return Err(UrlSafetyError::HostNotAllowed(lower_host));
    }
    Ok(())
}
/// True when `ip` must never be fetched by the crawler: loopback,
/// private, link-local, unspecified, or otherwise non-public address
/// space.
///
/// IPv6 addresses that merely *wrap* an IPv4 address — IPv4-mapped
/// (`::ffff:a.b.c.d`), IPv4-compatible (`::a.b.c.d`), and the NAT64
/// well-known prefix (`64:ff9b::/96`) — are unwrapped and judged by the
/// embedded IPv4 address, so e.g. `::127.0.0.1` cannot be used to reach
/// loopback through a translating stack.
fn is_private_ip(ip: &IpAddr) -> bool {
    match ip {
        IpAddr::V4(v4) => {
            let o = v4.octets();
            v4.is_loopback()
                || v4.is_private()
                || v4.is_link_local()
                || v4.is_unspecified()
                || v4.is_broadcast()
                || v4.is_multicast()
                // CGNAT 100.64.0.0/10
                || (o[0] == 100 && (o[1] & 0xC0) == 64)
                // 0.0.0.0/8 "this network" (special-use)
                || o[0] == 0
                // 198.18.0.0/15 benchmarking range
                || (o[0] == 198 && (o[1] & 0xFE) == 18)
                // 240.0.0.0/4 reserved
                || o[0] >= 240
        }
        IpAddr::V6(v6) => {
            // IPv4-mapped IPv6 (::ffff:0:0/96): unwrap to the embedded
            // IPv4 and recurse so `::ffff:127.0.0.1` is caught by the
            // IPv4 loopback check rather than passing through.
            // `Ipv6Addr::is_loopback()` only matches `::1` exactly.
            if let Some(v4) = v6.to_ipv4_mapped() {
                return is_private_ip(&IpAddr::V4(v4));
            }
            let s = v6.segments();
            // The low 32 bits of the address, read as an IPv4 address.
            let embedded_v4 = IpAddr::V4(std::net::Ipv4Addr::new(
                (s[6] >> 8) as u8,
                (s[6] & 0xff) as u8,
                (s[7] >> 8) as u8,
                (s[7] & 0xff) as u8,
            ));
            // NAT64 well-known prefix 64:ff9b::/96.
            let is_nat64 =
                s[0] == 0x0064 && s[1] == 0xff9b && s[2..6].iter().all(|&seg| seg == 0);
            // IPv4-compatible ::a.b.c.d. This also covers `::` and `::1`,
            // whose embedded forms 0.0.0.0 / 0.0.0.1 fall in 0.0.0.0/8.
            let is_v4_compatible = s[..6].iter().all(|&seg| seg == 0);
            if is_nat64 || is_v4_compatible {
                return is_private_ip(&embedded_v4);
            }
            v6.is_loopback()
                || v6.is_unspecified()
                || v6.is_multicast()
                // fc00::/7 unique-local
                || (s[0] & 0xfe00) == 0xfc00
                // fe80::/10 link-local
                || (s[0] & 0xffc0) == 0xfe80
        }
    }
}
/// Reason a URL was refused by [`is_safe_url`]. Each variant
/// corresponds to one of its checks.
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
pub enum UrlSafetyError {
    /// The string could not be parsed as a URL at all.
    #[error("URL is not parseable")]
    Unparseable,
    /// The scheme (carried in the variant) is neither `http` nor `https`.
    #[error("scheme {0:?} is not http or https")]
    BadScheme(String),
    /// The parsed URL has no host component.
    #[error("URL is missing a host")]
    NoHost,
    /// The host names the local machine (`localhost`).
    #[error("host points at the loopback interface")]
    Loopback,
    /// The host is an IP literal in non-public address space; carries
    /// the parsed address.
    #[error("host is a private/internal IP: {0}")]
    PrivateIp(IpAddr),
    /// The (lowercased) host is not admitted by the allowlist.
    #[error("host {0:?} is not on the crawler download allowlist")]
    HostNotAllowed(String),
}
/// Collect every chunk of `stream` into one contiguous buffer, failing
/// the moment the accumulated size would go past `max_bytes`.
///
/// The check happens *before* a chunk is appended, so the buffer never
/// holds more than `max_bytes`. A chunk-level stream error aborts the
/// collection and is reported with its message. Generic over the stream
/// so it can be exercised without a live HTTP response.
pub async fn accumulate_capped<S, E>(stream: S, max_bytes: usize) -> anyhow::Result<bytes::Bytes>
where
    S: futures_core::Stream<Item = Result<bytes::Bytes, E>>,
    E: std::error::Error + Send + Sync + 'static,
{
    let mut pinned = std::pin::pin!(stream);
    let mut collected = BytesMut::new();
    loop {
        let piece = match pinned.next().await {
            None => break,
            Some(Ok(piece)) => piece,
            Some(Err(e)) => return Err(anyhow::anyhow!("stream chunk: {e}")),
        };
        // Saturating add: a pathological chunk length must trip the cap
        // rather than wrap around it.
        let projected = collected.len().saturating_add(piece.len());
        if projected > max_bytes {
            bail!(
                "response exceeds {max_bytes}-byte cap (received >{}+{})",
                collected.len(),
                piece.len()
            );
        }
        collected.extend_from_slice(&piece);
    }
    Ok(collected.freeze())
}
/// Send `req` and stream the response into a length-limited buffer.
/// Combines [`is_safe_url`] check + [`accumulate_capped`] so each
/// call-site is one line.
pub async fn fetch_bytes_capped(
http: &reqwest::Client,
url: &str,
referer: Option<&str>,
allow: &DownloadAllowlist,
max_bytes: usize,
) -> anyhow::Result<bytes::Bytes> {
is_safe_url(url, allow).with_context(|| format!("reject unsafe URL {url}"))?;
let mut req = http.get(url);
if let Some(r) = referer {
req = req.header(reqwest::header::REFERER, r);
}
let resp = req
.send()
.await
.with_context(|| format!("GET {url}"))?
.error_for_status()
.with_context(|| format!("non-2xx for {url}"))?;
accumulate_capped(resp.bytes_stream(), max_bytes)
.await
.with_context(|| format!("download body for {url}"))
}
/// Content-sniff `bytes` and report whether they are one of the
/// *renderable* image formats: JPEG, PNG, WebP, GIF, or AVIF. Per the
/// original note, this mirrors the upload pipeline's whitelist in
/// `upload::parse_image` and the formats the `/files/*key` endpoint
/// serves with a correct Content-Type.
///
/// `infer::MatcherType::Image` is deliberately NOT the criterion — it
/// also covers BMP, TIFF, HEIF, ICO, PSD, and JP2, which would end up
/// served as `application/octet-stream` and downloaded instead of
/// rendered. Keep the two layers aligned.
pub fn looks_like_image(bytes: &[u8]) -> bool {
    const RENDERABLE_MIME_TYPES: [&str; 5] = [
        "image/jpeg",
        "image/png",
        "image/webp",
        "image/gif",
        "image/avif",
    ];
    match infer::get(bytes) {
        Some(kind) => RENDERABLE_MIME_TYPES.contains(&kind.mime_type()),
        // Unrecognised (or empty) input is never an image.
        None => false,
    }
}
#[cfg(test)]
mod tests {
use super::*;
use futures_util::stream;
fn allow_just(host: &str) -> DownloadAllowlist {
DownloadAllowlist::new().allow(host)
}
#[test]
fn allow_any_admits_arbitrary_public_host() {
// Operators who can't pre-enumerate a numbered-CDN fleet
// (cdn1, cdn2, …) opt into allow_any. Any public host passes.
let allow = DownloadAllowlist::allow_any();
assert!(is_safe_url("https://cdn7.random.tld/x.jpg", &allow).is_ok());
assert!(is_safe_url("https://anything-goes.example/", &allow).is_ok());
}
#[test]
fn allow_any_still_blocks_private_ips() {
// The point of the bypass is the host-allowlist check, not the
// SSRF defense. Private/loopback IPs stay refused.
let allow = DownloadAllowlist::allow_any();
for url in [
"http://10.0.0.1/",
"http://192.168.1.1/",
"http://169.254.169.254/",
"http://127.0.0.1/",
"http://[::1]/",
"http://[::ffff:127.0.0.1]/",
] {
assert!(
matches!(
is_safe_url(url, &allow).unwrap_err(),
UrlSafetyError::PrivateIp(_)
),
"allow_any must still reject {url}"
);
}
}
#[test]
fn allow_any_still_blocks_localhost() {
let allow = DownloadAllowlist::allow_any();
assert!(matches!(
is_safe_url("http://localhost:8080/", &allow).unwrap_err(),
UrlSafetyError::Loopback
));
}
#[test]
fn allow_any_still_blocks_non_http_schemes() {
let allow = DownloadAllowlist::allow_any();
assert!(matches!(
is_safe_url("file:///etc/passwd", &allow).unwrap_err(),
UrlSafetyError::BadScheme(_)
));
}
#[test]
fn safe_url_allows_listed_host() {
let allow = allow_just("cdn.example.com");
assert!(is_safe_url("https://cdn.example.com/img.jpg", &allow).is_ok());
}
#[test]
fn safe_url_blocks_unlisted_host() {
let allow = allow_just("cdn.example.com");
let err = is_safe_url("https://evil.example.org/img.jpg", &allow).unwrap_err();
assert!(matches!(err, UrlSafetyError::HostNotAllowed(h) if h == "evil.example.org"));
}
#[test]
fn safe_url_blocks_localhost_even_if_allowlisted() {
let allow = allow_just("localhost");
assert!(matches!(
is_safe_url("http://localhost:8080/", &allow).unwrap_err(),
UrlSafetyError::Loopback
));
}
#[test]
fn safe_url_blocks_loopback_ipv4() {
let allow = allow_just("127.0.0.1");
assert!(matches!(
is_safe_url("http://127.0.0.1/", &allow).unwrap_err(),
UrlSafetyError::PrivateIp(_)
));
}
#[test]
fn safe_url_blocks_rfc1918() {
let allow = allow_just("10.0.0.1");
for url in [
"http://10.0.0.1/",
"http://192.168.1.1/",
"http://172.16.0.5/",
"http://172.31.255.255/",
] {
assert!(
matches!(
is_safe_url(url, &allow).unwrap_err(),
UrlSafetyError::PrivateIp(_)
),
"should reject {url}"
);
}
}
#[test]
fn safe_url_blocks_link_local() {
let allow = allow_just("169.254.169.254");
// 169.254.169.254 is the AWS/GCP metadata service — the most
// dangerous SSRF target on a default cloud VM.
assert!(matches!(
is_safe_url("http://169.254.169.254/", &allow).unwrap_err(),
UrlSafetyError::PrivateIp(_)
));
}
#[test]
fn safe_url_blocks_ipv6_loopback_and_ula() {
// Debug what host_str returns first — reqwest::Url normalises
// IPv6 literals as `[::1]` with brackets, which doesn't parse
// as `IpAddr` directly. The implementation strips them.
let allow = allow_just("[::1]");
let err = is_safe_url("http://[::1]/", &allow).unwrap_err();
assert!(
matches!(err, UrlSafetyError::PrivateIp(_)),
"expected PrivateIp, got {err:?}"
);
let allow = allow_just("[fd00::1]");
let err = is_safe_url("http://[fd00::1]/", &allow).unwrap_err();
assert!(
matches!(err, UrlSafetyError::PrivateIp(_)),
"expected PrivateIp, got {err:?}"
);
}
#[test]
fn safe_url_blocks_ipv4_mapped_ipv6_loopback() {
// `Ipv6Addr::is_loopback()` only matches `::1` exactly, so
// `::ffff:127.0.0.1` would slip through without the
// to_ipv4_mapped() unwrap in is_private_ip.
let allow = allow_just("[::ffff:127.0.0.1]");
let err = is_safe_url("http://[::ffff:127.0.0.1]/", &allow).unwrap_err();
assert!(
matches!(err, UrlSafetyError::PrivateIp(_)),
"expected PrivateIp, got {err:?}"
);
}
#[test]
fn safe_url_blocks_ipv4_mapped_ipv6_rfc1918() {
let allow = allow_just("[::ffff:10.0.0.1]");
let err = is_safe_url("http://[::ffff:10.0.0.1]/", &allow).unwrap_err();
assert!(matches!(err, UrlSafetyError::PrivateIp(_)));
}
#[test]
fn safe_url_blocks_non_http_schemes() {
let allow = allow_just("anywhere");
assert!(matches!(
is_safe_url("file:///etc/passwd", &allow).unwrap_err(),
UrlSafetyError::BadScheme(_)
));
assert!(matches!(
is_safe_url("gopher://anywhere:70/", &allow).unwrap_err(),
UrlSafetyError::BadScheme(_)
));
}
#[test]
fn safe_url_rejects_unparseable() {
let allow = allow_just("anywhere");
assert!(matches!(
is_safe_url("not a url", &allow).unwrap_err(),
UrlSafetyError::Unparseable
));
}
#[test]
fn safe_url_empty_allowlist_rejects_everything() {
let allow = DownloadAllowlist::new();
let err = is_safe_url("https://cdn.example.com/img.jpg", &allow).unwrap_err();
assert!(matches!(err, UrlSafetyError::HostNotAllowed(_)));
}
#[test]
fn allowlist_matches_case_insensitively() {
let allow = DownloadAllowlist::new().allow("CDN.Example.COM");
assert!(is_safe_url("https://cdn.example.com/x.jpg", &allow).is_ok());
assert!(is_safe_url("https://CDN.EXAMPLE.com/x.jpg", &allow).is_ok());
}
#[tokio::test]
async fn accumulate_capped_returns_full_body_under_cap() {
let chunks: Vec<Result<bytes::Bytes, std::io::Error>> = vec![
Ok(bytes::Bytes::from_static(b"hello ")),
Ok(bytes::Bytes::from_static(b"world")),
];
let s = stream::iter(chunks);
let out = accumulate_capped(s, 100).await.unwrap();
assert_eq!(out.as_ref(), b"hello world");
}
#[tokio::test]
async fn accumulate_capped_bails_past_cap() {
let chunks: Vec<Result<bytes::Bytes, std::io::Error>> = vec![
Ok(bytes::Bytes::from(vec![0u8; 50])),
Ok(bytes::Bytes::from(vec![0u8; 60])),
];
let s = stream::iter(chunks);
let err = accumulate_capped(s, 100).await.unwrap_err();
assert!(err.to_string().contains("100-byte cap"));
}
#[tokio::test]
async fn accumulate_capped_surfaces_stream_errors() {
let chunks: Vec<Result<bytes::Bytes, std::io::Error>> = vec![
Ok(bytes::Bytes::from_static(b"ok")),
Err(std::io::Error::other("network blip")),
];
let s = stream::iter(chunks);
let err = accumulate_capped(s, 100).await.unwrap_err();
assert!(err.to_string().contains("network blip"));
}
#[test]
fn looks_like_image_accepts_jpeg() {
// JPEG SOI + APP0 segment.
let jpeg = [0xff, 0xd8, 0xff, 0xe0, 0, 0x10, b'J', b'F', b'I', b'F'];
assert!(looks_like_image(&jpeg));
}
#[test]
fn looks_like_image_accepts_png() {
let png = [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0, 0, 0, 0];
assert!(looks_like_image(&png));
}
#[test]
fn looks_like_image_rejects_html_disguised_as_image() {
let html = b"<html><body>not an image</body></html>";
assert!(!looks_like_image(html));
}
#[test]
fn looks_like_image_rejects_empty() {
assert!(!looks_like_image(&[]));
}
#[test]
fn looks_like_image_rejects_renderable_but_unsupported_formats() {
    // BMP, TIFF, ICO, PSD are `infer::MatcherType::Image`, but the
    // /files/*key handler has no Content-Type mapping for them: they
    // would be served as application/octet-stream and download instead
    // of render. The crawler rejects them so they never reach storage.

    // BMP magic: "BM" + 4-byte size.
    let bmp: &[u8] = &[b'B', b'M', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
    // TIFF little-endian magic: "II" + 42.
    let tiff: &[u8] = &[0x49, 0x49, 0x2a, 0x00, 0, 0, 0, 0];
    // ICO magic: 0x00,0x00,0x01,0x00.
    let ico: &[u8] = &[
        0x00, 0x00, 0x01, 0x00, 1, 0, 16, 16, 0, 0, 1, 0, 0x18, 0, 0x40, 0, 0, 0, 0x16, 0, 0, 0,
    ];

    let rejected = [
        (bmp, "BMP must be rejected (not renderable by /files)"),
        (tiff, "TIFF must be rejected"),
        (ico, "ICO must be rejected"),
    ];
    for &(magic, reason) in rejected.iter() {
        assert!(!looks_like_image(magic), "{}", reason);
    }
}
#[test]
fn looks_like_image_accepts_webp_gif_avif() {
    // The three remaining whitelisted formats, so that a future
    // tightening which drops one of them fails loudly here.
    let samples: [&[u8]; 3] = [
        // WebP: RIFF container, zeroed size field, "WEBP" + "VP8 " chunk tag.
        b"RIFF\0\0\0\0WEBPVP8 ",
        // GIF87a header plus four zero bytes.
        b"GIF87a\0\0\0\0",
        // AVIF: 0x18-byte `ftyp` box, major brand `avif`, compatible brands `mif1`, `avif`.
        b"\0\0\0\x18ftypavif\0\0\0\0mif1avif",
    ];
    for &sample in samples.iter() {
        assert!(looks_like_image(sample));
    }
}
}

View File

@@ -42,9 +42,36 @@ pub enum SessionProbe {
Transient,
}
/// Re-export so existing callers keep working after the helper moved
/// to `crawler::url_utils`. The body lives there.
pub use crate::crawler::url_utils::registrable_domain;
/// Compute the cookie domain (e.g. `.example.com`) from a start URL.
/// The leading dot makes the cookie cover every subdomain — the source
/// often redirects between `www.` and other prefixes mid-crawl, and a
/// host-only cookie would silently drop on the cross-subdomain hop.
///
/// Returns `None` when `url` has no `://` separator or an empty host.
/// Hosts that cannot take a leading-dot domain are returned verbatim
/// (lowercased, no dot): single-label names such as `localhost`, IPv4
/// literals, and bracketed IPv6 literals.
///
/// Only the authority part of the URL is considered: userinfo
/// (`user:pw@`), the port, and anything from the first `/`, `?` or `#`
/// onwards are ignored.
///
/// Caveat: this takes the last two dot-labels, which is wrong for
/// multi-part TLDs (`.co.uk`, `.com.br` would resolve to `.co.uk` and
/// attach to every site on `.co.uk`). For those, the operator should
/// override via `CRAWLER_COOKIE_DOMAIN` rather than relying on this
/// function — pulling in the Public Suffix List for one knob isn't
/// worth it yet.
pub fn registrable_domain(url: &str) -> Option<String> {
    let after_scheme = url.split_once("://")?.1;
    // The authority ends at the first path, query, or fragment delimiter;
    // splitting on `/` alone would leak `?query` into the host.
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()?;
    // Drop `user:password@` so the userinfo colon is not mistaken for a port.
    let host_with_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, host_port)| host_port);

    // Bracketed IPv6 literal: the colons inside the brackets are not a
    // port separator, so take everything up to the closing bracket.
    if host_with_port.starts_with('[') {
        let close = host_with_port.find(']')?;
        return Some(host_with_port[..=close].to_ascii_lowercase());
    }

    let host = host_with_port
        .rsplit_once(':')
        .map_or(host_with_port, |(h, _)| h)
        .to_ascii_lowercase();
    if host.is_empty() {
        return None;
    }
    // An IPv4 address has no registrable domain; keeping only the last
    // two octets (`.1.10`) would produce a meaningless cookie domain.
    if host.parse::<std::net::Ipv4Addr>().is_ok() {
        return Some(host);
    }

    let labels: Vec<&str> = host.split('.').filter(|l| !l.is_empty()).collect();
    if labels.len() < 2 {
        // Bare hostname (e.g. `localhost`) — return as-is, no leading
        // dot. Setting `.localhost` as cookie domain is invalid.
        return Some(host);
    }
    let registrable = &labels[labels.len() - 2..];
    Some(format!(".{}", registrable.join(".")))
}
/// Inject the PHPSESSID cookie into the browser's cookie store for the
/// catalog domain. Must be called before any navigation that depends on
@@ -100,54 +127,6 @@ pub fn classify_probe(html: &str) -> SessionProbe {
}
}
/// Three-way classification of a chapter page response, produced by
/// [`classify_chapter_probe`].
///
/// Reader pages don't render `#logo`, so [`classify_probe`] can't be
/// reused as-is. The chapter-specific marker is `a#pic_container`
/// (asserted by the reader-page parser at `parse_chapter_pages`).
///
/// Order matters in the classifier: the broken-page body check wins
/// over selector matches, so a transient site-wide 5xx that happens to
/// render the avatar widget elsewhere doesn't falsely reach `Ok`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChapterProbe {
    /// `a#pic_container` present — reader rendered. Whether
    /// `#avatar_menu` is also there is informational; if the reader
    /// loaded the session is by definition still good.
    Ok,
    /// Site rendered a "logged out" or "please log in" page (no
    /// reader, no broken-page body, and no avatar widget either).
    /// Distinguishes the genuine expired-session case from a
    /// transient site hiccup.
    Unauthenticated,
    /// Broken-page body, or reader didn't render but the user is
    /// still logged in (avatar widget present). Caller should retry
    /// rather than blame the session.
    Transient,
}
/// Classify the HTML of a chapter (reader) page into a [`ChapterProbe`].
///
/// Checks run in priority order:
/// 1. a broken-page body is always `Transient`;
/// 2. `a#pic_container` present means the reader rendered → `Ok`;
/// 3. otherwise `#avatar_menu` present means still logged in but the
///    reader is missing → `Transient`;
/// 4. neither marker → `Unauthenticated`.
pub fn classify_chapter_probe(html: &str) -> ChapterProbe {
    // The broken-page marker outranks every selector match.
    if is_broken_page_body(html) {
        return ChapterProbe::Transient;
    }

    let page = scraper::Html::parse_document(html);
    let has_match = |css: &str| -> bool {
        let selector = scraper::Selector::parse(css).unwrap();
        let found = page.select(&selector).next().is_some();
        found
    };

    if has_match("a#pic_container") {
        ChapterProbe::Ok
    } else if has_match("#avatar_menu") {
        // Logged-in user, but no reader: most likely an interstitial
        // or a shifted layout, so the caller should retry.
        ChapterProbe::Transient
    } else {
        // No reader, no avatar, no broken-body marker: the site served
        // its "please log in" page — the genuine session-expired
        // signal on this route.
        ChapterProbe::Unauthenticated
    }
}
/// In-startup retry budget for the session probe. Small but non-zero —
/// startup hitting a 5-second site hiccup shouldn't fail the operator
/// with "PHPSESSID expired" when the session is actually fine.
@@ -162,123 +141,37 @@ const PROBE_RETRY_DELAY: Duration = Duration::from_secs(2);
/// limiter. The trade is worth it — failing here costs ~1s; failing 30
/// minutes into a backfill costs 30 minutes.
pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> {
verify_session_with_recircuit(browser, probe_url, None, 0).await
}
/// TOR-aware variant of [`verify_session`].
///
/// With `tor = Some(..)` the probe loop signals `SIGNAL NEWNYM` between
/// retries on transient pages and also treats `Unauthenticated` as
/// recoverable, for up to `tor_max_attempts` total probes (never fewer
/// than one) with a NEWNYM between each.
///
/// With `tor = None` the `Unauthenticated` budget collapses to a single
/// attempt — fail-fast, exactly the pre-TOR behavior, which is what
/// `verify_session` relies on. A failed NEWNYM is logged and the retry
/// proceeds on the same circuit.
pub async fn verify_session_with_recircuit(
    browser: &Browser,
    probe_url: &str,
    tor: Option<&crate::crawler::tor::TorController>,
    tor_max_attempts: u32,
) -> anyhow::Result<()> {
    let unauth_max_attempts = match tor {
        Some(_) => tor_max_attempts.max(1),
        None => 1,
    };

    run_session_probe_loop(
        || fetch_probe_html(browser, probe_url),
        || async {
            let Some(controller) = tor else {
                return;
            };
            if let Err(e) = controller.new_identity().await {
                tracing::warn!(error = %e, "TOR NEWNYM failed; continuing with same circuit");
            }
        },
        PROBE_MAX_ATTEMPTS,
        unauth_max_attempts,
        PROBE_RETRY_DELAY,
        probe_url,
    )
    .await
}
/// Pure-over-IO loop body for the session probe. Generic over the
/// fetch and recircuit closures so it can be unit-tested without a
/// real browser or TOR daemon.
///
/// Both budgets count **total attempts**, including the first — so
/// `transient_max_attempts = 3` allows 3 fetches and 2 recircuits
/// between them, and `unauth_max_attempts = 1` means "fail-fast, no
/// retry". This matches [`crate::crawler::detect::retry_on_transient`]
/// and the content-path recircuit loop.
///
/// Outcomes:
/// - `SessionProbe::Ok` → return `Ok(())`.
/// - `SessionProbe::Unauthenticated` → recircuit + retry while
/// under the unauth budget. After the cap, bail with the
/// "PHPSESSID expired" diagnostic, mentioning the attempt count so
/// a TOR-misconfig diagnosis is easier.
/// - `SessionProbe::Transient` → same shape against the transient
/// budget; bails with "site down or rate-limiting" after the cap.
async fn run_session_probe_loop<F, Fut, R, RFut>(
mut fetch_html: F,
mut recircuit: R,
transient_max_attempts: u32,
unauth_max_attempts: u32,
retry_delay: Duration,
probe_url_for_msg: &str,
) -> anyhow::Result<()>
where
F: FnMut() -> Fut,
Fut: std::future::Future<Output = anyhow::Result<String>>,
R: FnMut() -> RFut,
RFut: std::future::Future<Output = ()>,
{
debug_assert!(transient_max_attempts >= 1);
debug_assert!(unauth_max_attempts >= 1);
let mut transient_attempts = 0u32;
let mut unauth_attempts = 0u32;
let mut attempt = 0u32;
loop {
let html = fetch_html().await?;
attempt += 1;
let html = fetch_probe_html(browser, probe_url).await?;
match classify_probe(&html) {
SessionProbe::Ok => {
tracing::info!(
transient_attempts,
unauth_attempts,
"session probe ok — #logo + #avatar_menu present"
);
tracing::info!(attempt, "session probe ok — #logo + #avatar_menu present");
return Ok(());
}
SessionProbe::Unauthenticated => {
unauth_attempts += 1;
if unauth_attempts >= unauth_max_attempts {
return Err(anyhow!(
"session probe failed — #avatar_menu not present at {probe_url_for_msg} \
after {unauth_attempts} attempt(s); PHPSESSID is missing, \
expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
));
}
return Err(anyhow!(
"session probe failed — #avatar_menu not present at {probe_url} \
(page rendered the normal layout); PHPSESSID is missing, expired, \
or revoked. Refresh CRAWLER_PHPSESSID and re-run."
));
}
SessionProbe::Transient if attempt < PROBE_MAX_ATTEMPTS => {
tracing::warn!(
attempt = unauth_attempts,
max_attempts = unauth_max_attempts,
"session probe Unauthenticated despite PHPSESSID; signaling TOR \
NEWNYM and retrying"
attempt,
max_attempts = PROBE_MAX_ATTEMPTS,
"session probe got a transient page; retrying"
);
recircuit().await;
tokio::time::sleep(retry_delay).await;
tokio::time::sleep(PROBE_RETRY_DELAY).await;
}
SessionProbe::Transient => {
transient_attempts += 1;
if transient_attempts >= transient_max_attempts {
return Err(anyhow!(
"session probe failed — probe page at {probe_url_for_msg} returned \
a broken-page response after {transient_max_attempts} attempts. \
The site appears to be down or rate-limiting us; try again \
later before refreshing CRAWLER_PHPSESSID."
));
}
tracing::warn!(
attempt = transient_attempts,
max_attempts = transient_max_attempts,
"session probe got a transient page; recircuit + retry"
);
recircuit().await;
tokio::time::sleep(retry_delay).await;
return Err(anyhow!(
"session probe failed — probe page at {probe_url} returned a \
broken-page response after {PROBE_MAX_ATTEMPTS} attempts. \
The site appears to be down or rate-limiting us; try again \
later before refreshing CRAWLER_PHPSESSID."
));
}
}
}
@@ -289,18 +182,7 @@ async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<
.new_page(probe_url)
.await
.with_context(|| format!("open probe page {probe_url}"))?;
crate::crawler::nav::wait_for_nav(&page)
.await
.context("wait for nav on probe")?;
// Best-effort wait for the layout marker. Timeout is fine — the
// probe classifier handles a missing `#logo` as Transient anyway,
// and the verify loop retries on Transient.
let _ = crate::crawler::nav::wait_for_selector(
&page,
"#logo",
crate::crawler::nav::SELECTOR_TIMEOUT,
)
.await;
page.wait_for_navigation().await.context("wait for nav on probe")?;
let html = page.content().await.context("read probe html")?;
page.close().await.ok();
Ok(html)
@@ -310,8 +192,44 @@ async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<
mod tests {
use super::*;
// registrable_domain tests live in crawler::url_utils now —
// it's the canonical home for that helper.
#[test]
fn registrable_domain_strips_subdomain() {
    // Any leading label (`www.`, `m.`) collapses to the dotted two-label domain.
    let www = registrable_domain("https://www.target-site.com/manga/foo/");
    assert_eq!(www.as_deref(), Some(".target-site.com"));

    let mobile = registrable_domain("https://m.example.org");
    assert_eq!(mobile.as_deref(), Some(".example.org"));
}
#[test]
fn registrable_domain_keeps_two_label_host() {
    // A host that is already two labels only gains the leading dot.
    let domain = registrable_domain("https://example.com/");
    assert_eq!(domain.as_deref(), Some(".example.com"));
}
#[test]
fn registrable_domain_handles_port() {
    // The `:8080` suffix must not end up in the cookie domain.
    let domain = registrable_domain("http://www.foo.bar:8080/x");
    assert_eq!(domain.as_deref(), Some(".foo.bar"));
}
#[test]
fn registrable_domain_bare_hostname_no_leading_dot() {
    // A single-label host comes back verbatim: `.localhost` would be
    // invalid as a cookie Domain.
    let domain = registrable_domain("http://localhost:5173");
    assert_eq!(domain.as_deref(), Some("localhost"));
}
#[test]
fn registrable_domain_returns_none_for_garbage() {
    // No `://` separator means there is no host to extract.
    let domain = registrable_domain("not a url");
    assert_eq!(domain, None);
}
#[test]
fn classify_probe_ok_when_logo_and_avatar_present() {
@@ -355,271 +273,6 @@ mod tests {
assert_eq!(classify_probe(""), SessionProbe::Transient);
}
#[test]
fn classify_chapter_probe_ok_when_reader_rendered() {
    // `a#pic_container` alone is enough for `Ok`.
    let reader_page = r#"
<html><body>
<a id="pic_container">
<img id="page1" src="https://cdn/1.jpg">
</a>
</body></html>
"#;
    let verdict = classify_chapter_probe(reader_page);
    assert_eq!(verdict, ChapterProbe::Ok);
}
#[test]
fn classify_chapter_probe_unauthenticated_when_no_reader_and_no_avatar() {
    // A logged-out hit on a chapter URL: normal site layout (header
    // etc.) around a "please log in" body, with neither the reader nor
    // the avatar widget.
    let logged_out_page = r#"
<html><body>
<header><div id="logo">Catalog</div></header>
<main>Please log in to read this chapter.</main>
</body></html>
"#;
    let verdict = classify_chapter_probe(logged_out_page);
    assert_eq!(verdict, ChapterProbe::Unauthenticated);
}
#[test]
fn classify_chapter_probe_transient_when_logged_in_but_reader_missing() {
    // The avatar proves the session is still valid; the reader is
    // absent because the site is serving an interstitial or the layout
    // momentarily shifted. That is a retry, not a session failure.
    let maintenance_page = r#"
<html><body>
<header><div id="logo">Catalog</div><div id="avatar_menu"></div></header>
<main>Site maintenance — back in 5 minutes.</main>
</body></html>
"#;
    let verdict = classify_chapter_probe(maintenance_page);
    assert_eq!(verdict, ChapterProbe::Transient);
}
#[test]
fn classify_chapter_probe_transient_on_broken_page_body() {
    // The site's broken-page body must classify as a retryable hiccup.
    let broken_page =
        "<html><body><p>we're sorry, the request file are not found.</p></body></html>";
    let verdict = classify_chapter_probe(broken_page);
    assert_eq!(verdict, ChapterProbe::Transient);
}
#[test]
fn classify_chapter_probe_does_not_misfire_on_avatar_alone_without_reader() {
    // Regression for the original bug: the binary
    // find_element("#avatar_menu") check read "no avatar" as
    // session-expired even when a transient hiccup was the real cause.
    // With `pic_container` present and no avatar widget, the
    // classifier must still answer `Ok`.
    let reader_without_avatar = r#"
<html><body>
<a id="pic_container">
<img id="page1" src="https://cdn/1.jpg">
</a>
</body></html>
"#;
    let verdict = classify_chapter_probe(reader_without_avatar);
    assert_eq!(verdict, ChapterProbe::Ok);
}
// --- run_session_probe_loop -----------------------------------------
//
// These tests exercise the recircuit-aware loop without a real
// browser. The fetch and recircuit closures are mocked over Vecs of
// canned outcomes / counters.
// Fixture with both `#logo` and `#avatar_menu`; the loop tests expect it to end the loop with `Ok`.
const OK_HTML: &str = r#"<html><body><div id="logo"></div><div id="avatar_menu"></div></body></html>"#;
// Fixture with `#logo` but no `#avatar_menu`; the loop tests expect it to count against the unauth budget.
const UNAUTH_HTML: &str = r#"<html><body><div id="logo"></div></body></html>"#;
// Fixture carrying the site's broken-page message; the loop tests expect it to count against the transient budget.
const TRANSIENT_HTML: &str = "<html><body><p>we're sorry, the request file are not found.</p></body></html>";
/// A probe that succeeds immediately fetches once and never recircuits.
#[tokio::test]
async fn probe_loop_ok_on_first_attempt_does_not_recircuit() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;

    let outcome = run_session_probe_loop(
        || {
            fetch_calls += 1;
            async { Ok(OK_HTML.to_string()) }
        },
        || {
            recircuit_calls += 1;
            async {}
        },
        3,
        3,
        Duration::ZERO,
        "https://example/probe",
    )
    .await;

    outcome.expect("ok on first attempt");
    assert_eq!(fetch_calls, 1);
    assert_eq!(recircuit_calls, 0);
}
/// With an unauth budget of 3 total attempts, an `Unauthenticated`
/// first probe followed by an `Ok` second probe recovers after exactly
/// one recircuit.
#[tokio::test]
async fn probe_loop_unauth_then_ok_when_attempt_budget_available() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;

    let outcome = run_session_probe_loop(
        || {
            fetch_calls += 1;
            let is_first = fetch_calls == 1;
            async move {
                let html = if is_first { UNAUTH_HTML } else { OK_HTML };
                Ok(html.to_string())
            }
        },
        || {
            recircuit_calls += 1;
            async {}
        },
        3,
        3,
        Duration::ZERO,
        "https://example/probe",
    )
    .await;

    outcome.expect("recovers after one recircuit");
    assert_eq!(fetch_calls, 2);
    assert_eq!(recircuit_calls, 1);
}
/// An unauth budget of 1 total attempt means no retry at all, which
/// matches the no-TOR behavior.
#[tokio::test]
async fn probe_loop_unauth_with_single_attempt_budget_fails_fast() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;

    let outcome = run_session_probe_loop(
        || {
            fetch_calls += 1;
            async { Ok(UNAUTH_HTML.to_string()) }
        },
        || {
            recircuit_calls += 1;
            async {}
        },
        3,
        1,
        Duration::ZERO,
        "https://example/probe",
    )
    .await;

    let err = outcome.expect_err("budget=1 → fail-fast");
    assert_eq!(fetch_calls, 1, "no retry when budget is 1");
    assert_eq!(recircuit_calls, 0);

    let msg = format!("{err:#}");
    assert!(msg.contains("Refresh CRAWLER_PHPSESSID"), "msg: {msg}");
    assert!(msg.contains("after 1 attempt"), "expected attempt count in msg: {msg}");
}
/// Three `Unauthenticated` probes against a budget of 3 fail after two
/// recircuits, and the error reports the attempt count.
#[tokio::test]
async fn probe_loop_unauth_after_exhausting_budget_emits_attempt_count() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;

    let outcome = run_session_probe_loop(
        || {
            fetch_calls += 1;
            async { Ok(UNAUTH_HTML.to_string()) }
        },
        || {
            recircuit_calls += 1;
            async {}
        },
        10, // transient budget irrelevant here
        3,  // 3 attempts total, 2 recircuits between
        Duration::ZERO,
        "https://example/probe",
    )
    .await;

    let err = outcome.expect_err("exhausts unauth budget");
    assert_eq!(fetch_calls, 3);
    assert_eq!(recircuit_calls, 2);

    let msg = format!("{err:#}");
    assert!(msg.contains("after 3 attempt"), "expected attempt count in error, got: {msg}");
}
/// A page that stays transient uses up the transient budget of 3 and
/// then fails with the broken-page diagnostic.
#[tokio::test]
async fn probe_loop_transient_repeats_until_max_then_errors() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;

    let outcome = run_session_probe_loop(
        || {
            fetch_calls += 1;
            async { Ok(TRANSIENT_HTML.to_string()) }
        },
        || {
            recircuit_calls += 1;
            async {}
        },
        3,
        1,
        Duration::ZERO,
        "https://example/probe",
    )
    .await;

    let err = outcome.expect_err("transient until max → fail");
    assert_eq!(fetch_calls, 3);
    // Recircuit fires between attempts: 3 attempts → 2 recircuits.
    assert_eq!(recircuit_calls, 2);

    let msg = format!("{err:#}");
    assert!(msg.contains("broken-page response after 3 attempts"), "msg: {msg}");
}
/// One transient page followed by a good page succeeds on the second
/// fetch, with a single recircuit in between.
#[tokio::test]
async fn probe_loop_transient_then_ok_returns_ok_after_one_recircuit() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;

    let outcome = run_session_probe_loop(
        || {
            fetch_calls += 1;
            let is_first = fetch_calls == 1;
            async move {
                let html = if is_first { TRANSIENT_HTML } else { OK_HTML };
                Ok(html.to_string())
            }
        },
        || {
            recircuit_calls += 1;
            async {}
        },
        3,
        1,
        Duration::ZERO,
        "https://example/probe",
    )
    .await;

    outcome.expect("ok on second try");
    assert_eq!(fetch_calls, 2);
    assert_eq!(recircuit_calls, 1);
}
/// A fetch error is not a probe outcome: it bubbles out on the first
/// call without consuming either retry budget.
#[tokio::test]
async fn probe_loop_propagates_fetch_errors_immediately() {
    let mut fetch_calls = 0u32;

    let outcome = run_session_probe_loop(
        || {
            fetch_calls += 1;
            async { Err(anyhow!("nav timeout")) }
        },
        || async {},
        5,
        5,
        Duration::ZERO,
        "https://example/probe",
    )
    .await;

    let err = outcome.expect_err("fetch error bubbles");
    assert_eq!(fetch_calls, 1);
    let rendered = format!("{err:#}");
    assert!(rendered.contains("nav timeout"));
}
#[test]
fn classify_probe_trusts_broken_body_over_stray_avatar_match() {
// Defensive: if a broken-page body somehow contains an

View File

@@ -1,180 +0,0 @@
//! Runtime-updatable crawler session (PHPSESSID).
//!
//! At startup the session comes from `CRAWLER_PHPSESSID`, but it expires
//! and previously needed a container restart to refresh. This controller
//! lets an admin push a fresh cookie at runtime: it rewrites the reqwest
//! cookie jar (CDN image fetches), updates the in-memory value the browser
//! `on_launch` hook reads, persists it to `crawler_state` (so it survives
//! a restart), and clears the sticky `session_expired` flag. A subsequent
//! coordinated browser restart re-runs `on_launch`, re-injecting the new
//! cookie into Chromium and re-probing.
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use anyhow::Context;
use serde_json::json;
use sqlx::PgPool;
use tokio::sync::RwLock;
/// `crawler_state.key` under which the runtime PHPSESSID is stored, as
/// a JSON object of the form `{"phpsessid": "<value>"}`.
const STATE_KEY_RUNTIME_SESSION: &str = "runtime_session";
/// Holds the crawler's current PHPSESSID together with everything
/// needed to push a new value out at runtime: the reqwest cookie jar,
/// the database pool used for persistence, and the shared
/// session-expired flag.
pub struct SessionController {
    /// Current PHPSESSID — what `on_launch` injects into a fresh browser.
    phpsessid: RwLock<Option<String>>,
    /// The same `Arc<Jar>` handed to the reqwest client; updating it here
    /// updates the client's cookies (the jar is internally mutable).
    cookie_jar: Arc<reqwest::cookie::Jar>,
    /// `Domain=` attribute used when seeding the jar. The jar is only
    /// touched when both this and `start_url` are set.
    cookie_domain: Option<String>,
    /// URL the jar cookie is registered against (parsed on each update).
    start_url: Option<String>,
    /// Pool used to persist the session into `crawler_state`.
    db: PgPool,
    /// Sticky flag, cleared by `update` and `clear_expired`.
    session_expired: Arc<AtomicBool>,
}
impl SessionController {
    /// Build a shared controller.
    ///
    /// `initial` is the PHPSESSID known at startup (if any). The jar,
    /// pool, and `session_expired` flag are stored as given, so the
    /// controller sees and mutates the same instances its caller holds.
    pub fn new(
        initial: Option<String>,
        cookie_jar: Arc<reqwest::cookie::Jar>,
        cookie_domain: Option<String>,
        start_url: Option<String>,
        db: PgPool,
        session_expired: Arc<AtomicBool>,
    ) -> Arc<Self> {
        Arc::new(Self {
            phpsessid: RwLock::new(initial),
            cookie_jar,
            cookie_domain,
            start_url,
            db,
            session_expired,
        })
    }

    /// The PHPSESSID a fresh browser should inject (None when unset).
    pub async fn current(&self) -> Option<String> {
        self.phpsessid.read().await.clone()
    }

    /// Whether the sticky session-expired flag is set (chapter workers
    /// idle while true).
    pub fn is_expired(&self) -> bool {
        self.session_expired.load(Ordering::Acquire)
    }

    /// Clear the session-expired flag without changing the cookie — used
    /// when the operator knows the session is fine and wants workers to
    /// resume immediately.
    pub fn clear_expired(&self) {
        self.session_expired.store(false, Ordering::Release);
    }

    /// Update the session everywhere: reqwest jar, in-memory value, and
    /// persisted `crawler_state`. Clears the session-expired flag. Does
    /// NOT relaunch the browser — the caller triggers a coordinated
    /// restart so `on_launch` re-injects + re-probes.
    ///
    /// # Errors
    /// Fails when the trimmed `sid` is empty or contains control
    /// characters, `;` or `,`; when `start_url` cannot be parsed; or
    /// when persisting to the database fails. In the last case the jar
    /// and in-memory value have already been updated, but the
    /// session-expired flag is left untouched.
    pub async fn update(&self, sid: &str) -> anyhow::Result<()> {
        let sid = sid.trim().to_string();
        anyhow::ensure!(!sid.is_empty(), "PHPSESSID must not be empty");
        // The value is spliced into a cookie string and a CDP CookieParam.
        // Reject control chars and cookie delimiters so a pasted value
        // can't smuggle extra attributes / break out of the cookie.
        anyhow::ensure!(
            sid.chars().all(|c| !c.is_control() && c != ';' && c != ','),
            "PHPSESSID contains invalid characters"
        );

        // The jar is only seeded when both the domain and the seed URL
        // are configured.
        if let (Some(domain), Some(start_url)) = (&self.cookie_domain, &self.start_url) {
            let cookie_str = format!("PHPSESSID={sid}; Domain={domain}; Path=/");
            let seed_url =
                reqwest::Url::parse(start_url).context("parse start_url for cookie seed")?;
            self.cookie_jar.add_cookie_str(&cookie_str, &seed_url);
        }

        *self.phpsessid.write().await = Some(sid.clone());
        persist(&self.db, &sid).await.context("persist runtime session")?;
        self.session_expired.store(false, Ordering::Release);
        tracing::info!("crawler session updated at runtime");
        Ok(())
    }

    /// Read a persisted runtime session (if any) from `crawler_state`.
    /// Called at startup so a mid-day refresh survives a restart.
    ///
    /// Best-effort: returns `None` when no row exists, when the stored
    /// JSON has no string `phpsessid` field, or when the query fails.
    /// A query failure is logged at `warn` so that a database problem
    /// can be told apart from "nothing was ever persisted".
    pub async fn load_persisted(db: &PgPool) -> Option<String> {
        let lookup = sqlx::query_scalar::<_, serde_json::Value>(
            "SELECT value FROM crawler_state WHERE key = $1",
        )
        .bind(STATE_KEY_RUNTIME_SESSION)
        .fetch_optional(db)
        .await;

        let row = match lookup {
            Ok(row) => row,
            Err(e) => {
                tracing::warn!(
                    error = %e,
                    "failed to read persisted runtime session from crawler_state"
                );
                return None;
            }
        };

        row.and_then(|v| {
            v.get("phpsessid")
                .and_then(|s| s.as_str())
                .map(|s| s.to_string())
        })
    }
}
/// Upsert `sid` into `crawler_state` under the runtime-session key as
/// `{"phpsessid": sid}`, setting `updated_at` to `now()` on both insert
/// and update.
async fn persist(db: &PgPool, sid: &str) -> sqlx::Result<()> {
    const UPSERT_SQL: &str = "INSERT INTO crawler_state (key, value, updated_at) \
         VALUES ($1, $2, now()) \
         ON CONFLICT (key) DO UPDATE \
         SET value = EXCLUDED.value, updated_at = now()";

    let payload = json!({ "phpsessid": sid });
    sqlx::query(UPSERT_SQL)
        .bind(STATE_KEY_RUNTIME_SESSION)
        .bind(payload)
        .execute(db)
        .await
        .map(|_| ())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: no initial session, `example.com` cookie scope, and the
    /// session-expired flag starting out set.
    fn controller(db: PgPool) -> Arc<SessionController> {
        let jar = Arc::new(reqwest::cookie::Jar::default());
        let expired = Arc::new(AtomicBool::new(true));
        SessionController::new(
            None,
            jar,
            Some("example.com".into()),
            Some("https://example.com/".into()),
            db,
            expired,
        )
    }

    #[sqlx::test(migrations = "./migrations")]
    async fn update_rejects_empty_and_control_chars(pool: PgPool) {
        let ctl = controller(pool);
        let bad_inputs = [
            (" ", "empty rejected"),
            ("abc\r\ndef", "CRLF rejected"),
            ("ab;Domain=evil", "semicolon rejected"),
            ("x,y", "comma rejected"),
        ];
        for &(candidate, why) in bad_inputs.iter() {
            assert!(ctl.update(candidate).await.is_err(), "{}", why);
        }
    }

    #[sqlx::test(migrations = "./migrations")]
    async fn update_persists_and_clears_expired_then_round_trips(pool: PgPool) {
        let ctl = controller(pool.clone());
        ctl.update("good-sid-123").await.unwrap();

        let in_memory = ctl.current().await;
        assert_eq!(in_memory.as_deref(), Some("good-sid-123"));
        assert!(!ctl.is_expired(), "update clears the expired flag");

        // Persisted to crawler_state and readable by a fresh load.
        let reloaded = SessionController::load_persisted(&pool).await;
        assert_eq!(reloaded.as_deref(), Some("good-sid-123"));
    }

    #[sqlx::test(migrations = "./migrations")]
    async fn clear_expired_flips_sticky_flag_without_touching_session(pool: PgPool) {
        // The `controller` fixture starts with the flag set to `true`.
        let ctl = controller(pool);
        assert!(ctl.is_expired(), "test fixture starts with the flag set");

        ctl.clear_expired();

        assert!(!ctl.is_expired(), "clear_expired flips the sticky flag to false");
        let session = ctl.current().await;
        assert!(session.is_none(), "clear_expired does not invent a session");
    }
}

View File

@@ -8,6 +8,19 @@ pub mod target;
use async_trait::async_trait;
use chromiumoxide::browser::Browser;
use serde::{Deserialize, Serialize};
/// How a `discover` job should walk the source's index. Passed to
/// `Source::discover`, which picks the page order from it.
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub enum DiscoverMode {
    /// Walk every index page from last back to first. Used for the
    /// initial seed of a source.
    Backfill,
    /// Walk index pages from page 1 forward, stopping after
    /// `stop_after_unchanged` consecutive mangas whose `metadata_hash`
    /// matches storage. Used for the recurring cron tick.
    // NOTE(review): the stop condition is enforced by the caller driving
    // the walk, not visible here — confirm against the discover job.
    Incremental { stop_after_unchanged: usize },
}
/// Pointer at a manga in the source's index, before we've fetched the
/// detail page. The `source_manga_key` is whatever stable id the source
@@ -67,21 +80,17 @@ pub struct SourceChapter {
pub struct FetchContext<'a> {
pub browser: &'a Browser,
pub rate: &'a crate::crawler::rate_limit::HostRateLimiters,
/// Optional TOR control-port client. When `Some`, retry helpers
/// signal `NEWNYM` between transient-page attempts so the next try
/// draws a fresh exit. `None` keeps pre-TOR behavior.
pub tor: Option<&'a crate::crawler::tor::TorController>,
}
/// Lazy iterator over discovered manga refs. The caller drives the
/// walk one batch at a time, so it can break out as soon as the
/// downstream stop condition is met (the first manga where metadata is
/// `Unchanged` and chapter sync reports zero new chapters) without
/// paying for pages it won't use.
/// walk one batch at a time, so it can break out as soon as a
/// downstream stop condition is met (e.g. N consecutive Unchanged
/// upserts in Incremental mode) without paying for pages it won't use.
///
/// Batches are typically one source-index page each. Within a batch
/// refs are in the source's natural newest-first ordering — the same
/// `update_date DESC` sort that makes the stop condition meaningful.
/// refs are already in the right per-page order for the active mode
/// (Backfill reverses each page to oldest-first; Incremental leaves
/// the source's natural newest-first ordering).
#[async_trait]
pub trait DiscoverWalk: Send {
/// Return the next batch of refs, or `Ok(None)` when the source has
@@ -98,14 +107,16 @@ pub trait Source: Send + Sync {
/// Stable identifier — also the row key in the `sources` table.
fn id(&self) -> &'static str;
/// Begin discovery. Returns a walker the caller drives page-by-page
/// via `next_batch`. The initial page-1 probe (used to determine
/// `last_page` and warm the cache for sites that can't be paged
/// without knowing the bound) happens inside this call, so a fresh
/// walker is ready to yield its first batch without further setup.
/// Begin discovery in `mode`. Returns a walker the caller drives
/// page-by-page via `next_batch`. The initial page-1 probe (used
/// to determine `last_page` and warm the cache for sites that
/// can't be paged without knowing the bound) happens inside this
/// call, so a fresh walker is ready to yield its first batch
/// without further setup.
async fn discover(
&self,
ctx: &FetchContext<'_>,
mode: DiscoverMode,
) -> anyhow::Result<Box<dyn DiscoverWalk + Send>>;
async fn fetch_manga(

View File

@@ -7,6 +7,7 @@
//! (`td:has(label:contains("Author:"))`) are implemented by walking
//! the parsed tree.
use std::collections::VecDeque;
use std::time::Duration;
use anyhow::Context;
@@ -14,24 +15,22 @@ use async_trait::async_trait;
use sha2::{Digest, Sha256};
use super::{
DiscoverWalk, FetchContext, Source, SourceChapter, SourceChapterRef, SourceManga,
SourceMangaRef,
DiscoverMode, DiscoverWalk, FetchContext, Source, SourceChapter, SourceChapterRef,
SourceManga, SourceMangaRef,
};
use crate::crawler::detect::{
has_logo_sentinel, is_broken_page_body, retry_on_transient_with_hook, PageError,
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError,
};
use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT};
/// `sources.id` value for this Source impl. Exposed as a const so the
/// daemon can look up per-source state (e.g. the recovery flag) before
/// constructing the Source itself.
/// daemon can look up per-source state (e.g. `seed_completed_at`)
/// before constructing the Source itself.
pub const SOURCE_ID: &str = "target";
/// In-loop retry budget for transient pages encountered during a single
/// `discover` walk. Bounded small because the next cron tick will pick up
/// where this run left off via the recovery flag — these inline retries
/// only need to absorb a brief site hiccup mid-walk, not a sustained
/// outage.
/// `discover` walk. Bounded small because the job system itself retries
/// the whole `Discover` job on failure — these inline retries only need
/// to absorb a brief site hiccup mid-walk.
const PAGE_TRANSIENT_RETRY_ATTEMPTS: u32 = 3;
const PAGE_TRANSIENT_RETRY_DELAY: Duration = Duration::from_secs(2);
@@ -73,25 +72,36 @@ impl Source for TargetSource {
async fn discover(
&self,
ctx: &FetchContext<'_>,
mode: DiscoverMode,
) -> anyhow::Result<Box<dyn DiscoverWalk + Send>> {
// Probe page 1 up front (with transient retry) for two reasons:
// a broken first page should abort cleanly rather than mid-walk,
// and the HTML is handed straight to the first `next_batch` call
// so the walker doesn't re-fetch it. Page count is discovered
// incrementally — see `TargetSourceWalker::next_batch`.
let first_html = retry_on_transient_with_hook(
|| async {
navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
},
// Always visit page 1 first because that's the only way to
// discover `last_page`. Retry it on transient — a broken first
// page would otherwise abort the whole walk before we've even
// started.
let first_html = retry_on_transient(
|| async { navigate(ctx, self.base_url.as_str()).await },
PAGE_TRANSIENT_RETRY_ATTEMPTS,
PAGE_TRANSIENT_RETRY_DELAY,
|| async { recircuit_if_configured(ctx.tor).await },
)
.await?;
let last_page = {
let doc = scraper::Html::parse_document(&first_html);
parse_last_page(&doc)
};
let backfill = matches!(mode, DiscoverMode::Backfill);
let order = build_page_order(last_page, backfill);
tracing::info!(
?mode,
last_page = ?last_page,
page_count = order.len(),
"walking pagination"
);
Ok(Box::new(TargetSourceWalker {
base_url: self.base_url.clone(),
next_page: 1,
backfill,
pages_remaining: order,
first_page_html: Some(first_html),
}))
}
@@ -101,17 +111,7 @@ impl Source for TargetSource {
ctx: &FetchContext<'_>,
r: &SourceMangaRef,
) -> anyhow::Result<SourceManga> {
// When we'll parse the chapter table, wait for at least one
// chapter row to appear — that's the marker most sensitive to
// the post-load JS partial-render race. When we won't, fall
// back to the layout-level `#logo` so we still wait for the
// page to settle.
let marker = if self.parse_chapters {
DETAIL_PAGE_CHAPTERS_MARKER
} else {
DETAIL_PAGE_LAYOUT_MARKER
};
let html = navigate(ctx, r.url.as_str(), marker).await?;
let html = navigate(ctx, r.url.as_str()).await?;
// Convert PageError → anyhow::Error via `?`. PageError stays
// downcastable from the wrapped anyhow::Error so the pipeline
// can still recognize Transient via `error.downcast_ref::<PageError>()`.
@@ -137,19 +137,28 @@ impl Source for TargetSource {
}
}
/// Walker returned by [`TargetSource::discover`]. Walks pages `1..` in
/// order, terminating as soon as a page renders cleanly with zero entries
/// — that's the "we ran off the end of the index" signal. Page 1's HTML
/// is cached at construction time (discover already had to fetch it for
/// the transient probe) so the first batch doesn't re-fetch.
///
/// A genuinely empty `Ok(vec![])` from `parse_manga_list_from` is what
/// stops us: the parser's `#logo` sentinel converts unrendered pages
/// into transient errors before they reach this loop, so an empty
/// parse result reliably means "no more entries."
/// Build the queue of page numbers `TargetSource::discover` will walk.
/// Backfill is oldest-first: pages `last..=1` (within each page the
/// walker reverses entries, since the source orders by update_date
/// DESC). Incremental is newest-first: pages `1..=last` in natural
/// order. If `last_page` is unknown (source surfaces no pagination)
/// only page 1 is visited.
///
/// A non-positive `last_page` is treated like an unknown one. Without
/// that clamp, `1..=last` would be empty and the walk would end before
/// visiting page 1.
fn build_page_order(last_page: Option<i32>, backfill: bool) -> VecDeque<i32> {
    // Page 1 always exists, so the upper bound is never below 1.
    let last = last_page.map_or(1, |n| n.max(1));
    if backfill {
        (1..=last).rev().collect()
    } else {
        (1..=last).collect()
    }
}
/// Walker returned by [`TargetSource::discover`]. Pops one source-index
/// page per `next_batch` call. Page 1's HTML is cached at construction
/// time (the discover call needed it to read `last_page` anyway) so the
/// batch covering page 1 doesn't re-fetch.
struct TargetSourceWalker {
base_url: String,
next_page: i32,
backfill: bool,
pages_remaining: VecDeque<i32>,
first_page_html: Option<String>,
}
@@ -159,104 +168,70 @@ impl DiscoverWalk for TargetSourceWalker {
&mut self,
ctx: &FetchContext<'_>,
) -> anyhow::Result<Option<Vec<SourceMangaRef>>> {
let page_num = self.next_page;
let page_refs = if page_num == 1 {
let Some(page_num) = self.pages_remaining.pop_front() else {
return Ok(None);
};
let mut page_refs = if page_num == 1 {
// Reuse the cached page-1 HTML from the initial probe. Take
// it (rather than clone) so a future re-entry that somehow
// revisits page 1 still falls back to a real fetch.
// it (rather than clone) so a malformed page-order queue
// that re-visits page 1 still falls back to a real fetch.
match self.first_page_html.take() {
Some(html) => {
let doc = scraper::Html::parse_document(&html);
parse_manga_list_from(&doc)?
}
None => {
retry_on_transient_with_hook(
retry_on_transient(
|| async {
let html = navigate(
ctx,
self.base_url.as_str(),
LIST_PAGE_MARKER,
)
.await?;
let html = navigate(ctx, self.base_url.as_str()).await?;
let doc = scraper::Html::parse_document(&html);
parse_manga_list_from(&doc)
},
PAGE_TRANSIENT_RETRY_ATTEMPTS,
PAGE_TRANSIENT_RETRY_DELAY,
|| async { recircuit_if_configured(ctx.tor).await },
)
.await?
}
}
} else {
retry_on_transient_with_hook(
retry_on_transient(
|| async {
let url = page_url(&self.base_url, page_num);
let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?;
let html = navigate(ctx, &url).await?;
let doc = scraper::Html::parse_document(&html);
parse_manga_list_from(&doc)
},
PAGE_TRANSIENT_RETRY_ATTEMPTS,
PAGE_TRANSIENT_RETRY_DELAY,
|| async { recircuit_if_configured(ctx.tor).await },
)
.await?
};
tracing::info!(page_num, count = page_refs.len(), "page walked");
if page_refs.is_empty() {
return Ok(None);
if self.backfill {
page_refs.reverse();
}
self.next_page += 1;
tracing::info!(page_num, count = page_refs.len(), "page walked");
Ok(Some(page_refs))
}
}
/// Per-page-type markers used by `navigate`'s post-navigation wait.
/// Each is the most specific element the parser will later look for —
/// waiting on it closes the partial-render race (e.g. `#chapter_table`
/// wrapper present but rows still being injected by post-load JS) that
/// the old fixed 1s sleep masked. See [`navigate`].
const LIST_PAGE_MARKER: &str = "#left_side .pic_list .updatesli";
const DETAIL_PAGE_CHAPTERS_MARKER: &str = "#chapter_table td h4 a.chico";
const DETAIL_PAGE_LAYOUT_MARKER: &str = "#logo";
/// Single point of rate-limited navigation. Every Source request goes
/// through here, so the per-host limiter map is the only knob that
/// controls per-origin RPS. Also the choke point for transient-page
/// detection — every fetched body is screened by
/// [`classify_navigate_html`] before being handed to a selector.
///
/// `marker` is a CSS selector the caller expects to find on the loaded
/// page. The wait is best-effort: a timeout is **not** an error
/// (legitimately-empty pages may never render the marker), it just
/// caps how long we'll hold for post-load JS to finish injecting
/// content. The parser's own sentinels and the universal broken-page
/// body check still catch real failures.
async fn navigate(
ctx: &FetchContext<'_>,
url: &str,
marker: &str,
) -> Result<String, PageError> {
async fn navigate(ctx: &FetchContext<'_>, url: &str) -> Result<String, PageError> {
ctx.rate.wait_for(url).await?;
let page = ctx
.browser
.new_page(url)
.await
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
match wait_for_nav(&page).await {
Ok(()) => {}
Err(NavError::Timeout(_)) => {
page.close().await.ok();
return Err(PageError::transient("nav timeout"));
}
Err(NavError::Cdp(e)) => {
page.close().await.ok();
return Err(PageError::Other(anyhow::Error::from(e)));
}
}
// Best-effort wait for the page-type marker. We deliberately
// discard a timeout here — see fn-level doc.
let _ = wait_for_selector(&page, marker, SELECTOR_TIMEOUT).await;
page.wait_for_navigation()
.await
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
// Stopgap until we wait on a specific selector per page type —
// gives any post-load JS a beat to finish injecting content.
tokio::time::sleep(Duration::from_secs(1)).await;
let html = page
.content()
.await
@@ -277,20 +252,20 @@ fn classify_navigate_html(html: String) -> Result<String, PageError> {
Ok(html)
}
/// Hook for [`retry_on_transient_with_hook`]: when TOR is configured,
/// signal `NEWNYM` so the next navigation draws a fresh exit. Errors
/// from the controller are logged and swallowed — failing to recircuit
/// shouldn't take down the crawl, the next attempt just runs on the
/// same circuit as before.
async fn recircuit_if_configured(tor: Option<&crate::crawler::tor::TorController>) {
if let Some(t) = tor {
if let Err(e) = t.new_identity().await {
tracing::warn!(error = %e, "TOR NEWNYM failed; retrying on same circuit");
}
}
/// Highest page number advertised by the index page's pagination links,
/// or `None` when no link carries purely numeric text.
fn parse_last_page(doc: &scraper::Html) -> Option<i32> {
    // Every pagination anchor's text is tried as an integer. Labels such
    // as Prev, Next or ellipses fail to parse and are skipped, so the
    // result does not depend on the exact pagination layout.
    let links = scraper::Selector::parse("#left_side .pagination a").unwrap();
    let mut highest: Option<i32> = None;
    for anchor in doc.select(&links) {
        let label = collapse_whitespace(&anchor.text().collect::<String>());
        if let Ok(n) = label.parse::<i32>() {
            highest = Some(highest.map_or(n, |best| best.max(n)));
        }
    }
    highest
}
/// Substitutes the first `/N/` path segment with the target page
/// number. Source impls that paginate via a different URL shape can
/// override this — for the modeled site the segment is always present.
@@ -394,7 +369,7 @@ fn parse_manga_detail(
.collect();
let chapters = if include_chapters {
parse_chapter_list(&doc)?
parse_chapter_list(&doc)
} else {
Vec::new()
};
@@ -452,22 +427,9 @@ fn strip_tag_count(s: &str) -> String {
trimmed.to_string()
}
/// Parse the chapter table on a manga detail page. Returns `Transient` if
/// `#chapter_table` isn't in the DOM at all — the table is required even
/// for mangas with no published chapters yet (the source renders an empty
/// `<table>`), so an absent table signals a partial render (post-load JS
/// not done, layout drift) rather than a legitimately empty list. Without
/// this sentinel, an empty `Vec` reaches `sync_manga_chapters` and the
/// soft-drop branch flips every existing chapter to `dropped_at`.
fn parse_chapter_list(doc: &scraper::Html) -> Result<Vec<SourceChapterRef>, PageError> {
if !has_chapter_table_sentinel(doc) {
return Err(PageError::transient(
"manga detail: #chapter_table sentinel missing",
));
}
fn parse_chapter_list(doc: &scraper::Html) -> Vec<SourceChapterRef> {
let sel = scraper::Selector::parse("#chapter_table td h4 a.chico").unwrap();
Ok(doc
.select(&sel)
doc.select(&sel)
.filter_map(|a| {
let url = a.value().attr("href")?.trim().to_string();
if url.is_empty() {
@@ -482,16 +444,7 @@ fn parse_chapter_list(doc: &scraper::Html) -> Result<Vec<SourceChapterRef>, Page
url,
})
})
.collect())
}
/// Returns true when the chapter-table container is present in the DOM.
/// Source-specific: the target site uses `#chapter_table` as the wrapper
/// element. Distinguishes "table is present but empty" (legit edge case
/// for new mangas) from "table is missing entirely" (partial render).
fn has_chapter_table_sentinel(doc: &scraper::Html) -> bool {
let sel = scraper::Selector::parse("#chapter_table").expect("valid selector");
doc.select(&sel).next().is_some()
.collect()
}
fn parse_chapter_number(text: &str) -> Option<i32> {
@@ -791,7 +744,7 @@ mod tests {
"../../../tests/fixtures/target/chapter_list_uu.html"
);
let doc = scraper::Html::parse_document(html);
let chapters = parse_chapter_list(&doc).expect("fixture has the table");
let chapters = parse_chapter_list(&doc);
assert_eq!(chapters.len(), 15, "every row kept (notices/hiatus included)");
@@ -842,6 +795,29 @@ mod tests {
assert_eq!(parse_chapter_number("Special"), None);
}
#[test]
fn parse_last_page_picks_highest_pagination_link() {
    // Non-numeric labels (Prev / Next) must be ignored, and the largest
    // numeric label wins even though it is not the last anchor.
    let doc = scraper::Html::parse_document(
        r#"
<div id="left_side"><div class="pagination">
<a href="/list/1/">Prev</a>
<ol>
<li><a href="/list/1/">1</a></li>
<li><a href="/list/2/">2</a></li>
<li><a href="/list/47/">47</a></li>
<li><a href="/list/2/">Next</a></li>
</ol>
</div></div>
"#,
    );
    let last = parse_last_page(&doc);
    assert_eq!(last, Some(47));
}
#[test]
fn parse_last_page_none_when_no_pagination() {
    // A document without any pagination markup yields no page number.
    let empty_doc = scraper::Html::parse_document("<html></html>");
    assert_eq!(parse_last_page(&empty_doc), None);
}
#[test]
fn page_url_substitutes_numeric_path_segment() {
assert_eq!(
@@ -915,17 +891,9 @@ mod tests {
#[test]
fn missing_optional_fields_parse_to_none() {
// Minimal but well-formed detail page: title is required, every
// other field is optional, but the chapter table is structural —
// its absence is treated as Transient (a freshly added manga
// renders the table empty, not absent). See
// `parse_chapter_list_returns_transient_when_table_missing` for
// the negative case.
let html = r#"<html><body>\
<header><div id="logo">Target</div></header>\
<div class="w-title"><h1>Minimal</h1></div>\
<table id="chapter_table"></table>\
</body></html>"#;
<div class="w-title"><h1>Minimal</h1></div></body></html>"#;
let m = parse_manga_detail(html, "min", true).unwrap();
assert_eq!(m.title, "Minimal");
assert!(m.summary.is_none());
@@ -991,62 +959,35 @@ mod tests {
}
#[test]
fn parse_chapter_list_returns_transient_when_table_missing() {
// Partial render (post-load JS hadn't injected the table, layout
// drift, etc). Returning Vec::new() would silently soft-drop every
// existing chapter for the manga via sync_manga_chapters; Transient
// is the signal the job system retries on.
let html = r#"<html><body>
<header><div id="logo">Target</div></header>
<div class="w-title"><h1>Test</h1></div>
</body></html>"#;
let doc = scraper::Html::parse_document(html);
let err = parse_chapter_list(&doc).expect_err("expected Transient");
assert!(err.is_transient(), "got non-transient: {err}");
fn build_page_order_backfill_is_last_to_one() {
// Backfill walks pages oldest-first: queue is [last, last-1, ..., 1]
// so popping from the front yields the last page first.
let order = build_page_order(Some(3), true);
assert_eq!(Vec::from(order), vec![3, 2, 1]);
}
#[test]
fn parse_chapter_list_ok_empty_when_table_present_but_no_rows() {
// A freshly-added manga with no chapters yet — the source renders
// the `<table id="chapter_table">` wrapper but no `<tr>` rows
// inside. Must stay distinguishable from a missing-table render.
let html = r#"<html><body>
<header><div id="logo">Target</div></header>
<table id="chapter_table"></table>
</body></html>"#;
let doc = scraper::Html::parse_document(html);
let chapters = parse_chapter_list(&doc).expect("present table is not transient");
assert!(chapters.is_empty());
fn build_page_order_incremental_is_one_to_last() {
// Incremental walks newest-first in natural source order.
let order = build_page_order(Some(3), false);
assert_eq!(Vec::from(order), vec![1, 2, 3]);
}
#[test]
fn parse_manga_detail_propagates_chapter_table_transient() {
// End-to-end: a detail page that survives the #logo sentinel but
// has the chapter table stripped must fail Transient at the parser
// boundary, not return a SourceManga with empty chapters.
let html = r#"<html><body>
<header><div id="logo">Target</div></header>
<div class="w-title"><h1>Test Title</h1></div>
<div class="cover"><img src="/cover.jpg"></div>
<!-- intentionally no #chapter_table -->
</body></html>"#;
let err = parse_manga_detail(html, "key", true).expect_err("expected Transient");
assert!(err.is_transient(), "got non-transient: {err}");
fn build_page_order_falls_back_to_page_one_only_without_pagination() {
let backfill = build_page_order(None, true);
assert_eq!(Vec::from(backfill), vec![1]);
let incremental = build_page_order(None, false);
assert_eq!(Vec::from(incremental), vec![1]);
}
#[test]
fn parse_manga_detail_skips_chapter_sentinel_when_include_chapters_false() {
// Metadata-only mode (`skip_chapters` upstream) must not require
// the chapter table — pipeline.rs avoids calling sync_manga_chapters
// for these mangas, so the absent table is not a correctness issue
// and shouldn't surface as Transient.
let html = r#"<html><body>
<header><div id="logo">Target</div></header>
<div class="w-title"><h1>Test Title</h1></div>
<div class="cover"><img src="/cover.jpg"></div>
</body></html>"#;
let manga = parse_manga_detail(html, "key", false)
.expect("metadata-only parse must not require chapter table");
assert!(manga.chapters.is_empty());
fn build_page_order_single_page_index_yields_one_entry() {
// Sources with exactly one page should not yield duplicates
// regardless of mode.
let backfill = build_page_order(Some(1), true);
assert_eq!(Vec::from(backfill), vec![1]);
let incremental = build_page_order(Some(1), false);
assert_eq!(Vec::from(incremental), vec![1]);
}
}

View File

@@ -1,355 +0,0 @@
//! Live, in-process crawler status.
//!
//! The metadata pass runs inline in the cron tick (it is not a
//! `crawler_jobs` row), so without this surface "what is the crawler doing
//! right now" is unanswerable from the dashboard. The daemon publishes its
//! current [`Phase`], the chapters being crawled right now (with a live
//! page count), and the cover being fetched into a shared [`StatusHandle`];
//! the admin endpoint reads a [`CrawlerStatus`] snapshot and composes it
//! with DB-derived counts + the session/browser flags.
//!
//! NOTE: this is per-process state. The deployment is a single server
//! (see CLAUDE.md), so an in-memory handle is sufficient; durable signals
//! (last-pass summary, runtime session) are persisted in `crawler_state`.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use chrono::{DateTime, Utc};
use serde::Serialize;
use tokio::sync::{watch, RwLock};
use uuid::Uuid;
use crate::crawler::pipeline::MetadataStats;
/// What the daemon's metadata pass is doing right now. Serialised with an
/// internal `state` tag so the frontend can switch on it.
///
/// With `rename_all = "snake_case"` the tag values are `idle`,
/// `walking_list`, `fetching_metadata` and `cover_backfill`; a variant's
/// fields are serialised next to the `state` key in the same object.
#[derive(Clone, Debug, Serialize)]
#[serde(tag = "state", rename_all = "snake_case")]
pub enum Phase {
/// Sleeping until the next scheduled metadata pass. `next_fire` is
/// `None` when no fire time has been published (this is the value a
/// fresh `StatusHandle` starts with).
Idle { next_fire: Option<DateTime<Utc>> },
/// Walking the source catalog list pages.
WalkingList,
/// Fetching one manga's metadata. `index`/`total` drive a progress bar
/// (`total` is `None` when the source size is unknown / uncapped).
FetchingMetadata {
index: usize,
total: Option<usize>,
title: String,
},
/// Backfilling covers that failed on first attempt. `index`/`total`
/// track progress through this tick's batch.
CoverBackfill { index: usize, total: usize },
}
/// A chapter being downloaded right now, with a live page count. Keyed in
/// the status by `chapter_id`; inserted by the dispatcher when a job starts
/// and removed (via an RAII guard) when it finishes, panics, or times out.
#[derive(Clone, Debug, Serialize)]
pub struct ActiveChapter {
pub manga_id: Uuid,
pub manga_title: String,
/// Key of this entry in the live map; also the tie-breaker when a
/// snapshot sorts chapters that share a `chapter_number`.
pub chapter_id: Uuid,
/// Primary sort key of `CrawlerStatus::active_chapters`.
pub chapter_number: i32,
/// Live progress counter, overwritten by `StatusHandle::set_chapter_pages`.
pub pages_done: usize,
/// `None` until the chapter page list has been parsed.
pub pages_total: Option<usize>,
}
/// The manga whose cover is being downloaded right now. Published through
/// `StatusHandle::set_current_cover` and surfaced as
/// `CrawlerStatus::current_cover`.
#[derive(Clone, Debug, Serialize)]
pub struct CoverTarget {
pub manga_id: Uuid,
pub manga_title: String,
}
/// Summary of the most recent metadata pass (persisted across restarts in
/// `crawler_state` by the cron; mirrored here for the live read).
///
/// The derived `Default` (no timestamp, all counters zero) is what a fresh
/// `StatusHandle` reports until a pass is recorded or seeded.
#[derive(Clone, Debug, Serialize, Default)]
pub struct LastPass {
/// When the pass was recorded; `None` if no pass has been recorded yet.
pub at: Option<DateTime<Utc>>,
// The counters below are copied field-for-field from `MetadataStats`
// by `StatusHandle::record_pass`.
pub discovered: usize,
pub upserted: usize,
pub covers_fetched: usize,
pub mangas_failed: usize,
}
/// A point-in-time snapshot returned by [`StatusHandle::snapshot`]. The
/// session/browser/queue fields are composed at read time by the endpoint
/// (they live elsewhere), so they are not stored here.
#[derive(Clone, Debug, Serialize)]
pub struct CrawlerStatus {
/// Current phase of the metadata pass.
pub phase: Phase,
/// Number of configured chapter workers (for "N busy / M workers").
pub worker_count: usize,
/// Chapters being downloaded right now, with live page counts. Sorted
/// by `chapter_number`, then `chapter_id`, so the order is stable.
pub active_chapters: Vec<ActiveChapter>,
/// Summary of the most recently recorded metadata pass.
pub last_pass: LastPass,
/// The cover being downloaded right now, if any.
pub current_cover: Option<CoverTarget>,
}
/// Scalar status state held under the async `RwLock`. Active chapters live
/// in a separate sync map so per-page updates and RAII removal don't need
/// to `.await` (removal happens in `Drop`).
///
/// `Clone` lets `snapshot` copy the whole struct out and release the read
/// lock before composing the result.
#[derive(Clone, Debug)]
struct Scalar {
phase: Phase,
// Fixed at construction; `StatusHandle::new` clamps it to at least 1.
worker_count: usize,
last_pass: LastPass,
current_cover: Option<CoverTarget>,
}
/// Cloneable handle the daemon tasks use to publish status. Cheap to clone
/// (`Arc`). All writers funnel through the helper methods so locking stays
/// localised. Every mutation bumps a `watch` version so SSE subscribers
/// get pushed an update instead of polling.
#[derive(Clone)]
pub struct StatusHandle {
/// Phase, worker count, last-pass summary and current cover, guarded by
/// an async lock (writers `.await` it).
scalar: Arc<RwLock<Scalar>>,
/// Currently-downloading chapters keyed by `chapter_id`. A sync mutex so
/// the RAII [`ChapterGuard`]'s `Drop` can remove without `.await`.
active: Arc<Mutex<HashMap<Uuid, ActiveChapter>>>,
/// Monotonic version bumped on every change. SSE handlers `subscribe()`
/// and `await .changed()` for instant pushes; `watch` has no
/// lost-wakeup so a change between snapshots is never missed.
version: Arc<watch::Sender<u64>>,
}
/// Acquire the active-chapter map, tolerating a poisoned mutex. The lock
/// is never held across a panic-prone section, so the data behind a
/// poisoned lock is still consistent and safe to keep using.
fn lock_active(
    m: &Mutex<HashMap<Uuid, ActiveChapter>>,
) -> std::sync::MutexGuard<'_, HashMap<Uuid, ActiveChapter>> {
    match m.lock() {
        Ok(guard) => guard,
        // Poisoned: take the guard anyway rather than propagating the panic.
        Err(poisoned) => poisoned.into_inner(),
    }
}
impl StatusHandle {
    /// Create a handle in the `Idle` phase with no active chapters, an
    /// empty last-pass summary and no current cover.
    ///
    /// `num_workers` is clamped to at least 1 so the reported worker count
    /// is never zero.
    pub fn new(num_workers: usize) -> Self {
        // The initial receiver is dropped on purpose; subscribers obtain
        // their own through `subscribe()`.
        let (version, _rx) = watch::channel(0u64);
        Self {
            scalar: Arc::new(RwLock::new(Scalar {
                phase: Phase::Idle { next_fire: None },
                worker_count: num_workers.max(1),
                last_pass: LastPass::default(),
                current_cover: None,
            })),
            active: Arc::new(Mutex::new(HashMap::new())),
            version: Arc::new(version),
        }
    }

    /// Advance the version counter, waking every subscriber. Wrapping add
    /// so the counter can never overflow-panic.
    fn bump(&self) {
        self.version.send_modify(|v| *v = v.wrapping_add(1));
    }

    /// A receiver whose `.changed()` resolves on the next status change.
    pub fn subscribe(&self) -> watch::Receiver<u64> {
        self.version.subscribe()
    }

    /// Signal a change without mutating in-memory state — used when an
    /// *external* signal the live snapshot reflects (browser phase,
    /// session-expired flag, queue counts) has changed, so subscribers
    /// recompose promptly.
    pub fn poke(&self) {
        self.bump();
    }

    /// Replace the current phase and notify subscribers.
    pub async fn set_phase(&self, phase: Phase) {
        self.scalar.write().await.phase = phase;
        self.bump();
    }

    /// Set (or clear) the cover being downloaded right now.
    pub async fn set_current_cover(&self, cover: Option<CoverTarget>) {
        self.scalar.write().await.current_cover = cover;
        self.bump();
    }

    /// Register a chapter as crawling now; returns a guard that removes it
    /// when dropped (on completion, panic-unwind, or timeout-drop).
    ///
    /// An existing entry with the same `chapter_id` is overwritten.
    pub fn begin_chapter(&self, chapter: ActiveChapter) -> ChapterGuard {
        let id = chapter.chapter_id;
        lock_active(&self.active).insert(id, chapter);
        self.bump();
        ChapterGuard {
            active: Arc::clone(&self.active),
            version: Arc::clone(&self.version),
            chapter_id: id,
        }
    }

    /// Update the live page count of an in-flight chapter. Sync (no
    /// `.await`) so it's cheap to call once per stored page.
    ///
    /// Subscribers are notified only when the stored values actually
    /// change. An unknown `chapter_id`, or a call repeating the current
    /// values, is a silent no-op and does not wake them.
    pub fn set_chapter_pages(&self, chapter_id: Uuid, done: usize, total: Option<usize>) {
        // Scope the guard so the mutex is released before notifying.
        let changed = {
            let mut map = lock_active(&self.active);
            match map.get_mut(&chapter_id) {
                Some(c) if c.pages_done != done || c.pages_total != total => {
                    c.pages_done = done;
                    c.pages_total = total;
                    true
                }
                _ => false,
            }
        };
        if changed {
            self.bump();
        }
    }

    /// Record a finished metadata pass, copying the counters from `stats`
    /// and stamping the summary with the caller-supplied `at`.
    pub async fn record_pass(&self, stats: &MetadataStats, at: DateTime<Utc>) {
        self.scalar.write().await.last_pass = LastPass {
            at: Some(at),
            discovered: stats.discovered,
            upserted: stats.upserted,
            covers_fetched: stats.covers_fetched,
            mangas_failed: stats.mangas_failed,
        };
        self.bump();
    }

    /// Seed the last-pass summary from a persisted `crawler_state` value on
    /// startup so the dashboard isn't blank until the first tick.
    pub async fn set_last_pass(&self, last: LastPass) {
        self.scalar.write().await.last_pass = last;
        self.bump();
    }

    /// Compose a point-in-time [`CrawlerStatus`].
    ///
    /// The scalar state and the active-chapter map are read under separate
    /// locks, one after the other, so the two halves are each consistent
    /// but not captured atomically together.
    pub async fn snapshot(&self) -> CrawlerStatus {
        // Clone out so the async read lock is released immediately.
        let scalar = self.scalar.read().await.clone();
        let mut active_chapters: Vec<ActiveChapter> =
            lock_active(&self.active).values().cloned().collect();
        // Stable, readable order: by chapter number then id.
        active_chapters.sort_by(|a, b| {
            a.chapter_number
                .cmp(&b.chapter_number)
                .then(a.chapter_id.cmp(&b.chapter_id))
        });
        CrawlerStatus {
            phase: scalar.phase,
            worker_count: scalar.worker_count,
            active_chapters,
            last_pass: scalar.last_pass,
            current_cover: scalar.current_cover,
        }
    }
}
/// RAII handle removing an [`ActiveChapter`] from the live status when the
/// chapter dispatch finishes, panics, or is dropped on timeout.
pub struct ChapterGuard {
// Shared handle to the same map the owning `StatusHandle` uses.
active: Arc<Mutex<HashMap<Uuid, ActiveChapter>>>,
// Shared version sender, so `Drop` can notify subscribers of the removal.
version: Arc<watch::Sender<u64>>,
// Key of the entry this guard removes on drop.
chapter_id: Uuid,
}
impl Drop for ChapterGuard {
    /// Remove this guard's chapter from the live map, then bump the
    /// version so subscribers observe the removal.
    fn drop(&mut self) {
        {
            let mut live = lock_active(&self.active);
            live.remove(&self.chapter_id);
        } // mutex released here, before subscribers are notified
        self.version
            .send_modify(|counter| *counter = counter.wrapping_add(1));
    }
}
// Unit tests for `StatusHandle`: guard lifecycle, page counters, snapshot
// ordering, cover/last-pass round trips, and watch notifications.
#[cfg(test)]
mod tests {
use super::*;
/// Build an `ActiveChapter` with fresh random ids, chapter number `n`,
/// and no page progress yet.
fn sample_chapter(n: i32) -> ActiveChapter {
ActiveChapter {
manga_id: Uuid::new_v4(),
manga_title: "M".into(),
chapter_id: Uuid::new_v4(),
chapter_number: n,
pages_done: 0,
pages_total: None,
}
}
#[tokio::test]
async fn begin_chapter_shows_in_snapshot_and_guard_removes_on_drop() {
let h = StatusHandle::new(2);
let chap = sample_chapter(7);
let cid = chap.chapter_id;
{
// The guard lives only for this scope; the entry must be visible
// while it is alive.
let _guard = h.begin_chapter(chap);
let snap = h.snapshot().await;
assert_eq!(snap.active_chapters.len(), 1);
assert_eq!(snap.active_chapters[0].chapter_id, cid);
assert_eq!(snap.worker_count, 2);
}
// Guard dropped → entry removed.
let snap = h.snapshot().await;
assert!(snap.active_chapters.is_empty());
}
#[tokio::test]
async fn set_chapter_pages_updates_live_count() {
let h = StatusHandle::new(1);
let chap = sample_chapter(1);
let cid = chap.chapter_id;
let _guard = h.begin_chapter(chap);
h.set_chapter_pages(cid, 3, Some(20));
let snap = h.snapshot().await;
assert_eq!(snap.active_chapters[0].pages_done, 3);
assert_eq!(snap.active_chapters[0].pages_total, Some(20));
// Updating an unknown chapter is a no-op, not a panic.
h.set_chapter_pages(Uuid::new_v4(), 9, Some(9));
}
#[tokio::test]
async fn snapshot_sorts_active_chapters_by_number() {
let h = StatusHandle::new(2);
// Inserted out of order on purpose; the snapshot must sort them.
let _g1 = h.begin_chapter(sample_chapter(5));
let _g2 = h.begin_chapter(sample_chapter(2));
let snap = h.snapshot().await;
assert_eq!(snap.active_chapters[0].chapter_number, 2);
assert_eq!(snap.active_chapters[1].chapter_number, 5);
}
#[tokio::test]
async fn set_current_cover_round_trips() {
let h = StatusHandle::new(1);
let mid = Uuid::new_v4();
h.set_current_cover(Some(CoverTarget {
manga_id: mid,
manga_title: "One Piece".into(),
}))
.await;
assert_eq!(
h.snapshot().await.current_cover.map(|c| c.manga_id),
Some(mid)
);
// Clearing must be reflected in the next snapshot.
h.set_current_cover(None).await;
assert!(h.snapshot().await.current_cover.is_none());
}
#[tokio::test]
async fn record_pass_captures_stats_and_timestamp() {
let h = StatusHandle::new(1);
let stats = MetadataStats {
discovered: 5,
upserted: 3,
covers_fetched: 2,
mangas_failed: 1,
};
let at = Utc::now();
h.record_pass(&stats, at).await;
let snap = h.snapshot().await;
assert_eq!(snap.last_pass.discovered, 5);
assert_eq!(snap.last_pass.upserted, 3);
// The stored timestamp is exactly the one passed in.
assert_eq!(snap.last_pass.at, Some(at));
}
#[tokio::test]
async fn subscribe_resolves_on_mutation_poke_and_chapter_change() {
let h = StatusHandle::new(1);
let mut rx = h.subscribe();
// Each step below performs exactly one version bump and then consumes
// it with `changed()`; a missing bump would leave the test hanging.
h.set_phase(Phase::WalkingList).await;
rx.changed().await.unwrap();
h.poke();
rx.changed().await.unwrap();
// begin_chapter + guard drop each bump the version.
let g = h.begin_chapter(sample_chapter(1));
rx.changed().await.unwrap();
drop(g);
rx.changed().await.unwrap();
}
}

View File

@@ -1,446 +0,0 @@
//! TOR control-port client for `SIGNAL NEWNYM` ("recircuit").
//!
//! The crawler can be proxied through TOR (`CRAWLER_PROXY=socks5h://tor:9050`)
//! to randomize the exit IP seen by the target site. When the target
//! returns a "bad page" (its broken-template body, missing layout
//! sentinel, or unauthenticated probe despite a valid PHPSESSID), it
//! is often the current exit being rate-limited or fingerprinted rather
//! than a real failure. Asking the local TOR daemon for a new identity
//! over its control port (port 9051 by default) makes subsequent
//! connections draw a fresh circuit; combined with `IsolateDestAddr`
//! in torrc this is usually enough to clear the failure.
//!
//! Scope is deliberately tiny — `AUTHENTICATE` + `SIGNAL NEWNYM` over
//! a one-shot TCP connection. No `torut` dep, no hidden-service
//! plumbing, no event streaming.
//!
//! **Caveat for in-flight connections:** Chromium reuses sockets, so a
//! `NEWNYM` only affects *new* connections (in TOR terms, new circuits).
//! That's fine for our retry path — the next navigation opens a fresh
//! connection. We do not try to forcibly close existing streams.
use std::path::{Path, PathBuf};
use std::time::Duration;
use anyhow::{anyhow, bail, Context};
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
use tokio::net::TcpStream;
use tokio::time::timeout;
/// Default control-port (`tor --defaults-torrc` ships 9051). Appended by
/// `parse_control_url` when the configured address carries no port.
const DEFAULT_CONTROL_PORT: u16 = 9051;
/// Connect timeout — generous enough for a slow compose start, short
/// enough that a misconfigured controller doesn't stall a crawl.
const CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
/// Per-command read timeout. `SIGNAL NEWNYM` returns instantly on the
/// happy path; bound it so a half-broken control port can't hang us.
/// Applied separately to the `AUTHENTICATE` reply and the `SIGNAL NEWNYM`
/// reply.
const READ_TIMEOUT: Duration = Duration::from_secs(5);
/// How the controller authenticates to the control port.
///
/// `Cookie` is preferred for compose deploys where the auth cookie file
/// is shared between the `tor` and `backend` containers via a named
/// volume. `Password` is the fallback when the cookie file isn't
/// reachable (different gid, no shared volume, etc.). `None` matches a
/// torrc with no `CookieAuthentication 1` and no `HashedControlPassword`
/// — useful for local experimentation, not for production.
///
/// `Debug` is implemented manually to redact the password (and the
/// cookie path, which is non-sensitive but uninteresting in logs).
/// Don't add `#[derive(Debug)]` — the controller is `?`-logged at
/// startup and a derive would expand the password into the trace.
#[derive(Clone)]
pub enum TorAuth {
/// Sends a bare `AUTHENTICATE` with no credential.
None,
/// Sends `AUTHENTICATE "<password>"`, with `\` and `"` backslash-escaped.
Password(String),
/// Path to the cookie file; its bytes are read on every call and sent
/// hex-encoded.
Cookie(PathBuf),
}
impl std::fmt::Debug for TorAuth {
    /// Redacting formatter: prints only which variant is in use, never the
    /// password or the cookie path it carries.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            TorAuth::None => "None",
            TorAuth::Password(_) => "Password(<redacted>)",
            TorAuth::Cookie(_) => "Cookie(<path>)",
        };
        f.write_str(label)
    }
}
/// Client for the TOR control port. Holds only the address and the
/// authentication method; every `new_identity` call opens its own
/// connection.
#[derive(Debug, Clone)]
pub struct TorController {
/// `host:port` string. Kept as a string (not a `SocketAddr`) so
/// docker-compose hostnames like `tor:9051` resolve at connect time.
addr: String,
/// Credential used to build the `AUTHENTICATE` line.
auth: TorAuth,
}
impl TorController {
/// Build a controller from an already-formed `host:port` address and an
/// auth method. No parsing or validation is performed on `addr`.
pub fn new(addr: impl Into<String>, auth: TorAuth) -> Self {
Self { addr: addr.into(), auth }
}
/// Build a controller from the env-config shape:
/// `url` (e.g. `tcp://tor:9051`, `127.0.0.1:9051`, or `tor`),
/// optional password, optional cookie path. Returns `Ok(None)` when
/// `url` is absent — that's the "TOR feature disabled" signal.
/// Cookie wins over password when both are set (rotates with TOR;
/// no secret to manage).
///
/// Returns `Err` when `url` is present but `parse_control_url` rejects it.
pub fn from_parts(
url: Option<&str>,
password: Option<&str>,
cookie_path: Option<&Path>,
) -> anyhow::Result<Option<Self>> {
let Some(url) = url else { return Ok(None) };
let addr = parse_control_url(url)?;
// Match order encodes the precedence: cookie first, then password.
let auth = match (cookie_path, password) {
(Some(p), _) => TorAuth::Cookie(p.to_path_buf()),
(None, Some(p)) => TorAuth::Password(p.to_string()),
(None, None) => TorAuth::None,
};
Ok(Some(Self { addr, auth }))
}
/// Open the control port, `AUTHENTICATE`, `SIGNAL NEWNYM`, `QUIT`.
/// Each invocation is a fresh connection; the controller is cheap
/// to clone and stateless across calls.
///
/// Errors if the connection fails or times out, if the cookie file
/// cannot be read, or if either reply is not `250` or does not arrive
/// within `READ_TIMEOUT`.
pub async fn new_identity(&self) -> anyhow::Result<()> {
// Two layers of context: the first `?` handles the timeout elapsing,
// the second handles the connect I/O error itself.
let stream = timeout(CONNECT_TIMEOUT, TcpStream::connect(&self.addr))
.await
.with_context(|| {
format!("timed out connecting to TOR control port {}", self.addr)
})?
.with_context(|| format!("connect to TOR control port {}", self.addr))?;
let (read, mut write) = stream.into_split();
let mut read = BufReader::new(read);
let auth_line = self.build_auth_line().await?;
write_line(&mut write, &auth_line).await?;
timeout(READ_TIMEOUT, expect_250(&mut read))
.await
.map_err(|_| anyhow!("TOR control AUTHENTICATE timed out"))?
.context("AUTHENTICATE")?;
write_line(&mut write, "SIGNAL NEWNYM").await?;
timeout(READ_TIMEOUT, expect_250(&mut read))
.await
.map_err(|_| anyhow!("TOR control SIGNAL NEWNYM timed out"))?
.context("SIGNAL NEWNYM")?;
// QUIT is courtesy; ignore errors — the daemon may close the
// socket before our QUIT lands and that's perfectly fine.
let _ = write_line(&mut write, "QUIT").await;
// Debug-level: a busy crawl can rotate circuits many times per
// minute, INFO is too chatty. Failures still log at WARN.
tracing::debug!(addr = %self.addr, "TOR NEWNYM signaled");
Ok(())
}
/// Build the `AUTHENTICATE` command line (without the trailing CRLF)
/// for the configured auth method. The cookie file is read on every
/// call, so a rotated cookie is picked up by the next connection.
async fn build_auth_line(&self) -> anyhow::Result<String> {
match &self.auth {
TorAuth::None => Ok("AUTHENTICATE".to_string()),
TorAuth::Password(p) => Ok(format!("AUTHENTICATE \"{}\"", escape_quoted(p))),
TorAuth::Cookie(path) => {
let bytes = tokio::fs::read(path)
.await
.with_context(|| format!("read TOR cookie file {}", path.display()))?;
// Cookie bytes go on the wire as an unquoted hex string.
Ok(format!("AUTHENTICATE {}", hex_encode(&bytes)))
}
}
}
}
/// Parse `tcp://host:port`, `host:port`, or bare `host` into a
/// connect-time `host:port` string. Default port is
/// [`DEFAULT_CONTROL_PORT`].
///
/// IPv6 literals must be bracketed (`[::1]` or `[::1]:9051`). A bracketed
/// literal without a port gets the default port appended; the colons
/// inside the brackets are not mistaken for a port separator. An
/// unbracketed IPv6 literal is passed through unchanged because it cannot
/// be told apart from `host:port`.
///
/// Returns an error when nothing is left after stripping `tcp://`.
fn parse_control_url(url: &str) -> anyhow::Result<String> {
    let stripped = url.strip_prefix("tcp://").unwrap_or(url);
    if stripped.is_empty() {
        bail!("TOR control url is empty");
    }
    let has_port = match stripped.strip_prefix('[') {
        // Bracketed IPv6: a port can only follow the closing bracket.
        Some(rest) => rest.contains("]:"),
        None => stripped.contains(':'),
    };
    if has_port {
        Ok(stripped.to_string())
    } else {
        Ok(format!("{stripped}:{DEFAULT_CONTROL_PORT}"))
    }
}
/// Escape `s` for use inside a double-quoted control-port string: every
/// backslash and every double quote is prefixed with a backslash. All
/// other characters are copied through unchanged.
fn escape_quoted(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        if matches!(ch, '\\' | '"') {
            out.push('\\');
        }
        out.push(ch);
    }
    out
}
/// Render `bytes` as a lowercase hexadecimal string, two digits per byte
/// (so the output is exactly `2 * bytes.len()` characters long).
fn hex_encode(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    let mut out = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        // High nibble first, then low nibble.
        out.push(DIGITS[usize::from(byte >> 4)] as char);
        out.push(DIGITS[usize::from(byte & 0x0f)] as char);
    }
    out
}
/// Send one control-port command: the bytes of `line`, then the CRLF
/// terminator, then a flush so the command is pushed out immediately.
/// Any I/O error from the writer is returned to the caller.
async fn write_line<W: tokio::io::AsyncWrite + Unpin>(
    w: &mut W,
    line: &str,
) -> anyhow::Result<()> {
    for chunk in [line.as_bytes(), b"\r\n".as_slice()] {
        w.write_all(chunk).await?;
    }
    w.flush().await?;
    Ok(())
}
/// Drain a TOR control reply, accepting only status `250`. Handles
/// the protocol's three line forms: `XYZ ...` (single/end), `XYZ-...`
/// (continuation), `XYZ+...` (data block ended by a lone `.`). Our
/// commands only ever produce single-line `250 OK`, but we honor the
/// continuation forms so a future torrc that adds events / banners
/// doesn't confuse the parser.
///
/// Returns `Ok(())` once the end line (`250 ` followed by text) has been
/// read. Returns an error when the peer closes the connection mid-reply,
/// when a line is shorter than four bytes, when the status is not `250`,
/// or when the separator is none of ` `, `-`, `+`.
///
/// The line is inspected with `starts_with` and byte indexing rather than
/// `str::split_at(3)`: a garbage reply whose third byte falls inside a
/// multi-byte UTF-8 character would make `split_at` panic, whereas this
/// reports it as an ordinary protocol error.
async fn expect_250<R: AsyncBufReadExt + Unpin>(r: &mut R) -> anyhow::Result<()> {
    loop {
        let mut line = String::new();
        let n = r.read_line(&mut line).await?;
        if n == 0 {
            bail!("TOR control port closed connection mid-reply");
        }
        let trimmed = line.trim_end_matches(['\r', '\n']);
        // Need three status digits plus one separator byte.
        if trimmed.len() < 4 {
            bail!("malformed TOR control reply: {trimmed:?}");
        }
        if !trimmed.starts_with("250") {
            bail!("TOR control replied {trimmed:?}");
        }
        // Safe: length was checked above, and bytes 0..3 are ASCII "250".
        match trimmed.as_bytes()[3] {
            // End line: the reply is complete.
            b' ' => return Ok(()),
            // Continuation line: more reply lines follow.
            b'-' => continue,
            b'+' => {
                // Data block — read until a line consisting of only ".".
                // The outer loop then continues with the rest of the reply.
                loop {
                    let mut data = String::new();
                    let n = r.read_line(&mut data).await?;
                    if n == 0 {
                        bail!("TOR control port closed mid-data-block");
                    }
                    if data.trim_end_matches(['\r', '\n']) == "." {
                        break;
                    }
                }
            }
            _ => bail!("malformed TOR control reply separator: {trimmed:?}"),
        }
    }
}
#[cfg(test)]
mod tests {
// Unit tests for the TOR control-port client. The async tests talk to
// an in-process TCP listener bound to an ephemeral 127.0.0.1 port, so
// no real TOR daemon is needed; the sync tests cover the pure helpers.
use super::*;
use std::sync::{Arc, Mutex};
use tokio::io::AsyncWriteExt;
use tokio::net::TcpListener;
/// Spawn a mock control port that responds to each \r\n-terminated
/// inbound line with the next entry from `replies`. Each reply has
/// its own `\r\n` appended. Records received lines into `recorder`.
/// After `replies.len()` exchanges the task drops the socket — this
/// matches the real TOR behavior for QUIT (close after acking).
///
/// Returns the `host:port` the listener is bound to. Only a single
/// connection is ever accepted.
async fn spawn_mock(
replies: Vec<&'static str>,
recorder: Arc<Mutex<Vec<String>>>,
) -> String {
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
let addr = listener.local_addr().unwrap().to_string();
tokio::spawn(async move {
let (sock, _) = listener.accept().await.unwrap();
let (r, mut w) = sock.into_split();
let mut r = BufReader::new(r);
for reply in replies {
let mut line = String::new();
// A read error is treated like EOF: stop serving.
let n = r.read_line(&mut line).await.unwrap_or(0);
if n == 0 {
return;
}
recorder
.lock()
.unwrap()
.push(line.trim_end_matches(['\r', '\n']).to_string());
w.write_all(reply.as_bytes()).await.unwrap();
w.write_all(b"\r\n").await.unwrap();
w.flush().await.unwrap();
}
});
addr
}
// Password auth: the first line on the wire is the quoted
// AUTHENTICATE command, the second is SIGNAL NEWNYM.
#[tokio::test]
async fn password_auth_then_newnym_writes_expected_sequence() {
let recorder = Arc::new(Mutex::new(Vec::new()));
// Two replies: AUTHENTICATE then SIGNAL NEWNYM. QUIT is
// fire-and-forget; the mock dropping the socket is the
// expected real-world behavior.
let addr =
spawn_mock(vec!["250 OK", "250 OK"], Arc::clone(&recorder)).await;
let controller = TorController::new(addr, TorAuth::Password("secret".into()));
controller.new_identity().await.expect("new_identity ok");
let recorded = recorder.lock().unwrap().clone();
assert_eq!(recorded.first().map(String::as_str), Some("AUTHENTICATE \"secret\""));
assert_eq!(recorded.get(1).map(String::as_str), Some("SIGNAL NEWNYM"));
}
// Cookie auth: the cookie file's raw bytes are sent hex-encoded and
// unquoted.
#[tokio::test]
async fn cookie_auth_hex_encodes_file_bytes() {
let tmp = tempfile::NamedTempFile::new().unwrap();
let cookie: Vec<u8> = (0u8..32).collect();
std::fs::write(tmp.path(), &cookie).unwrap();
let recorder = Arc::new(Mutex::new(Vec::new()));
let addr =
spawn_mock(vec!["250 OK", "250 OK"], Arc::clone(&recorder)).await;
let controller =
TorController::new(addr, TorAuth::Cookie(tmp.path().to_path_buf()));
controller.new_identity().await.expect("new_identity ok");
let recorded = recorder.lock().unwrap().clone();
let expected_hex: String = cookie.iter().map(|b| format!("{b:02x}")).collect();
assert_eq!(
recorded.first().map(String::as_str),
Some(format!("AUTHENTICATE {expected_hex}").as_str())
);
}
// No auth configured: AUTHENTICATE is still sent, with no argument.
#[tokio::test]
async fn no_auth_sends_bare_authenticate() {
let recorder = Arc::new(Mutex::new(Vec::new()));
let addr =
spawn_mock(vec!["250 OK", "250 OK"], Arc::clone(&recorder)).await;
let controller = TorController::new(addr, TorAuth::None);
controller.new_identity().await.expect("new_identity ok");
let recorded = recorder.lock().unwrap().clone();
assert_eq!(recorded.first().map(String::as_str), Some("AUTHENTICATE"));
}
// A non-250 status must surface as an error that carries the reply
// text, so the operator can see why TOR refused.
#[tokio::test]
async fn non_250_reply_returns_err_with_reply_text() {
let recorder = Arc::new(Mutex::new(Vec::new()));
let addr = spawn_mock(
vec!["515 Bad authentication"],
Arc::clone(&recorder),
)
.await;
let controller =
TorController::new(addr, TorAuth::Password("wrong".into()));
let err = controller.new_identity().await.expect_err("should fail");
let msg = format!("{err:#}");
assert!(msg.contains("515"), "expected 515 in error, got: {msg}");
}
#[tokio::test]
async fn closed_connection_mid_reply_is_an_error() {
// Listener accepts the AUTH line then drops without replying —
// this exercises the EOF-mid-reply path in expect_250 (rather
// than tor's own error replies which are covered elsewhere).
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
let addr = listener.local_addr().unwrap().to_string();
tokio::spawn(async move {
if let Ok((sock, _)) = listener.accept().await {
let (r, _w) = sock.into_split();
let mut r = BufReader::new(r);
let mut line = String::new();
let _ = r.read_line(&mut line).await; // read AUTH, ignore
// Drop _w (and the read half via scope exit) so the
// peer sees an immediate EOF on the next read.
}
});
let controller = TorController::new(addr, TorAuth::None);
let err = controller.new_identity().await.expect_err("should fail");
let msg = format!("{err:#}");
assert!(
msg.contains("closed connection"),
"expected EOF-mid-reply error, got: {msg}"
);
}
#[tokio::test]
async fn multi_line_250_continuation_is_accepted() {
let recorder = Arc::new(Mutex::new(Vec::new()));
// AUTHENTICATE reply uses the `250-...\r\n250 OK\r\n` form.
// Single reply string contains the whole multi-line response.
let addr = spawn_mock(
vec!["250-banner=foo\r\n250 OK", "250 OK"],
Arc::clone(&recorder),
)
.await;
let controller = TorController::new(addr, TorAuth::None);
controller.new_identity().await.expect("new_identity ok");
}
// With no control URL configured, `from_parts` succeeds but builds
// no controller.
#[test]
fn from_parts_returns_none_when_url_unset() {
let c = TorController::from_parts(None, None, None).unwrap();
assert!(c.is_none());
}
// When both a password and a cookie path are supplied, the cookie wins.
#[test]
fn from_parts_prefers_cookie_over_password() {
let c = TorController::from_parts(
Some("tor:9051"),
Some("pw"),
Some(Path::new("/var/lib/tor/control_auth_cookie")),
)
.unwrap()
.expect("controller built");
assert!(matches!(c.auth, TorAuth::Cookie(_)));
}
#[test]
fn from_parts_falls_back_to_password_without_cookie() {
let c = TorController::from_parts(Some("tor:9051"), Some("pw"), None)
.unwrap()
.expect("controller built");
assert!(matches!(c.auth, TorAuth::Password(p) if p == "pw"));
}
#[test]
fn parse_control_url_accepts_tcp_scheme() {
assert_eq!(parse_control_url("tcp://127.0.0.1:9051").unwrap(), "127.0.0.1:9051");
}
#[test]
fn parse_control_url_defaults_port_when_omitted() {
assert_eq!(parse_control_url("tor").unwrap(), "tor:9051");
}
#[test]
fn parse_control_url_passes_through_host_port() {
assert_eq!(parse_control_url("tor:9999").unwrap(), "tor:9999");
}
// Both a completely empty string and a scheme with nothing after it
// are rejected.
#[test]
fn parse_control_url_rejects_empty() {
assert!(parse_control_url("").is_err());
assert!(parse_control_url("tcp://").is_err());
}
#[test]
fn escape_quoted_handles_quotes_and_backslashes() {
assert_eq!(escape_quoted(r#"a"b\c"#), r#"a\"b\\c"#);
}
#[test]
fn debug_format_redacts_password_and_cookie_path() {
// Regression: app.rs / bin/crawler.rs log the controller at
// startup via `tracing::info!(?t, ...)`. A derived Debug on
// TorAuth would expand TorAuth::Password(p) and leak the
// plaintext into logs.
let c = TorController::new("tor:9051", TorAuth::Password("super-secret".into()));
let dbg = format!("{c:?}");
assert!(!dbg.contains("super-secret"), "password leaked: {dbg}");
assert!(dbg.contains("<redacted>"), "expected <redacted>, got: {dbg}");
let c = TorController::new(
"tor:9051",
TorAuth::Cookie("/var/lib/tor/control_auth_cookie".into()),
);
let dbg = format!("{c:?}");
assert!(!dbg.contains("control_auth_cookie"), "cookie path leaked: {dbg}");
}
#[test]
fn hex_encode_zero_pads_low_bytes() {
assert_eq!(hex_encode(&[0x00, 0x0f, 0xff]), "000fff");
}
}

View File

@@ -1,244 +0,0 @@
//! Centralised URL helpers for the crawler subsystem.
//!
//! Three near-identical hand-rolled URL parsers used to live in
//! `crawler::session`, `crawler::rate_limit`, and `crawler::pipeline`
//! respectively, each with subtly different edge-case behaviour
//! around port handling and IPv6 literals. They're consolidated here
//! so the divergence can't drift again.
//!
//! The hand-rolled implementations are kept intentionally — they
//! preserve the exact semantics every existing test pins. A future
//! refactor can switch to `reqwest::Url` if it can be done without
//! changing those semantics.
/// Lowercased host (no port). Returns `None` for inputs without a
/// `scheme://host` shape — those would never have reached the network
/// layer anyway. Used by the per-host rate limiter as its bucket key.
///
/// The authority component ends at the first `/`, `?` or `#`
/// (RFC 3986 §3.2), so `https://example.com?next=a:b` yields
/// `example.com` rather than a key polluted by the query string. An
/// optional `userinfo@` prefix is dropped so credentials never end up
/// in a bucket key.
///
/// IPv6 literals are kept in their `[::1]` bracketed form so the
/// `rsplit_once(':')` port-stripping logic doesn't split inside the
/// address (e.g. `https://[::1]/foo` used to return `"[:"` because
/// the rightmost `:` is inside the literal). Buckets keyed by
/// `[::1]` vs `::1` are still uniquely-per-host; the brackets are
/// cosmetic.
pub fn host_of(url: &str) -> Option<String> {
    let after_scheme = url.split_once("://")?.1;
    // Cut the authority first: `@` and `:` may legitimately appear
    // later in the path, query or fragment.
    let authority = after_scheme.split(['/', '?', '#']).next()?;
    let host_with_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, host)| host);
    let host = if host_with_port.starts_with('[') {
        // IPv6 literal: keep through the closing bracket. There may
        // be a trailing `:port` after `]`; strip only that.
        match host_with_port.rfind(']') {
            Some(end) => &host_with_port[..=end],
            None => host_with_port,
        }
    } else {
        // Hostnames and IPv4 literals: trailing `:port` (if any) is
        // after the last `:`.
        host_with_port
            .rsplit_once(':')
            .map_or(host_with_port, |(h, _)| h)
    };
    (!host.is_empty()).then(|| host.to_ascii_lowercase())
}
/// `scheme://host[:port]` — the path, query and fragment are removed;
/// the port (if any) is kept and the case is left untouched. Used by
/// the metadata pass to seed `sources.base_url` from
/// `CRAWLER_START_URL`.
///
/// Returns `None` when the input has no `://` or when nothing sits
/// between the scheme and the first `/`, `?` or `#`. A bare
/// `scheme://` is not a usable origin.
pub fn origin_of(url: &str) -> Option<String> {
    let (scheme, rest) = url.split_once("://")?;
    // RFC 3986 §3.2: the authority is terminated by `/`, `?`, `#` or
    // the end of the URL, not by `/` alone.
    let authority = rest.split(['/', '?', '#']).next()?;
    if authority.is_empty() {
        return None;
    }
    Some(format!("{scheme}://{authority}"))
}
/// Approximate registrable-domain calculation: take the last two
/// dot-labels of the host, prefix with `.`. Used to set a parent-
/// domain cookie so the catalog's `www.` / `m.` redirects don't drop
/// the cookie mid-crawl.
///
/// Caveat: wrong for multi-part TLDs (`.co.uk`, `.com.br`). The
/// operator can override via `CRAWLER_COOKIE_DOMAIN`; pulling in the
/// Public Suffix List for one knob isn't worth it yet.
///
/// Bare hostnames (e.g. `localhost`) return the host as-is, with no
/// leading dot — setting `.localhost` as a cookie domain is invalid.
/// IP literals are returned unchanged as well: IPv6 in bracketed form
/// (`[::1]`), IPv4 as the dotted quad. Taking "the last two labels" of
/// an address (`127.0.0.1` → `.0.1`) would produce a domain unrelated
/// to the host.
///
/// The authority ends at the first `/`, `?` or `#`, and an optional
/// `userinfo@` prefix is ignored. Same `starts_with('[')` branch as
/// [`host_of`] for consistent IPv6 handling across the module.
///
/// Returns `None` when there is no `://` or the host is empty.
pub fn registrable_domain(url: &str) -> Option<String> {
    let after_scheme = url.split_once("://")?.1;
    let authority = after_scheme.split(['/', '?', '#']).next()?;
    let host_with_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, host)| host);

    if host_with_port.starts_with('[') {
        // IPv6 literal: keep through the closing bracket; an optional
        // `:port` follows `]`. Returned as-is so an embedded dotted
        // quad (`[::ffff:1.2.3.4]`) is never split into "labels".
        let literal = match host_with_port.rfind(']') {
            Some(end) => &host_with_port[..=end],
            None => host_with_port,
        };
        return Some(literal.to_ascii_lowercase());
    }

    let host = host_with_port
        .rsplit_once(':')
        .map_or(host_with_port, |(h, _)| h)
        .to_ascii_lowercase();
    if host.is_empty() {
        return None;
    }
    // IPv4 literal: has dots but no registrable parent domain.
    if host.parse::<std::net::Ipv4Addr>().is_ok() {
        return Some(host);
    }

    let labels: Vec<&str> = host.split('.').filter(|l| !l.is_empty()).collect();
    if labels.len() < 2 {
        return Some(host);
    }
    let registrable = &labels[labels.len() - 2..];
    Some(format!(".{}", registrable.join(".")))
}
/// Rewrite a proxy URL into the form Chromium's `--proxy-server=` flag
/// accepts.
///
/// reqwest distinguishes `socks5://` (resolve locally) from
/// `socks5h://` (resolve through the SOCKS server — what we want when
/// the proxy is TOR, so the host's resolver never sees the target
/// hostname). Chromium has no `socks5h` scheme and fails navigations
/// with `ERR_NO_SUPPORTED_PROXIES` when given one. It already sends
/// destination hostnames over SOCKS5 by default, so renaming the
/// scheme to `socks5` keeps the remote-DNS behaviour.
///
/// Any URL that does not start with `socks5h://` is returned as-is.
pub fn chromium_proxy_arg(proxy: &str) -> String {
    match proxy.strip_prefix("socks5h://") {
        Some(host_and_port) => ["socks5://", host_and_port].concat(),
        None => proxy.to_owned(),
    }
}
#[cfg(test)]
mod tests {
// Pins the exact edge-case behaviour of the hand-rolled URL helpers
// (port stripping, lowercasing, IPv6 literals, proxy scheme rename).
use super::*;
#[test]
fn host_of_strips_port_and_lowercases() {
assert_eq!(
host_of("https://CDN.Example.com:443/x").as_deref(),
Some("cdn.example.com")
);
assert_eq!(host_of("http://localhost/").as_deref(), Some("localhost"));
// No `://` at all: not URL-shaped, so no host.
assert_eq!(host_of("not a url"), None);
}
#[test]
fn host_of_keeps_bracketed_ipv6_literal_intact() {
// Regression: the old impl rsplit_once(':')'d the IPv6 address,
// returning "[:" instead of "[::1]". A real IPv6 source would
// silently get a wrong rate-limit bucket key.
assert_eq!(host_of("https://[::1]/").as_deref(), Some("[::1]"));
assert_eq!(host_of("https://[::1]:8080/").as_deref(), Some("[::1]"));
assert_eq!(
host_of("https://[2001:db8::1]/foo").as_deref(),
Some("[2001:db8::1]")
);
assert_eq!(
host_of("https://[2001:db8::1]:443/foo").as_deref(),
Some("[2001:db8::1]")
);
}
#[test]
fn origin_of_returns_scheme_and_host() {
// Path and query are dropped; scheme and host are kept verbatim.
assert_eq!(
origin_of("https://example.com/some/path?q=1").as_deref(),
Some("https://example.com")
);
assert_eq!(origin_of("garbage"), None);
}
#[test]
fn registrable_domain_strips_subdomain() {
assert_eq!(
registrable_domain("https://www.target-site.com/manga/foo/").as_deref(),
Some(".target-site.com")
);
assert_eq!(
registrable_domain("https://m.example.org").as_deref(),
Some(".example.org")
);
}
#[test]
fn registrable_domain_keeps_two_label_host() {
// A host that is already two labels only gains the leading dot.
assert_eq!(
registrable_domain("https://example.com/").as_deref(),
Some(".example.com")
);
}
#[test]
fn registrable_domain_handles_port() {
assert_eq!(
registrable_domain("http://www.foo.bar:8080/x").as_deref(),
Some(".foo.bar")
);
}
#[test]
fn registrable_domain_bare_hostname_no_leading_dot() {
assert_eq!(
registrable_domain("http://localhost:5173").as_deref(),
Some("localhost")
);
}
#[test]
fn registrable_domain_returns_none_for_garbage() {
assert!(registrable_domain("not a url").is_none());
}
#[test]
fn registrable_domain_keeps_bracketed_ipv6_literal_intact() {
// Symmetric with host_of's IPv6 fix. The cookie-domain code
// won't accept an IP as a `Domain` value, but the function
// should at least return a sensible representation rather
// than the truncated `"[:"` the old port-stripper produced.
assert_eq!(
registrable_domain("https://[::1]/").as_deref(),
Some("[::1]")
);
assert_eq!(
registrable_domain("https://[::1]:8080/").as_deref(),
Some("[::1]")
);
assert_eq!(
registrable_domain("https://[2001:db8::1]/foo").as_deref(),
Some("[2001:db8::1]")
);
}
#[test]
fn chromium_proxy_arg_strips_socks5h_to_socks5() {
// Regression: passing socks5h:// to Chromium yields
// ERR_NO_SUPPORTED_PROXIES at navigation time.
assert_eq!(
chromium_proxy_arg("socks5h://127.0.0.1:9050"),
"socks5://127.0.0.1:9050"
);
assert_eq!(
chromium_proxy_arg("socks5h://tor:9050"),
"socks5://tor:9050"
);
}
#[test]
fn chromium_proxy_arg_passes_socks5_unchanged() {
assert_eq!(
chromium_proxy_arg("socks5://127.0.0.1:9050"),
"socks5://127.0.0.1:9050"
);
}
#[test]
fn chromium_proxy_arg_passes_non_socks_unchanged() {
assert_eq!(
chromium_proxy_arg("http://proxy.example:8080"),
"http://proxy.example:8080"
);
}
}

View File

@@ -1,15 +0,0 @@
use chrono::{DateTime, Utc};
use serde::Serialize;
use sqlx::FromRow;
use uuid::Uuid;
/// One admin audit-log record. Derives `FromRow` so it can be read
/// straight from a query result, and `Serialize` so it can be returned
/// in an API response.
#[derive(Debug, Clone, Serialize, FromRow)]
pub struct AdminAuditEntry {
/// Identifier of this audit record.
pub id: Uuid,
/// User who performed the action. Optional here.
/// NOTE(review): presumably nullable so a record can outlive its
/// user — confirm against the migration.
pub actor_user_id: Option<Uuid>,
/// Free-form name of the action that was performed.
pub action: String,
/// Kind of entity the action targeted.
pub target_kind: String,
/// Identifier of the targeted entity, when there is one.
pub target_id: Option<Uuid>,
/// Arbitrary JSON describing the action.
pub payload: serde_json::Value,
/// Timestamp of the record (UTC).
pub at: DateTime<Utc>,
}

View File

@@ -1,4 +1,3 @@
pub mod admin_audit;
pub mod api_token;
pub mod author;
pub mod bookmark;
@@ -10,13 +9,11 @@ pub mod page;
pub mod patch;
pub mod read_progress;
pub mod session;
pub mod sync_state;
pub mod tag;
pub mod upload_entry;
pub mod user;
pub mod user_preferences;
pub use admin_audit::AdminAuditEntry;
pub use api_token::ApiToken;
pub use author::{Author, AuthorRef, AuthorWithCount};
pub use bookmark::{Bookmark, BookmarkSummary};
@@ -28,7 +25,6 @@ pub use page::Page;
pub use patch::Patch;
pub use read_progress::{ReadProgress, ReadProgressForManga, ReadProgressSummary};
pub use session::Session;
pub use sync_state::{ChapterSyncState, MangaSyncState};
pub use tag::{Tag, TagRef};
pub use upload_entry::UploadEntry;
pub use user::User;

View File

@@ -1,48 +0,0 @@
//! Sync-state enums derived per-manga / per-chapter from `manga_sources`,
//! `chapter_sources`, and `crawler_jobs` at query time. No state column
//! is persisted on `mangas` / `chapters` — see `repo::admin_view` for the
//! derivation rules and priority order.
use serde::{Deserialize, Serialize};
/// Derived sync status of a manga. Encoded as a snake_case string for
/// both serde and sqlx (`in_progress`, `dropped`, `synced`); on the
/// sqlx side the value is typed as `text`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, sqlx::Type)]
#[sqlx(type_name = "text", rename_all = "snake_case")]
#[serde(rename_all = "snake_case")]
pub enum MangaSyncState {
/// A `sync_manga` or `sync_chapter_list` job is currently
/// pending or running for this manga.
InProgress,
/// At least one `manga_sources` row exists for this manga and ALL of
/// them have `dropped_at IS NOT NULL` — every source we know about
/// has stopped surfacing it.
Dropped,
/// Default healthy state: at least one live source row OR the manga
/// was user-uploaded (no `manga_sources` rows at all).
Synced,
}
/// Derived sync status of a chapter. Encoded as a snake_case string
/// for both serde and sqlx (`downloading`, `dropped`, `failed`,
/// `not_downloaded`, `synced`); on the sqlx side the value is typed as
/// `text`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, sqlx::Type)]
#[sqlx(type_name = "text", rename_all = "snake_case")]
#[serde(rename_all = "snake_case")]
pub enum ChapterSyncState {
/// A `sync_chapter_content` job is currently pending or running for
/// this chapter (the 0014 dedup index guarantees at most one).
Downloading,
/// At least one `chapter_sources` row exists AND all of them are
/// `dropped_at IS NOT NULL`.
Dropped,
/// `page_count = 0` AND a `dead` `sync_chapter_content` job exists
/// for this chapter. Checked BEFORE `NotDownloaded` so the more
/// informative "we tried and it died" state wins over "we never
/// got around to it". Does NOT fire when `page_count > 0`, because
/// pages on disk mean the chapter IS synced regardless of historical
/// job failures — see the priority comment in `repo::admin_view`.
Failed,
/// `page_count = 0` and no in-flight or failed job — the chapter
/// row exists but content has never been downloaded.
NotDownloaded,
/// `page_count > 0` — content has been downloaded at some point.
/// Reaped `done` jobs in `crawler_jobs` mean we can't read this from
/// the job table, so `page_count` is the durable truth.
Synced,
}

View File

@@ -10,5 +10,4 @@ pub struct User {
#[serde(skip)]
pub password_hash: String,
pub created_at: DateTime<Utc>,
pub is_admin: bool,
}

View File

@@ -21,16 +21,6 @@ pub enum AppError {
PayloadTooLarge(String),
#[error("unsupported media type: {0}")]
UnsupportedMediaType(String),
/// 503 — a feature is currently unavailable, distinct from a 5xx
/// internal error. Used when admin actions require the crawler
/// daemon but it's been disabled (`CRAWLER_DAEMON=false`).
#[error("service unavailable: {0}")]
ServiceUnavailable(String),
/// 429 with an optional `Retry-After` header value (in seconds).
#[error("too many requests")]
TooManyRequests {
retry_after_secs: Option<u64>,
},
/// Semantic per-field validation failure. `details` is rendered into the
/// envelope so the client can highlight the bad field(s).
#[error("validation failed")]
@@ -61,8 +51,6 @@ impl AppError {
AppError::Conflict(_) => "conflict",
AppError::PayloadTooLarge(_) => "payload_too_large",
AppError::UnsupportedMediaType(_) => "unsupported_media_type",
AppError::ServiceUnavailable(_) => "service_unavailable",
AppError::TooManyRequests { .. } => "too_many_requests",
AppError::ValidationFailed { .. } => "validation_failed",
AppError::Database(sqlx::Error::RowNotFound) => "not_found",
AppError::Database(_) => "internal_error",
@@ -91,34 +79,6 @@ impl IntoResponse for AppError {
AppError::UnsupportedMediaType(msg) => {
(StatusCode::UNSUPPORTED_MEDIA_TYPE, msg.clone(), None)
}
AppError::ServiceUnavailable(msg) => {
(StatusCode::SERVICE_UNAVAILABLE, msg.clone(), None)
}
AppError::TooManyRequests { retry_after_secs } => {
// Emit `Retry-After: N` (RFC 6585 §4) so a well-behaved
// client can back off correctly. Done by building the
// response by hand below — the `(status, headers,
// body)` tuple shape doesn't fit the standard
// `(status, body)` IntoResponse path for the other
// variants.
let body = json!({
"error": {
"code": code,
"message": "too many requests; slow down",
}
});
let mut resp = (StatusCode::TOO_MANY_REQUESTS, Json(body)).into_response();
if let Some(secs) = retry_after_secs {
// `HeaderValue: From<u64>` skips both the
// intermediate `String` allocation and the
// fallible-by-shape `from_str` path.
resp.headers_mut().insert(
axum::http::header::RETRY_AFTER,
axum::http::HeaderValue::from(*secs),
);
}
return resp;
}
AppError::ValidationFailed { message, details } => (
StatusCode::UNPROCESSABLE_ENTITY,
message.clone(),

View File

@@ -1,21 +1,12 @@
use std::net::SocketAddr;
use std::time::Duration;
use tracing_subscriber::EnvFilter;
/// Upper bound on how long we're willing to wait for the crawler daemon
/// to drain before letting `main` return. Without it a wedged background
/// task (e.g. a chromiumoxide handler stuck on a dead WS) blocks the
/// process from exiting after Ctrl-C / SIGTERM.
const CRAWLER_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(5);
#[tokio::main]
async fn main() -> anyhow::Result<()> {
dotenvy::dotenv().ok();
tracing_subscriber::fmt()
.with_env_filter(
EnvFilter::try_from_default_env().unwrap_or_else(|_| {
"info,mangalord=debug,chromiumoxide::conn=off,chromiumoxide::handler=off".into()
}),
EnvFilter::try_from_default_env().unwrap_or_else(|_| "info,mangalord=debug".into()),
)
.init();
@@ -30,18 +21,9 @@ async fn main() -> anyhow::Result<()> {
.await?;
// Drain background tasks (crawler daemon) before exiting so Chromium
// gets a clean shutdown rather than relying on kill-on-drop. Bounded
// by a timeout so a wedged shutdown path can't trap the process.
// gets a clean shutdown rather than relying on kill-on-drop.
if let Some(d) = daemon {
if tokio::time::timeout(CRAWLER_SHUTDOWN_TIMEOUT, d.shutdown())
.await
.is_err()
{
tracing::warn!(
timeout_s = CRAWLER_SHUTDOWN_TIMEOUT.as_secs(),
"crawler daemon shutdown exceeded timeout; abandoning"
);
}
d.shutdown().await;
}
Ok(())
}

View File

@@ -1,32 +0,0 @@
//! Admin-action audit log writes.
//!
//! Insert is always called from inside the same transaction as the
//! action it audits — the executor parameter is `PgExecutor` so the
//! caller passes `&mut *tx` directly.
use sqlx::PgExecutor;
use uuid::Uuid;
use crate::error::AppResult;
/// Append one row to `admin_audit`.
///
/// `executor` is any `PgExecutor`, so a caller inside a transaction
/// can pass `&mut *tx` and have the audit row commit or roll back with
/// the action it describes. The `id` and `at` columns are not written
/// here.
///
/// # Errors
/// Propagates any database error from executing the statement.
pub async fn insert<'e, E>(
    executor: E,
    actor_user_id: Uuid,
    action: &str,
    target_kind: &str,
    target_id: Option<Uuid>,
    payload: serde_json::Value,
) -> AppResult<()>
where
    E: PgExecutor<'e>,
{
    const INSERT_SQL: &str =
        "INSERT INTO admin_audit (actor_user_id, action, target_kind, target_id, payload) \
         VALUES ($1, $2, $3, $4, $5)";

    // Bind order must match the column list above.
    let statement = sqlx::query(INSERT_SQL)
        .bind(actor_user_id)
        .bind(action)
        .bind(target_kind)
        .bind(target_id)
        .bind(payload);

    statement.execute(executor).await?;
    Ok(())
}

View File

@@ -1,232 +0,0 @@
//! Admin-facing read queries that join manga/chapter with the crawler
//! signals (`manga_sources`, `chapter_sources`, `crawler_jobs`) to
//! derive a sync state per row at query time.
//!
//! Priority order for `MangaSyncState`:
//! 1. `InProgress` — any pending/running `sync_manga` or
//! `sync_chapter_list` job matches this manga.
//! 2. `Dropped` — manga has source rows AND every one of them is
//! `dropped_at IS NOT NULL`.
//! 3. `Synced` — default (includes user-uploaded mangas with no
//! `manga_sources` rows at all).
//!
//! Priority order for `ChapterSyncState`:
//! 1. `Downloading` — pending/running `sync_chapter_content` for this id
//! 2. `Dropped` — chapter has source rows AND all are dropped
//! 3. `Failed` — `page_count = 0` AND a `dead` `sync_chapter_content`
//! row exists for this chapter. Constrained to `page_count = 0`
//! because once pages are on disk the chapter IS synced — a
//! historical dead job (likely from a re-download attempt that
//! crashed) is noise that gets reaped after retention. Surfacing
//! "Failed" when content is present would contradict
//! `ChapterSyncState::Synced`'s "downloaded at some point" contract.
//! 4. `NotDownloaded` — `page_count = 0`, no in-flight, no dead job
//! 5. `Synced` — `page_count > 0`
//!
//! Reminder: `done` jobs are reaped after `CRAWLER_JOB_RETENTION_DAYS`,
//! so `chapters.page_count > 0` is the durable "this is synced" signal,
//! not the job table.
use chrono::{DateTime, Utc};
use serde::Serialize;
use sqlx::{FromRow, PgPool};
use uuid::Uuid;
use crate::domain::{ChapterSyncState, MangaSyncState};
use crate::error::AppResult;
/// One manga row of the admin list: the manga's own columns plus
/// values derived at query time.
#[derive(Debug, Serialize, FromRow)]
pub struct AdminMangaRow {
pub id: Uuid,
pub title: String,
pub status: String,
pub cover_image_path: Option<String>,
pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>,
/// Derived per row by the sync-state `CASE` expression; not a stored
/// column.
pub sync_state: MangaSyncState,
/// Number of `chapters` rows belonging to this manga.
pub chapter_count: i64,
/// Most recent `last_seen_at` among this manga's non-dropped
/// `manga_sources` rows; `None` when there are none.
pub latest_seen_at: Option<DateTime<Utc>>,
}
/// Filters and paging for the admin manga list.
///
/// Note that the derived `Default` leaves `limit` at 0, which selects
/// no rows; callers are expected to set it.
#[derive(Debug, Default)]
pub struct ListAdminMangasQuery {
/// Case-insensitive substring to look for in the title. `None`
/// disables the filter.
pub search: Option<String>,
/// Keep only rows whose derived state equals this value. `None`
/// disables the filter.
pub sync_state: Option<MangaSyncState>,
/// Bound to SQL `LIMIT`.
pub limit: i64,
/// Bound to SQL `OFFSET`.
pub offset: i64,
}
/// SQL `CASE` expression that yields a manga's derived sync state as
/// text (`'in_progress'`, `'dropped'` or `'synced'`). It is spliced
/// into a larger statement and expects the surrounding query to expose
/// the `mangas` row under the alias `m`.
///
/// Branch order is the priority order:
/// 1. `in_progress` — a pending/running `crawler_jobs` row is either a
///    `sync_chapter_list` job whose payload `manga_id` is this manga,
///    or a `sync_manga` job whose payload `source_id` and
///    `source_manga_key` match one of this manga's `manga_sources` rows.
/// 2. `dropped` — the manga has `manga_sources` rows and none of them
///    has `dropped_at IS NULL`.
/// 3. `synced` — everything else.
const MANGA_SYNC_STATE_CASE: &str = r#"
CASE
WHEN EXISTS (
SELECT 1 FROM crawler_jobs cj
WHERE cj.state IN ('pending','running')
AND (
(cj.payload->>'kind' = 'sync_chapter_list'
AND (cj.payload->>'manga_id')::uuid = m.id)
OR (cj.payload->>'kind' = 'sync_manga'
AND EXISTS (
SELECT 1 FROM manga_sources ms
WHERE ms.manga_id = m.id
AND ms.source_id = cj.payload->>'source_id'
AND ms.source_manga_key = cj.payload->>'source_manga_key'
))
)
) THEN 'in_progress'
WHEN EXISTS (SELECT 1 FROM manga_sources ms WHERE ms.manga_id = m.id)
AND NOT EXISTS (
SELECT 1 FROM manga_sources ms
WHERE ms.manga_id = m.id AND ms.dropped_at IS NULL
)
THEN 'dropped'
ELSE 'synced'
END
"#;
/// Paginated admin manga list with derived sync state and total count.
/// Filters by `search` (literal substring on title, case-insensitive)
/// and `sync_state` (post-derivation). The CTE keeps the case expression
/// in one place — the same projection feeds both the page rows and the
/// totals count under the same filter.
///
/// The search term is trimmed; a blank term disables the filter. The
/// `ILIKE` metacharacters `%`, `_` and `\` are backslash-escaped, so a
/// search for `100%` matches that literal text instead of acting as a
/// wildcard.
///
/// Rows are ordered newest-updated first, with `id` as a tiebreaker so
/// `LIMIT`/`OFFSET` pages are stable when several rows share the same
/// `updated_at`.
///
/// Returns `(page_rows, total_matching_rows)`. The two values come from
/// separate statements, so they can disagree briefly under concurrent
/// writes.
///
/// # Errors
/// Propagates any database error from either statement.
pub async fn list_mangas_with_sync_state(
    pool: &PgPool,
    q: &ListAdminMangasQuery,
) -> AppResult<(Vec<AdminMangaRow>, i64)> {
    let search_pat: Option<String> = q
        .search
        .as_deref()
        .map(str::trim)
        .filter(|term| !term.is_empty())
        .map(|term| {
            let mut pat = String::with_capacity(term.len() + 2);
            pat.push('%');
            for ch in term.chars() {
                // Backslash is the default LIKE/ILIKE escape character.
                if matches!(ch, '\\' | '%' | '_') {
                    pat.push('\\');
                }
                pat.push(ch);
            }
            pat.push('%');
            pat
        });

    // sqlx::Type → text: bind the snake_case representation manually so
    // the SQL can compare it as text without an explicit cast.
    let sync_filter = q.sync_state.map(|s| match s {
        MangaSyncState::InProgress => "in_progress",
        MangaSyncState::Dropped => "dropped",
        MangaSyncState::Synced => "synced",
    });

    let sql = format!(
        r#"
WITH classified AS (
SELECT
m.id, m.title, m.status, m.cover_image_path,
m.created_at, m.updated_at,
{case} AS sync_state,
(SELECT COUNT(*) FROM chapters c WHERE c.manga_id = m.id) AS chapter_count,
(SELECT MAX(last_seen_at) FROM manga_sources ms
WHERE ms.manga_id = m.id AND ms.dropped_at IS NULL) AS latest_seen_at
FROM mangas m
WHERE ($1::text IS NULL OR m.title ILIKE $1)
)
SELECT * FROM classified
WHERE ($2::text IS NULL OR sync_state = $2)
ORDER BY updated_at DESC, id ASC
LIMIT $3 OFFSET $4
"#,
        case = MANGA_SYNC_STATE_CASE
    );
    let items: Vec<AdminMangaRow> = sqlx::query_as(&sql)
        .bind(&search_pat)
        .bind(sync_filter)
        .bind(q.limit)
        .bind(q.offset)
        .fetch_all(pool)
        .await?;

    // Same filters, no paging: the number of rows the list could show.
    let total_sql = format!(
        r#"
WITH classified AS (
SELECT {case} AS sync_state
FROM mangas m
WHERE ($1::text IS NULL OR m.title ILIKE $1)
)
SELECT COUNT(*) FROM classified
WHERE ($2::text IS NULL OR sync_state = $2)
"#,
        case = MANGA_SYNC_STATE_CASE
    );
    let total: i64 = sqlx::query_scalar(&total_sql)
        .bind(&search_pat)
        .bind(sync_filter)
        .fetch_one(pool)
        .await?;

    Ok((items, total))
}
/// One chapter row of the admin list: the chapter's own columns plus
/// values derived at query time.
#[derive(Debug, Serialize, FromRow)]
pub struct AdminChapterRow {
pub id: Uuid,
pub manga_id: Uuid,
pub number: i32,
pub title: Option<String>,
/// Stored page count; `0` means no content has been downloaded.
pub page_count: i32,
pub created_at: DateTime<Utc>,
/// Derived per row by the `CASE` expression in the list query; not a
/// stored column.
pub sync_state: ChapterSyncState,
/// Most recent `last_seen_at` among this chapter's non-dropped
/// `chapter_sources` rows; `None` when there are none.
pub latest_seen_at: Option<DateTime<Utc>>,
}
/// Paging parameters for the admin chapter list of a single manga.
///
/// Note that the derived `Default` leaves `limit` at 0, which selects
/// no rows; callers are expected to set it.
#[derive(Debug, Default)]
pub struct ListAdminChaptersQuery {
/// Manga whose chapters are listed.
pub manga_id: Uuid,
/// Bound to SQL `LIMIT`.
pub limit: i64,
/// Bound to SQL `OFFSET`.
pub offset: i64,
}
/// Paginated chapter list with derived sync state. Pagination is non-
/// optional — long-runners can have thousands of chapters and the
/// per-row scalar subqueries make the unbounded variant a real
/// stall risk even behind an admin guard. Returns the page slice plus
/// the unfiltered total so the UI can render "showing N of M".
///
/// Rows are ordered by `number`, then `created_at`, then `id`. The
/// tiebreakers matter because several chapters of one manga may share a
/// number: with `number` alone, `LIMIT`/`OFFSET` pages could repeat or
/// skip rows between requests.
///
/// State priority (first match wins): `downloading`, `dropped`,
/// `failed`, `not_downloaded`, `synced`.
///
/// # Errors
/// Propagates any database error from either statement.
pub async fn list_chapters_with_sync_state(
    pool: &PgPool,
    q: &ListAdminChaptersQuery,
) -> AppResult<(Vec<AdminChapterRow>, i64)> {
    let items: Vec<AdminChapterRow> = sqlx::query_as(
        r#"
SELECT
c.id, c.manga_id, c.number, c.title, c.page_count, c.created_at,
CASE
WHEN EXISTS (
SELECT 1 FROM crawler_jobs cj
WHERE cj.state IN ('pending','running')
AND cj.payload->>'kind' = 'sync_chapter_content'
AND (cj.payload->>'chapter_id')::uuid = c.id
) THEN 'downloading'
WHEN EXISTS (SELECT 1 FROM chapter_sources cs WHERE cs.chapter_id = c.id)
AND NOT EXISTS (
SELECT 1 FROM chapter_sources cs
WHERE cs.chapter_id = c.id AND cs.dropped_at IS NULL
)
THEN 'dropped'
WHEN c.page_count = 0
AND EXISTS (
SELECT 1 FROM crawler_jobs cj
WHERE cj.state = 'dead'
AND cj.payload->>'kind' = 'sync_chapter_content'
AND (cj.payload->>'chapter_id')::uuid = c.id
) THEN 'failed'
WHEN c.page_count = 0 THEN 'not_downloaded'
ELSE 'synced'
END AS sync_state,
(SELECT MAX(last_seen_at) FROM chapter_sources cs
WHERE cs.chapter_id = c.id AND cs.dropped_at IS NULL) AS latest_seen_at
FROM chapters c
WHERE c.manga_id = $1
ORDER BY c.number ASC, c.created_at ASC, c.id ASC
LIMIT $2 OFFSET $3
"#,
    )
    .bind(q.manga_id)
    .bind(q.limit)
    .bind(q.offset)
    .fetch_all(pool)
    .await?;

    // Total chapters of the manga, independent of paging.
    let total: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM chapters WHERE manga_id = $1")
        .bind(q.manga_id)
        .fetch_one(pool)
        .await?;

    Ok((items, total))
}

View File

@@ -99,11 +99,6 @@ pub async fn list(
/// Atomically replace the set of authors on a manga. Caller passes a
/// `&mut PgConnection` (`&mut *tx` works) so the delete+upserts run in
/// one transaction with whatever called us.
///
/// Note: `crawler::repo::sync_authors` does a similar replace with the
/// same semantics on names. The duplication is intentional — handler
/// callers want the `Vec<AuthorRef>` for the API response; the
/// crawler doesn't need it and stays inside its own transaction.
pub async fn set_for_manga(
conn: &mut PgConnection,
manga_id: Uuid,

View File

@@ -29,9 +29,9 @@ pub async fn create(
match result {
Ok(b) => Ok(b),
Err(sqlx::Error::Database(ref db_err)) if db_err.is_unique_violation() => Err(
AppError::Conflict("bookmark already exists for this manga/chapter".into()),
),
Err(e) if is_unique_violation(&e) => Err(AppError::Conflict(
"bookmark already exists for this manga/chapter".into(),
)),
Err(e) => Err(AppError::Database(e)),
}
}
@@ -97,3 +97,10 @@ pub async fn delete(pool: &PgPool, id: Uuid) -> AppResult<()> {
Ok(())
}
fn is_unique_violation(err: &sqlx::Error) -> bool {
if let sqlx::Error::Database(db_err) = err {
db_err.code().as_deref() == Some("23505")
} else {
false
}
}

View File

@@ -4,7 +4,7 @@ use sqlx::{PgExecutor, PgPool};
use uuid::Uuid;
use crate::domain::Chapter;
use crate::error::AppResult;
use crate::error::{AppError, AppResult};
pub async fn list_for_manga(
pool: &PgPool,
@@ -12,20 +12,15 @@ pub async fn list_for_manga(
limit: i64,
offset: i64,
) -> AppResult<Vec<Chapter>> {
// Display order = source-site order reversed. The crawler stamps
// `source_index` = position in the source DOM (0 = first = newest
// on this site, see migration 0021), so DESC puts the oldest
// chapter first and keeps the site's variant grouping and the
// placement of non-numeric entries (e.g. "notice. : Officials")
// intact. NULLS LAST keeps user-uploaded chapters (no source row)
// and rows that pre-date the migration below crawled rows; the
// (number, created_at) tail then orders them deterministically.
// Secondary sort by created_at gives duplicate-numbered chapters
// (multiple uploaders/translations of the same number) a stable
// order in lists and prev/next reader navigation.
let rows = sqlx::query_as::<_, Chapter>(
r#"
SELECT id, manga_id, number, title, page_count, created_at
FROM chapters
WHERE manga_id = $1
ORDER BY source_index DESC NULLS LAST, number ASC, created_at ASC
ORDER BY number ASC, created_at ASC
LIMIT $2 OFFSET $3
"#,
)
@@ -67,9 +62,10 @@ pub async fn find_by_id_in_manga(
///
/// Chapter identity is the row UUID; the same (manga_id, number)
/// combination can repeat (multiple translations, re-uploads). The
/// 0013 migration dropped the (manga_id, number) UNIQUE, so duplicate
/// inserts succeed by design. If a future migration re-adds any
/// uniqueness, surface a 409 by adding a unique-violation arm here.
/// `is_unique_violation` branch below is a defensive holdover from
/// 0001's (manga_id, number) UNIQUE — it can no longer fire under
/// normal operation, but we surface a clean 409 if a future migration
/// re-adds any chapter uniqueness.
pub async fn create<'e, E: PgExecutor<'e>>(
executor: E,
manga_id: Uuid,
@@ -77,7 +73,7 @@ pub async fn create<'e, E: PgExecutor<'e>>(
title: Option<&str>,
uploaded_by: Option<Uuid>,
) -> AppResult<Chapter> {
let row = sqlx::query_as::<_, Chapter>(
let result = sqlx::query_as::<_, Chapter>(
r#"
INSERT INTO chapters (manga_id, number, title, uploaded_by)
VALUES ($1, $2, $3, $4)
@@ -89,75 +85,15 @@ pub async fn create<'e, E: PgExecutor<'e>>(
.bind(title)
.bind(uploaded_by)
.fetch_one(executor)
.await?;
Ok(row)
}
.await;
/// Ownership check backing `POST /bookmarks`.
///
/// The bookmarks foreign key only proves that a chapter id exists; it
/// does not prove the chapter is part of the bookmark's manga. This
/// query answers exactly that: `true` when a `chapters` row with
/// `id = chapter_id` AND `manga_id = manga_id` exists, `false`
/// otherwise. Handlers run it before inserting and answer `NotFound`
/// on `false`, so a bookmark can never point at a foreign manga's
/// chapter.
pub async fn belongs_to_manga(
    pool: &PgPool,
    chapter_id: Uuid,
    manga_id: Uuid,
) -> AppResult<bool> {
    let found: bool = sqlx::query_scalar(
        "SELECT EXISTS(SELECT 1 FROM chapters WHERE id = $1 AND manga_id = $2)",
    )
    .bind(chapter_id)
    .bind(manga_id)
    .fetch_one(pool)
    .await?;
    Ok(found)
}
/// Fetch only the `page_count` column of one chapter.
///
/// Returns `Ok(None)` when no chapter has this `id`. The crawler
/// daemon's consumer-side dedup safety net uses it to ack-done a job
/// whose chapter a racing worker has already fetched.
pub async fn page_count(pool: &PgPool, id: Uuid) -> sqlx::Result<Option<i32>> {
    let lookup = sqlx::query_scalar("SELECT page_count FROM chapters WHERE id = $1").bind(id);
    lookup.fetch_optional(pool).await
}
/// Look up the manga_id + most recent live source_url for a chapter.
/// Used by the daemon's chapter dispatcher to resolve the URL it needs
/// to hand to `content::sync_chapter_content`.
///
/// Skips soft-dropped sources (`cs.dropped_at IS NOT NULL`) and breaks
/// ties between multiple live sources by `last_seen_at DESC`, so the
/// freshest still-attached URL wins. Returns `None` when the chapter
/// is gone or all its source rows are dropped — callers in the
/// dispatcher treat `None` as "ack the job, skip the work."
///
/// The enqueue queries (`pipeline::enqueue_bookmarked_pending` and
/// `enqueue_pending_for_manga`) apply the same `dropped_at IS NULL`
/// filter — this resolver stays in lockstep so a chapter that was
/// dropped between enqueue and lease isn't dispatched against a stale
/// URL.
///
/// Returns `(manga_id, source_url, manga_title, chapter_number)`. The
/// title + number feed the live "currently crawling" status; the rest is
/// what the dispatcher needs to do the work.
pub async fn dispatch_target(
pool: &PgPool,
chapter_id: Uuid,
) -> sqlx::Result<Option<(Uuid, String, String, i32)>> {
sqlx::query_as(
"SELECT c.manga_id, cs.source_url, m.title, c.number \
FROM chapters c \
JOIN chapter_sources cs ON cs.chapter_id = c.id \
JOIN mangas m ON m.id = c.manga_id \
WHERE c.id = $1 \
AND cs.dropped_at IS NULL \
ORDER BY cs.last_seen_at DESC \
LIMIT 1",
)
.bind(chapter_id)
.fetch_optional(pool)
.await
match result {
Ok(c) => Ok(c),
Err(e) if is_unique_violation(&e) => Err(AppError::Conflict(format!(
"chapter {number} conflicts with an existing chapter for this manga"
))),
Err(e) => Err(AppError::Database(e)),
}
}
pub async fn set_page_count<'e, E: PgExecutor<'e>>(
@@ -173,3 +109,10 @@ pub async fn set_page_count<'e, E: PgExecutor<'e>>(
Ok(())
}
fn is_unique_violation(err: &sqlx::Error) -> bool {
if let sqlx::Error::Database(db_err) = err {
db_err.code().as_deref() == Some("23505")
} else {
false
}
}

View File

@@ -8,18 +8,15 @@
//! updated (metadata_hash changed), or unchanged.
//! - [`sync_manga_chapters`]: per-manga chapter reconciliation. Adds
//! new ones, refreshes URLs on existing ones, soft-drops vanished.
//! - [`mark_run_started`] / [`mark_run_completed`] /
//! [`last_run_completed_cleanly`]: per-source recovery flag in
//! `crawler_state`. A `false` flag on tick start means the previous
//! run did not exit cleanly and the next walk should ignore the
//! early-stop condition.
//! - [`mark_dropped_mangas`]: end-of-run pass. Any manga from this
//! source whose `last_seen_at` is older than the run start is
//! soft-dropped.
//!
//! Each public function is a transaction boundary so a partial failure
//! mid-call leaves the DB in its pre-call state.
use chrono::{DateTime, Utc};
use serde::Serialize;
use sqlx::{FromRow, PgPool, Postgres, Transaction};
use sqlx::{PgPool, Postgres, Transaction};
use uuid::Uuid;
use crate::crawler::source::{SourceChapterRef, SourceManga};
@@ -277,20 +274,7 @@ async fn sync_tags(
manga_id: Uuid,
tags: &[String],
) -> sqlx::Result<()> {
// Only clear crawler-owned attachments (added_by IS NULL). User-
// attached tags are owned by the attaching user and must survive
// the recurring metadata pass — see manga_tags.added_by in
// migration 0009.
//
// Note on orphans: `manga_tags.added_by` is `ON DELETE SET NULL`,
// so an attachment whose user was deleted becomes
// indistinguishable from a crawler-owned row and is cleaned up
// here. That mirrors how `api::mangas::detach_tag` already treats
// orphans ("nobody owns it, refuse to let anyone but admin clear
// them") — the crawler now becomes the eventual reaper. Tracked
// by `sync_tags_garbage_collects_orphan_user_attachments` in
// backend/tests/crawler_sync.rs.
sqlx::query("DELETE FROM manga_tags WHERE manga_id = $1 AND added_by IS NULL")
sqlx::query("DELETE FROM manga_tags WHERE manga_id = $1")
.bind(manga_id)
.execute(&mut **tx)
.await?;
@@ -331,53 +315,18 @@ pub async fn sync_manga_chapters(
chapters: &[SourceChapterRef],
) -> sqlx::Result<ChapterDiff> {
let mut tx = pool.begin().await?;
// Per-manga advisory lock. Two concurrent calls for the same manga
// would otherwise both read `seen_keys`, both run the drop UPDATE
// filtered on `NOT (key = ANY $3)`, and the later commit could soft-
// drop a chapter the earlier commit had just inserted (lost-update
// shape under MVCC). `pg_advisory_xact_lock` is scoped to this
// transaction: it auto-releases on COMMIT/ROLLBACK so a Rust-side
// panic mid-call doesn't strand the lock. The single-arg int8 form
// keyed by `hashtextextended(manga_id::text, 0)` shares Postgres'
// global advisory-lock namespace with `CRON_LOCK_KEY`, but collision
// is 2^-64 per pair (a UUID-derived hash hitting the fixed cron key
// is effectively impossible).
sqlx::query("SELECT pg_advisory_xact_lock(hashtextextended($1::text, 0))")
.bind(manga_id)
.execute(&mut *tx)
.await?;
let mut diff = ChapterDiff::default();
let seen_keys: Vec<String> = chapters
.iter()
.map(|c| c.source_chapter_key.clone())
.collect();
for (idx, c) in chapters.iter().enumerate() {
// `source_index` captures the chapter's position in the source
// DOM (0 = first = newest on this site) so the list query can
// reverse it for the user-facing list — see migration 0021.
// Every sync overwrites the value on both branches, so a new
// chapter inserted at the top of the source shifts every other
// row down by one on the next tick.
let source_index = idx as i32;
// Lookup is constrained by manga_id (via the chapters join) so a
// source whose chapter slugs collide across mangas (e.g.
// "chapter-1" appearing under two different mangas) attributes
// each row to the correct manga. Migration 0017 dropped the
// (source_id, source_chapter_key) PK in favour of
// (source_id, chapter_id) for exactly this reason.
for c in chapters {
let existing: Option<(Uuid,)> = sqlx::query_as(
"SELECT cs.chapter_id \
FROM chapter_sources cs \
JOIN chapters ch ON ch.id = cs.chapter_id \
WHERE cs.source_id = $1 \
AND cs.source_chapter_key = $2 \
AND ch.manga_id = $3",
"SELECT chapter_id FROM chapter_sources WHERE source_id = $1 AND source_chapter_key = $2",
)
.bind(source_id)
.bind(&c.source_chapter_key)
.bind(manga_id)
.fetch_optional(&mut *tx)
.await?;
@@ -390,15 +339,14 @@ pub async fn sync_manga_chapters(
// identity is the UUID, not the number.
let (chapter_id,): (Uuid,) = sqlx::query_as(
r#"
INSERT INTO chapters (manga_id, number, title, page_count, source_index)
VALUES ($1, $2, $3, 0, $4)
INSERT INTO chapters (manga_id, number, title, page_count)
VALUES ($1, $2, $3, 0)
RETURNING id
"#,
)
.bind(manga_id)
.bind(c.number)
.bind(c.title.as_deref())
.bind(source_index)
.fetch_one(&mut *tx)
.await?;
sqlx::query(
@@ -417,27 +365,21 @@ pub async fn sync_manga_chapters(
diff.new += 1;
}
Some((chapter_id,)) => {
sqlx::query(
"UPDATE chapters SET title = $1, source_index = $2 WHERE id = $3",
)
sqlx::query("UPDATE chapters SET title = $1 WHERE id = $2")
.bind(c.title.as_deref())
.bind(source_index)
.bind(chapter_id)
.execute(&mut *tx)
.await?;
// chapter_id is now the natural per-(source, chapter)
// identifier — use it directly instead of re-keying on
// (source_id, source_chapter_key) which may not be unique.
sqlx::query(
r#"
UPDATE chapter_sources
SET source_url = $1, last_seen_at = NOW(), dropped_at = NULL
WHERE source_id = $2 AND chapter_id = $3
WHERE source_id = $2 AND source_chapter_key = $3
"#,
)
.bind(&c.url)
.bind(source_id)
.bind(chapter_id)
.bind(&c.source_chapter_key)
.execute(&mut *tx)
.await?;
diff.refreshed += 1;
@@ -470,52 +412,19 @@ pub async fn sync_manga_chapters(
Ok(diff)
}
/// Count the chapters that the source `(source_id, source_manga_key)`
/// is currently known to attach to — i.e. the number of `chapter_sources`
/// rows for the manga identified by the (source_id, source_manga_key)
/// pair, restricted to live (`dropped_at IS NULL`) rows.
/// Record that a complete Backfill walk has finished for `source_id`.
/// The presence of this row is what the daemon's mode auto-detection
/// uses to flip from Backfill to Incremental on subsequent ticks.
///
/// Used by the metadata pass's partial-render guard: if `fetch_manga`
/// returns an empty `chapters` Vec but the source previously surfaced
/// chapters here, that's most likely a chromium snapshot taken between
/// the `#chapter_table` wrapper render and its rows render — the
/// safest move is to skip `sync_manga_chapters` so the soft-drop
/// branch doesn't flip every existing chapter to `dropped_at`.
///
/// Returns `Ok(0)` when the manga is brand-new (no `manga_sources`
/// row yet), which is the legitimate "this manga has no chapters yet"
/// case and must NOT be flagged.
pub async fn live_chapter_count_for_source_manga(
/// Keyed `seed_completed:<source_id>` in `crawler_state`. JSON payload
/// stores the timestamp so we can surface "last fully reseeded at" in
/// future ops tooling without another migration.
pub async fn mark_seed_completed(
pool: &PgPool,
source_id: &str,
source_manga_key: &str,
) -> sqlx::Result<i64> {
let row: Option<(i64,)> = sqlx::query_as(
"SELECT COUNT(*) \
FROM chapter_sources cs \
JOIN chapters c ON c.id = cs.chapter_id \
JOIN manga_sources ms \
ON ms.manga_id = c.manga_id \
AND ms.source_id = cs.source_id \
WHERE ms.source_id = $1 \
AND ms.source_manga_key = $2 \
AND cs.dropped_at IS NULL",
)
.bind(source_id)
.bind(source_manga_key)
.fetch_optional(pool)
.await?;
Ok(row.map(|(n,)| n).unwrap_or(0))
}
/// Mark a metadata pass as in-flight for `source_id`. Stamps
/// `last_run_completed:<source_id>` in `crawler_state` with
/// `{"completed": false, "at": now}`. A crash, panic, or SIGKILL after
/// this point leaves the flag at `false`, which the next tick reads as
/// "previous run did not exit cleanly — walk the full catalog this
/// time" (recovery sweep).
pub async fn mark_run_started(pool: &PgPool, source_id: &str) -> sqlx::Result<()> {
let key = format!("last_run_completed:{source_id}");
at: DateTime<Utc>,
) -> sqlx::Result<()> {
let key = format!("seed_completed:{source_id}");
sqlx::query(
"INSERT INTO crawler_state (key, value, updated_at) \
VALUES ($1, $2, now()) \
@@ -523,423 +432,50 @@ pub async fn mark_run_started(pool: &PgPool, source_id: &str) -> sqlx::Result<()
SET value = EXCLUDED.value, updated_at = now()",
)
.bind(&key)
.bind(serde_json::json!({
"completed": false,
"at": Utc::now().to_rfc3339(),
}))
.bind(serde_json::json!({ "at": at.to_rfc3339() }))
.execute(pool)
.await?;
Ok(())
}
/// Record that the metadata pass for `source_id` finished cleanly.
///
/// Upserts the `crawler_state` row keyed `last_run_completed:<source_id>`
/// with the JSON payload `{"completed": true, "at": <now, RFC 3339>}`,
/// refreshing `updated_at` as well. Intended to be called when a run
/// reaches end-of-walk or its intentional stop, so the next tick reads
/// `true` and applies the normal stop condition.
pub async fn mark_run_completed(pool: &PgPool, source_id: &str) -> sqlx::Result<()> {
    let state_key = format!("last_run_completed:{source_id}");
    let payload = serde_json::json!({
        "completed": true,
        "at": Utc::now().to_rfc3339(),
    });

    sqlx::query(
        "INSERT INTO crawler_state (key, value, updated_at) \
         VALUES ($1, $2, now()) \
         ON CONFLICT (key) DO UPDATE \
         SET value = EXCLUDED.value, updated_at = now()",
    )
    .bind(&state_key)
    .bind(payload)
    .execute(pool)
    .await
    .map(|_| ())
}
/// Find mangas that still lack a cover (`cover_image_path IS NULL`)
/// while at least one live (`dropped_at IS NULL`) `manga_sources` row
/// attaches them to a source. The result feeds the cover-backfill pass
/// in [`crate::crawler::pipeline`]: each entry pairs a manga with the
/// source row a cover re-download should use.
///
/// `DISTINCT ON (m.id)` combined with `ORDER BY m.id, ms.last_seen_at DESC`
/// keeps exactly one row per manga — the one with the newest
/// `last_seen_at` — even when several sources surface the same manga.
/// Rows come back ordered by manga id, and at most `max` are returned.
pub async fn list_missing_covers(
    pool: &PgPool,
    max: i64,
) -> sqlx::Result<Vec<MissingCoverEntry>> {
    let found: Vec<(Uuid, String, String)> = sqlx::query_as(
        r#"
        SELECT DISTINCT ON (m.id) m.id, ms.source_manga_key, ms.source_url
        FROM mangas m
        JOIN manga_sources ms ON ms.manga_id = m.id
        WHERE m.cover_image_path IS NULL
        AND ms.dropped_at IS NULL
        ORDER BY m.id, ms.last_seen_at DESC
        LIMIT $1
        "#,
    )
    .bind(max)
    .fetch_all(pool)
    .await?;

    let mut entries = Vec::with_capacity(found.len());
    for (manga_id, source_manga_key, source_url) in found {
        entries.push(MissingCoverEntry {
            manga_id,
            source_manga_key,
            source_url,
        });
    }
    Ok(entries)
}
/// One manga that has no cover yet, paired with the source row to
/// re-download it from. Produced by `list_missing_covers`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MissingCoverEntry {
/// `mangas.id` of the cover-less manga.
pub manga_id: Uuid,
/// `manga_sources.source_manga_key` of the selected source row.
pub source_manga_key: String,
/// `manga_sources.source_url` of the selected source row.
pub source_url: String,
}
/// Read the recovery flag for `source_id`. A missing row OR an
/// unparseable value reads as `true` ("clean") — the former covers the
/// first-ever run on a virgin DB (no recovery needed), the latter
/// covers forward-compat against future schema changes; both fail-safe
/// toward not making an operator pay for an unnecessary full sweep.
pub async fn last_run_completed_cleanly(
/// Read the timestamp written by [`mark_seed_completed`], if any.
/// `None` means no complete Backfill has ever finished for this
/// source — the daemon should run Backfill on the next tick.
pub async fn seed_completed_at(
pool: &PgPool,
source_id: &str,
) -> sqlx::Result<bool> {
let key = format!("last_run_completed:{source_id}");
) -> sqlx::Result<Option<DateTime<Utc>>> {
let key = format!("seed_completed:{source_id}");
let row: Option<serde_json::Value> =
sqlx::query_scalar("SELECT value FROM crawler_state WHERE key = $1")
.bind(&key)
.fetch_optional(pool)
.await?;
Ok(row
.and_then(|v| v.get("completed").and_then(|b| b.as_bool()))
.unwrap_or(true))
Ok(row.and_then(|v| {
v.get("at")
.and_then(|s| s.as_str())
.and_then(|s| DateTime::parse_from_rfc3339(s).ok())
.map(|dt| dt.with_timezone(&Utc))
}))
}
// ---------------------------------------------------------------------------
// Dead-letter jobs: admin observability + requeue.
// ---------------------------------------------------------------------------
/// A `dead` crawler job joined to its chapter/manga context for the admin
/// dead-letter view. Chapter columns are `Option` because the join is
/// best-effort (the chapter may have been removed since the job died, or
/// the job may be a non-chapter kind).
///
/// Derives `FromRow`, so the field names must line up with the column
/// names/aliases selected by `list_dead_jobs`.
#[derive(Debug, Clone, Serialize, FromRow)]
pub struct DeadJob {
/// `crawler_jobs.id`.
pub id: Uuid,
/// The `kind` entry of the job's JSON payload (`payload->>'kind'`).
pub kind: String,
/// The payload's `chapter_id` entry cast to a UUID, when present.
pub chapter_id: Option<Uuid>,
/// `chapters.manga_id` of the joined chapter, when the join matched.
pub manga_id: Option<Uuid>,
/// `mangas.title` of the joined manga, when the join matched.
pub manga_title: Option<String>,
/// `chapters.number` of the joined chapter, when the join matched.
pub chapter_number: Option<i32>,
/// `crawler_jobs.attempts`.
pub attempts: i32,
/// `crawler_jobs.max_attempts`.
pub max_attempts: i32,
/// `crawler_jobs.last_error`, if one was recorded.
pub last_error: Option<String>,
/// `crawler_jobs.updated_at`; the dead-letter list sorts on this,
/// newest first.
pub updated_at: DateTime<Utc>,
}
/// Paginated list of `dead` jobs, newest-failed first, joined to chapter +
/// manga context. `search` filters on manga title (case-insensitive
/// substring). Returns the page slice plus the unfiltered-by-page total.
pub async fn list_dead_jobs(
pub async fn mark_dropped_mangas(
pool: &PgPool,
search: Option<&str>,
limit: i64,
offset: i64,
) -> sqlx::Result<(Vec<DeadJob>, i64)> {
let search_pat = search
.map(|s| format!("%{}%", s.trim()))
.filter(|p| p.len() > 2);
let items: Vec<DeadJob> = sqlx::query_as(
source_id: &str,
run_started_at: DateTime<Utc>,
) -> sqlx::Result<u64> {
let res = sqlx::query(
r#"
SELECT
cj.id,
cj.payload->>'kind' AS kind,
(cj.payload->>'chapter_id')::uuid AS chapter_id,
c.manga_id AS manga_id,
m.title AS manga_title,
c.number AS chapter_number,
cj.attempts,
cj.max_attempts,
cj.last_error,
cj.updated_at
FROM crawler_jobs cj
LEFT JOIN chapters c ON c.id = (cj.payload->>'chapter_id')::uuid
LEFT JOIN mangas m ON m.id = c.manga_id
WHERE cj.state = 'dead'
AND ($1::text IS NULL OR m.title ILIKE $1)
ORDER BY cj.updated_at DESC
LIMIT $2 OFFSET $3
UPDATE manga_sources
SET dropped_at = NOW()
WHERE source_id = $1
AND last_seen_at < $2
AND dropped_at IS NULL
"#,
)
.bind(&search_pat)
.bind(limit)
.bind(offset)
.fetch_all(pool)
.bind(source_id)
.bind(run_started_at)
.execute(pool)
.await?;
let total: i64 = sqlx::query_scalar(
r#"
SELECT COUNT(*)
FROM crawler_jobs cj
LEFT JOIN chapters c ON c.id = (cj.payload->>'chapter_id')::uuid
LEFT JOIN mangas m ON m.id = c.manga_id
WHERE cj.state = 'dead'
AND ($1::text IS NULL OR m.title ILIKE $1)
"#,
)
.bind(&search_pat)
.fetch_one(pool)
.await?;
Ok((items, total))
Ok(res.rows_affected())
}
/// An in-flight chapter-content job (`pending` or `running`) joined to its
/// chapter + manga, for the "queued chapters" admin view.
///
/// Derives `FromRow`, so the field names must line up with the column
/// names/aliases selected by `list_active_jobs`. The chapter/manga
/// columns come from `LEFT JOIN`s and are therefore optional.
#[derive(Debug, Clone, Serialize, FromRow)]
pub struct ActiveJob {
/// `crawler_jobs.id`.
pub id: Uuid,
/// The payload's `chapter_id` entry cast to a UUID, when present.
pub chapter_id: Option<Uuid>,
/// `chapters.manga_id` of the joined chapter, when the join matched.
pub manga_id: Option<Uuid>,
/// `mangas.title` of the joined manga, when the join matched.
pub manga_title: Option<String>,
/// `chapters.number` of the joined chapter, when the join matched.
pub chapter_number: Option<i32>,
/// `"pending"` or `"running"`.
pub state: String,
/// `crawler_jobs.attempts`.
pub attempts: i32,
/// `crawler_jobs.max_attempts`.
pub max_attempts: i32,
/// `crawler_jobs.updated_at`.
pub updated_at: DateTime<Utc>,
}
/// Paginated list of `pending`/`running` chapter-content jobs (which
/// chapters of which mangas are queued or being crawled). Running first,
/// then by scheduled order.
///
/// `search` is a case-insensitive *substring* filter on the manga title.
/// The term is trimmed; an empty term disables the filter. The LIKE
/// metacharacters `%`, `_` and `\` in the term are escaped so they match
/// literally instead of acting as wildcards. When a filter is active,
/// jobs whose chapter/manga join found nothing are excluded (their title
/// is NULL and cannot match).
///
/// Returns the requested page (`limit`/`offset`) together with the total
/// number of rows matching the same filter, ignoring pagination.
pub async fn list_active_jobs(
    pool: &PgPool,
    search: Option<&str>,
    limit: i64,
    offset: i64,
) -> sqlx::Result<(Vec<ActiveJob>, i64)> {
    // Build `%<term>%` with the term's own LIKE metacharacters escaped.
    // Backslash is PostgreSQL's default LIKE/ILIKE escape character, so
    // no explicit ESCAPE clause is needed in the queries below.
    let search_pat: Option<String> = search
        .map(str::trim)
        .filter(|term| !term.is_empty())
        .map(|term| {
            let mut pat = String::with_capacity(term.len() + 2);
            pat.push('%');
            for ch in term.chars() {
                if matches!(ch, '\\' | '%' | '_') {
                    pat.push('\\');
                }
                pat.push(ch);
            }
            pat.push('%');
            pat
        });

    let items: Vec<ActiveJob> = sqlx::query_as(
        r#"
        SELECT
        cj.id,
        (cj.payload->>'chapter_id')::uuid AS chapter_id,
        c.manga_id AS manga_id,
        m.title AS manga_title,
        c.number AS chapter_number,
        cj.state,
        cj.attempts,
        cj.max_attempts,
        cj.updated_at
        FROM crawler_jobs cj
        LEFT JOIN chapters c ON c.id = (cj.payload->>'chapter_id')::uuid
        LEFT JOIN mangas m ON m.id = c.manga_id
        WHERE cj.state IN ('pending','running')
        AND cj.payload->>'kind' = 'sync_chapter_content'
        AND ($1::text IS NULL OR m.title ILIKE $1)
        ORDER BY (cj.state = 'running') DESC, cj.scheduled_at, cj.created_at
        LIMIT $2 OFFSET $3
        "#,
    )
    .bind(&search_pat)
    .bind(limit)
    .bind(offset)
    .fetch_all(pool)
    .await?;

    // Same FROM/WHERE as above, minus ordering and pagination.
    let total: i64 = sqlx::query_scalar(
        r#"
        SELECT COUNT(*)
        FROM crawler_jobs cj
        LEFT JOIN chapters c ON c.id = (cj.payload->>'chapter_id')::uuid
        LEFT JOIN mangas m ON m.id = c.manga_id
        WHERE cj.state IN ('pending','running')
        AND cj.payload->>'kind' = 'sync_chapter_content'
        AND ($1::text IS NULL OR m.title ILIKE $1)
        "#,
    )
    .bind(&search_pat)
    .fetch_one(pool)
    .await?;

    Ok((items, total))
}
/// A manga whose cover is still missing (queued for cover fetch).
///
/// Derives `FromRow`; `list_missing_cover_mangas` selects `m.id` and
/// `m.title` under these exact aliases.
#[derive(Debug, Clone, Serialize, FromRow)]
pub struct MissingCoverRow {
/// `mangas.id`.
pub manga_id: Uuid,
/// `mangas.title`.
pub manga_title: String,
}
/// Size of the cover backlog: the number of mangas with
/// `cover_image_path IS NULL` that still have at least one live
/// (`dropped_at IS NULL`) `manga_sources` row. This is the backlog the
/// metadata pass + backfill drain.
pub async fn count_missing_covers(pool: &PgPool) -> sqlx::Result<i64> {
    let backlog: i64 = sqlx::query_scalar(
        r#"
        SELECT COUNT(*) FROM mangas m
        WHERE m.cover_image_path IS NULL
        AND EXISTS (
        SELECT 1 FROM manga_sources ms
        WHERE ms.manga_id = m.id AND ms.dropped_at IS NULL
        )
        "#,
    )
    .fetch_one(pool)
    .await?;
    Ok(backlog)
}
/// Paginated list of mangas queued for a cover fetch (no cover yet + a live
/// source), with titles. Most recently updated manga first
/// (`m.updated_at DESC`).
///
/// `search` is a case-insensitive *substring* filter on the title. The
/// term is trimmed; an empty term disables the filter. The LIKE
/// metacharacters `%`, `_` and `\` in the term are escaped so they match
/// literally instead of acting as wildcards.
///
/// Returns the requested page (`limit`/`offset`) together with the total
/// number of rows matching the same filter, ignoring pagination.
pub async fn list_missing_cover_mangas(
    pool: &PgPool,
    search: Option<&str>,
    limit: i64,
    offset: i64,
) -> sqlx::Result<(Vec<MissingCoverRow>, i64)> {
    // Build `%<term>%` with the term's own LIKE metacharacters escaped.
    // Backslash is PostgreSQL's default LIKE/ILIKE escape character, so
    // no explicit ESCAPE clause is needed in the queries below.
    let search_pat: Option<String> = search
        .map(str::trim)
        .filter(|term| !term.is_empty())
        .map(|term| {
            let mut pat = String::with_capacity(term.len() + 2);
            pat.push('%');
            for ch in term.chars() {
                if matches!(ch, '\\' | '%' | '_') {
                    pat.push('\\');
                }
                pat.push(ch);
            }
            pat.push('%');
            pat
        });

    let items: Vec<MissingCoverRow> = sqlx::query_as(
        r#"
        SELECT m.id AS manga_id, m.title AS manga_title
        FROM mangas m
        WHERE m.cover_image_path IS NULL
        AND EXISTS (
        SELECT 1 FROM manga_sources ms
        WHERE ms.manga_id = m.id AND ms.dropped_at IS NULL
        )
        AND ($1::text IS NULL OR m.title ILIKE $1)
        ORDER BY m.updated_at DESC
        LIMIT $2 OFFSET $3
        "#,
    )
    .bind(&search_pat)
    .bind(limit)
    .bind(offset)
    .fetch_all(pool)
    .await?;

    // Same filter as above, minus ordering and pagination.
    let total: i64 = sqlx::query_scalar(
        r#"
        SELECT COUNT(*) FROM mangas m
        WHERE m.cover_image_path IS NULL
        AND EXISTS (
        SELECT 1 FROM manga_sources ms
        WHERE ms.manga_id = m.id AND ms.dropped_at IS NULL
        )
        AND ($1::text IS NULL OR m.title ILIKE $1)
        "#,
    )
    .bind(&search_pat)
    .fetch_one(pool)
    .await?;

    Ok((items, total))
}
/// Scope of a dead-job requeue.
///
/// Consumed by `requeue_dead_jobs`, which turns each variant into an
/// extra filter on the `dead` jobs it considers. The `Uuid` carried by a
/// variant (if any) is bound as that query's single parameter.
#[derive(Debug, Clone)]
pub enum RequeueScope {
/// Every dead job.
All,
/// Dead jobs whose chapter belongs to this manga.
Manga(Uuid),
/// Dead jobs for a single chapter.
Chapter(Uuid),
/// A single dead job by its id.
Job(Uuid),
}
/// Requeue dead jobs back to `pending` with a fresh attempt budget. This is
/// an explicit operator override, so it bypasses the dead-letter quarantine
/// the enqueue helpers honour (we act directly on the row). Returns the
/// number of rows requeued.
///
/// Two invariants protect the partial unique dedup index
/// `crawler_jobs_chapter_content_dedup_idx` (one `pending|running`
/// sync_chapter_content job per chapter):
/// 1. A chapter that already has a live (`pending|running`) job is
/// skipped entirely (`NO_LIVE_DUP`).
/// 2. When a chapter has *multiple* dead jobs, only the newest is
/// revived (`DISTINCT ON` the chapter key) — without this, flipping
/// two dead rows for the same chapter to `pending` in one statement
/// would violate the index and abort the whole requeue. Non-chapter
/// jobs fall back to their row id so each stays distinct.
pub async fn requeue_dead_jobs(pool: &PgPool, scope: RequeueScope) -> sqlx::Result<u64> {
// Scope predicate spliced into the `pick` CTE. Only compile-time
// literals are interpolated; all values are bound below.
let scope_pred: &str = match scope {
RequeueScope::All => "",
RequeueScope::Manga(_) => {
"AND (cj.payload->>'chapter_id')::uuid IN \
(SELECT id FROM chapters WHERE manga_id = $1)"
}
RequeueScope::Chapter(_) => "AND (cj.payload->>'chapter_id')::uuid = $1",
RequeueScope::Job(_) => "AND cj.id = $1",
};
let sql = format!(
r#"
WITH pick AS (
SELECT DISTINCT ON (COALESCE(cj.payload->>'chapter_id', cj.id::text)) cj.id
FROM crawler_jobs cj
WHERE cj.state = 'dead'
{scope_pred}
AND NOT EXISTS (
SELECT 1 FROM crawler_jobs live
WHERE live.payload->>'kind' = 'sync_chapter_content'
AND live.payload->>'chapter_id' = cj.payload->>'chapter_id'
AND live.state IN ('pending','running')
)
ORDER BY COALESCE(cj.payload->>'chapter_id', cj.id::text), cj.updated_at DESC
)
UPDATE crawler_jobs
SET state = 'pending', attempts = 0, leased_until = NULL,
last_error = NULL, scheduled_at = now(), updated_at = now()
FROM pick
WHERE crawler_jobs.id = pick.id
"#
);
let mut q = sqlx::query(&sql);
match scope {
RequeueScope::All => {}
RequeueScope::Manga(id) | RequeueScope::Chapter(id) | RequeueScope::Job(id) => {
q = q.bind(id);
}
}
Ok(q.execute(pool).await?.rows_affected())
}
/// Tally `crawler_jobs` rows per state for the dashboard queue gauges.
///
/// Returns `(pending, running, dead)`. A state with no rows reads as
/// `0`; any other state value present in the table is ignored.
pub async fn job_state_counts(pool: &PgPool) -> sqlx::Result<(i64, i64, i64)> {
    let per_state: Vec<(String, i64)> =
        sqlx::query_as("SELECT state, COUNT(*) FROM crawler_jobs GROUP BY state")
            .fetch_all(pool)
            .await?;

    let counts = per_state.into_iter().fold(
        (0_i64, 0_i64, 0_i64),
        |(pending, running, dead), (state, n)| match state.as_str() {
            "pending" => (n, running, dead),
            "running" => (pending, n, dead),
            "dead" => (pending, running, n),
            _ => (pending, running, dead),
        },
    );
    Ok(counts)
}

View File

@@ -61,11 +61,6 @@ pub async fn load_for_mangas(
/// FK constraint would reject them, so we filter upstream rather than
/// surface a 500 here. (The API layer validates the set against
/// `list_all` first.)
///
/// Note: `crawler::repo::sync_genres` does a similar replace, but by
/// *name* and with auto-create of unseen genres — the crawler can't
/// validate against the curated vocabulary on its own. Both paths are
/// intentional; don't merge them without preserving that semantic.
pub async fn set_for_manga(
conn: &mut PgConnection,
manga_id: Uuid,

View File

@@ -281,17 +281,3 @@ pub async fn exists(pool: &PgPool, id: Uuid) -> AppResult<bool> {
.await?;
Ok(exists)
}
/// Look up which user uploaded a manga.
///
/// Yields `None` in two indistinguishable situations: no manga has this
/// `id`, or the row's `uploaded_by` is NULL (rows that predate the
/// column — see migration 0011). Callers must therefore establish that
/// the manga exists via [`exists`] before basing an authz decision on
/// the result.
pub async fn uploaded_by(pool: &PgPool, id: Uuid) -> AppResult<Option<Uuid>> {
    // Outer `Option`: was a row found? Inner `Option`: is the column NULL?
    let uploader: Option<Option<Uuid>> =
        sqlx::query_scalar("SELECT uploaded_by FROM mangas WHERE id = $1")
            .bind(id)
            .fetch_optional(pool)
            .await?;
    Ok(uploader.flatten())
}

View File

@@ -1,5 +1,3 @@
pub mod admin_audit;
pub mod admin_view;
pub mod api_token;
pub mod author;
pub mod bookmark;

View File

@@ -11,7 +11,7 @@ pub async fn create(pool: &PgPool, username: &str, password_hash: &str) -> AppRe
r#"
INSERT INTO users (username, password_hash)
VALUES ($1, $2)
RETURNING id, username, password_hash, created_at, is_admin
RETURNING id, username, password_hash, created_at
"#,
)
.bind(username)
@@ -21,7 +21,7 @@ pub async fn create(pool: &PgPool, username: &str, password_hash: &str) -> AppRe
match result {
Ok(user) => Ok(user),
Err(sqlx::Error::Database(ref db_err)) if db_err.is_unique_violation() => {
Err(e) if is_unique_violation(&e) => {
Err(AppError::Conflict("username is already taken".into()))
}
Err(e) => Err(AppError::Database(e)),
@@ -35,7 +35,7 @@ pub async fn create(pool: &PgPool, username: &str, password_hash: &str) -> AppRe
pub async fn find_by_username(pool: &PgPool, username: &str) -> AppResult<Option<User>> {
let row = sqlx::query_as::<_, User>(
r#"
SELECT id, username, password_hash, created_at, is_admin
SELECT id, username, password_hash, created_at
FROM users
WHERE lower(username) = lower($1)
"#,
@@ -48,7 +48,7 @@ pub async fn find_by_username(pool: &PgPool, username: &str) -> AppResult<Option
pub async fn find_by_id(pool: &PgPool, id: Uuid) -> AppResult<Option<User>> {
let row = sqlx::query_as::<_, User>(
r#"SELECT id, username, password_hash, created_at, is_admin FROM users WHERE id = $1"#,
r#"SELECT id, username, password_hash, created_at FROM users WHERE id = $1"#,
)
.bind(id)
.fetch_optional(pool)
@@ -56,317 +56,10 @@ pub async fn find_by_id(pool: &PgPool, id: Uuid) -> AppResult<Option<User>> {
Ok(row)
}
/// Postgres advisory-lock key that serialises every operation able to
/// change the number of admins (demote, delete-admin).
///
/// Without it, two concurrent demotes of different admins could each
/// pass their "more than one admin remains" check and then both commit,
/// leaving zero admins. Taking this lock first makes the recount done
/// under the lock authoritative.
///
/// The value is the ASCII bytes of "admininv" read as a big-endian i64
/// (`0x61_64_6d_69_6e_69_6e_76`). Postgres' advisory-lock keyspace is
/// global; collision risk with `CRON_LOCK_KEY` and friends is ~2^-64.
pub const ADMIN_INVARIANT_LOCK_KEY: i64 = i64::from_be_bytes(*b"admininv");
/// Filter + pagination parameters for `list_with_total`.
#[derive(Debug, Default)]
pub struct ListUsersQuery {
/// Optional username search term; `None` lists every user.
pub search: Option<String>,
/// Bound as the query's `LIMIT`. Note the derived `Default` is `0`.
pub limit: i64,
/// Bound as the query's `OFFSET`.
pub offset: i64,
}
/// Paginated user list with total count. `search` is a case-insensitive
/// substring match on `username`. Order is alphabetical by username so
/// pagination is stable across concurrent writes (users changing
/// is_admin doesn't reshuffle the page).
///
/// The search term is trimmed; an empty term disables the filter. The
/// LIKE metacharacters `%`, `_` and `\` in the term are escaped so they
/// match literally instead of acting as wildcards.
///
/// Returns the requested page (`q.limit`/`q.offset`) together with the
/// total number of users matching the same filter, ignoring pagination.
pub async fn list_with_total(
    pool: &PgPool,
    q: &ListUsersQuery,
) -> AppResult<(Vec<User>, i64)> {
    // Build `%<term>%` with the term's own LIKE metacharacters escaped.
    // Backslash is PostgreSQL's default LIKE/ILIKE escape character, so
    // no explicit ESCAPE clause is needed in the queries below.
    let pat: Option<String> = q
        .search
        .as_deref()
        .map(str::trim)
        .filter(|term| !term.is_empty())
        .map(|term| {
            let mut escaped = String::with_capacity(term.len() + 2);
            escaped.push('%');
            for ch in term.chars() {
                if matches!(ch, '\\' | '%' | '_') {
                    escaped.push('\\');
                }
                escaped.push(ch);
            }
            escaped.push('%');
            escaped
        });

    let items = sqlx::query_as::<_, User>(
        r#"
        SELECT id, username, password_hash, created_at, is_admin
        FROM users
        WHERE ($1::text IS NULL OR username ILIKE $1)
        ORDER BY username
        LIMIT $2 OFFSET $3
        "#,
    )
    .bind(&pat)
    .bind(q.limit)
    .bind(q.offset)
    .fetch_all(pool)
    .await?;

    // Same filter as above, minus ordering and pagination.
    let total: i64 = sqlx::query_scalar(
        "SELECT COUNT(*) FROM users WHERE ($1::text IS NULL OR username ILIKE $1)",
    )
    .bind(&pat)
    .fetch_one(pool)
    .await?;

    Ok((items, total))
}
/// Write `users.is_admin` directly: no safety checks, no audit log, no
/// advisory lock.
///
/// Test-setup helper for the admin-feature integration suite only.
/// Production code MUST use [`admin_safe_set_is_admin`] instead, which
/// enforces self-protection, the last-admin invariant, and the audit
/// log atomically. An `id` that matches no user is not an error here;
/// the UPDATE simply affects zero rows.
pub async fn set_is_admin_unchecked(pool: &PgPool, id: Uuid, value: bool) -> AppResult<()> {
    let update = sqlx::query("UPDATE users SET is_admin = $1 WHERE id = $2")
        .bind(value)
        .bind(id);
    update.execute(pool).await?;
    Ok(())
}
/// Ensure the user `username` exists and is an admin. Called at startup
/// from `app::build` when `ADMIN_USERNAME` / `ADMIN_PASSWORD` are set.
///
/// Semantics — see cross-cutting decision #2 in the feature plan:
/// - If no row exists: create with the env-supplied password hashed via
/// argon2id and `is_admin = true`.
/// - If a row already exists: flip `is_admin` to true if needed; **never**
/// touch the existing `password_hash`. Lets the operator rotate the
/// admin password through the UI without env-var conflict.
/// Wrapped in a transaction so a concurrent `register` for the same
/// username can't slip an INSERT between the SELECT and UPDATE/INSERT.
/// Set `is_admin` on a user with full safety checks: rejects self-demote,
/// rejects demoting the only remaining admin (under `ADMIN_INVARIANT_LOCK_KEY`
/// to close the parallel-demote race), and writes an `admin_audit` row
/// in the same tx so the log mirrors what actually committed.
///
/// Returns the freshly-written user row (so the handler can return it
/// without a second SELECT).
pub async fn admin_safe_set_is_admin(
pool: &PgPool,
actor_id: Uuid,
target_id: Uuid,
value: bool,
) -> AppResult<User> {
// Cheap pre-check before opening a tx — also covers the "demote me"
// case which would otherwise pass the recount when other admins exist.
if actor_id == target_id && !value {
return Err(AppError::Conflict(
"cannot demote yourself; ask another admin".into(),
));
// NOTE(review): the `is_unique_violation` helper below appears to be
// interleaved into this function by the diff rendering (it belongs to the
// other side of the comparison) — confirm against the real file.
// It reports whether a sqlx error is a database error with SQLSTATE 23505.
fn is_unique_violation(err: &sqlx::Error) -> bool {
if let sqlx::Error::Database(db_err) = err {
db_err.code().as_deref() == Some("23505")
} else {
false
}
let mut tx = pool.begin().await?;
// Take the transaction-scoped advisory lock keyed by
// `ADMIN_INVARIANT_LOCK_KEY` before reading, so the admin recount further
// down runs while this transaction holds the lock.
sqlx::query("SELECT pg_advisory_xact_lock($1)")
.bind(ADMIN_INVARIANT_LOCK_KEY)
.execute(&mut *tx)
.await?;
// Load the target row with FOR UPDATE; an unknown id maps to NotFound.
let target: Option<User> = sqlx::query_as(
"SELECT id, username, password_hash, created_at, is_admin \
FROM users WHERE id = $1 FOR UPDATE",
)
.bind(target_id)
.fetch_optional(&mut *tx)
.await?;
let Some(target) = target else {
return Err(AppError::NotFound);
};
// No-op: caller asked to set `is_admin` to its current value. Return
// the row as-is without writing an audit entry — otherwise repeated
// PATCH calls (browser retry, double-click) pile misleading
// "promote_user" rows in `admin_audit` for actions that changed
// nothing.
if target.is_admin == value {
tx.commit().await?;
return Ok(target);
}
// Recount inside the lock — this is the authoritative read.
// Only a demotion (admin -> non-admin) can break the invariant, so the
// count is skipped for promotions.
if target.is_admin && !value {
let admin_count: i64 =
sqlx::query_scalar("SELECT COUNT(*) FROM users WHERE is_admin = true")
.fetch_one(&mut *tx)
.await?;
if admin_count <= 1 {
return Err(AppError::Conflict(
"cannot demote the last admin; promote another user first".into(),
));
}
}
// Apply the change and read the updated row back through RETURNING.
let updated: User = sqlx::query_as(
"UPDATE users SET is_admin = $1 WHERE id = $2 \
RETURNING id, username, password_hash, created_at, is_admin",
)
.bind(value)
.bind(target_id)
.fetch_one(&mut *tx)
.await?;
// The audit row goes through the same `tx` as the UPDATE; its payload
// carries the username read before the update.
let action = if value { "promote_user" } else { "demote_user" };
crate::repo::admin_audit::insert(
&mut *tx,
actor_id,
action,
"user",
Some(target_id),
serde_json::json!({ "username": target.username }),
)
.await?;
tx.commit().await?;
Ok(updated)
}
/// Removes the user `target_id` on behalf of `actor_id`.
///
/// Refuses with `Conflict` when the actor targets their own account or when
/// the target is an admin and at most one admin row exists; returns
/// `NotFound` when no such user exists. The lock on
/// `ADMIN_INVARIANT_LOCK_KEY`, the row read, the DELETE and the
/// `admin_audit` insert all go through one transaction. The audit payload
/// records the username and prior admin flag read before the DELETE.
pub async fn admin_safe_delete(
    pool: &PgPool,
    actor_id: Uuid,
    target_id: Uuid,
) -> AppResult<()> {
    // Self-deletion is rejected before any database work happens.
    if target_id == actor_id {
        return Err(AppError::Conflict(
            "cannot delete yourself; ask another admin".into(),
        ));
    }

    let mut tx = pool.begin().await?;

    // Transaction-scoped advisory lock, taken before the admin recount.
    sqlx::query("SELECT pg_advisory_xact_lock($1)")
        .bind(ADMIN_INVARIANT_LOCK_KEY)
        .execute(&mut *tx)
        .await?;

    let victim: User = sqlx::query_as::<_, User>(
        "SELECT id, username, password_hash, created_at, is_admin \
         FROM users WHERE id = $1 FOR UPDATE",
    )
    .bind(target_id)
    .fetch_optional(&mut *tx)
    .await?
    .ok_or(AppError::NotFound)?;

    // Deleting a non-admin cannot break the invariant, so only count admins
    // when the target is one.
    if victim.is_admin {
        let admins_now: i64 =
            sqlx::query_scalar("SELECT COUNT(*) FROM users WHERE is_admin = true")
                .fetch_one(&mut *tx)
                .await?;
        if admins_now <= 1 {
            return Err(AppError::Conflict(
                "cannot delete the last admin; promote another user first".into(),
            ));
        }
    }

    sqlx::query("DELETE FROM users WHERE id = $1")
        .bind(target_id)
        .execute(&mut *tx)
        .await?;

    let details = serde_json::json!({
        "username": victim.username,
        "was_admin": victim.is_admin,
    });
    crate::repo::admin_audit::insert(
        &mut *tx,
        actor_id,
        "delete_user",
        "user",
        Some(target_id),
        details,
    )
    .await?;

    tx.commit().await?;
    Ok(())
}
/// Admin-initiated user creation. Wraps the INSERT + audit row in a
/// single transaction so a rolled-back create never leaves an orphan
/// audit entry. Caller (HTTP handler) is responsible for validating
/// `username`/`password` and hashing — this fn assumes both are
/// already vetted by the same `validate_*` rules used by self-
/// registration.
pub async fn admin_create_user(
pool: &PgPool,
actor_id: Uuid,
username: &str,
password_hash: &str,
is_admin: bool,
) -> AppResult<User> {
let mut tx = pool.begin().await?;
let user: User = match sqlx::query_as::<_, User>(
"INSERT INTO users (username, password_hash, is_admin) VALUES ($1, $2, $3) \
RETURNING id, username, password_hash, created_at, is_admin",
)
.bind(username)
.bind(password_hash)
.bind(is_admin)
.fetch_one(&mut *tx)
.await
{
Ok(u) => u,
Err(sqlx::Error::Database(ref db_err)) if db_err.is_unique_violation() => {
return Err(AppError::Conflict("username is already taken".into()));
}
Err(e) => return Err(AppError::Database(e)),
};
crate::repo::admin_audit::insert(
&mut *tx,
actor_id,
"create_user",
"user",
Some(user.id),
serde_json::json!({
"username": user.username,
"is_admin": user.is_admin,
}),
)
.await?;
tx.commit().await?;
Ok(user)
}
/// Makes sure a user named `username` exists with `is_admin = true`.
///
/// The lookup compares usernames case-insensitively and locks a matching
/// row with `FOR UPDATE`. An existing row only has `is_admin` flipped to
/// true (when it was false) and keeps its stored password hash. When no
/// row matches, a new admin is inserted with `password` hashed via
/// `crate::auth::password::hash_password`.
pub async fn bootstrap_admin(
    pool: &PgPool,
    username: &str,
    password: &str,
) -> AppResult<()> {
    let mut tx = pool.begin().await?;

    let found: Option<(Uuid,)> = sqlx::query_as(
        "SELECT id FROM users WHERE lower(username) = lower($1) FOR UPDATE",
    )
    .bind(username)
    .fetch_optional(&mut *tx)
    .await?;

    if let Some((user_id,)) = found {
        // Existing account: promote only, never rewrite credentials.
        sqlx::query("UPDATE users SET is_admin = true WHERE id = $1 AND is_admin = false")
            .bind(user_id)
            .execute(&mut *tx)
            .await?;
    } else {
        // No such account: create it as an admin with a freshly hashed password.
        let password_hash = crate::auth::password::hash_password(password)?;
        sqlx::query("INSERT INTO users (username, password_hash, is_admin) VALUES ($1, $2, true)")
            .bind(username)
            .bind(&password_hash)
            .execute(&mut *tx)
            .await?;
    }

    tx.commit().await?;
    Ok(())
}

View File

@@ -16,13 +16,6 @@ impl LocalStorage {
}
fn resolve(&self, key: &str) -> Result<PathBuf, StorageError> {
// NUL bytes are rejected by the Linux syscall layer, but the
// error surfaces as an opaque IO failure rather than the
// explicit `BadKey` the rest of the contract uses. Catch it
// here so the error path is consistent.
if key.contains('\0') {
return Err(StorageError::BadKey);
}
let key = key.trim_start_matches('/');
if key.is_empty() {
return Err(StorageError::BadKey);
@@ -86,10 +79,6 @@ impl Storage for LocalStorage {
let path: &Path = &self.resolve(key)?;
Ok(fs::try_exists(path).await?)
}
/// Returns this backend's `root` path; always `Some` for `LocalStorage`.
fn local_root(&self) -> Option<&Path> {
Some(&self.root)
}
}
#[cfg(test)]
@@ -125,9 +114,6 @@ mod tests {
assert!(matches!(s.get(".").await, Err(StorageError::BadKey)));
// Empty segment via doubled slash.
assert!(matches!(s.get("a//b").await, Err(StorageError::BadKey)));
// NUL byte (rejected explicitly so callers see BadKey rather
// than an opaque IO error from the kernel).
assert!(matches!(s.put("a\0b", b"x").await, Err(StorageError::BadKey)));
}
#[tokio::test]

View File

@@ -9,8 +9,6 @@ mod local;
use std::io;
use std::pin::Pin;
use std::path::Path;
use async_trait::async_trait;
use bytes::Bytes;
use futures_core::Stream;
@@ -46,13 +44,4 @@ pub trait Storage: Send + Sync {
async fn get_stream(&self, key: &str) -> Result<StreamingFile, StorageError>;
async fn delete(&self, key: &str) -> Result<(), StorageError>;
async fn exists(&self, key: &str) -> Result<bool, StorageError>;
/// Filesystem path the backend is rooted at, when introspectable.
/// Returns `None` for backends that aren't a local filesystem (e.g.
/// a future `S3Storage`). The admin system endpoint uses this to
/// statvfs the data dir; backends that return `None` get a `disk:
/// null` payload instead of fabricated numbers.
fn local_root(&self) -> Option<&Path> {
// Default implementation: implementors that do not override this
// report no local path.
None
}
}

View File

@@ -1,344 +0,0 @@
//! Integration tests for the admin crawler observability/control API.
//!
//! The default test harness wires `AppState.crawler = None` (no daemon),
//! so the *control* endpoints return 503 and the *read* endpoints that
//! work off the DB (status shell, dead-jobs list/requeue) still function.
//! This is exactly the production "daemon disabled" posture.
mod common;
use std::time::Duration;
use axum::http::StatusCode;
use axum::Router;
use http_body_util::BodyExt;
use serde_json::json;
use sqlx::PgPool;
use tower::ServiceExt;
use uuid::Uuid;
use common::{body_json, get, get_with_cookie, post_json_with_cookie, register_user, harness};
/// Registers a user through the app, flips that account to admin directly
/// in the database, and hands back the cookie returned by registration.
async fn seed_admin(pool: &PgPool, app: &Router) -> String {
    let (username, cookie) = register_user(app).await;

    let account = mangalord::repo::user::find_by_username(pool, &username)
        .await
        .unwrap()
        .unwrap();
    let promotion = mangalord::repo::user::set_is_admin_unchecked(pool, account.id, true).await;
    promotion.unwrap();

    cookie
}
/// Inserts a manga named `title`, chapter number 1 under it, and a
/// `crawler_jobs` row in state `'dead'` (5 attempts, last error `'boom'`)
/// whose payload points at that chapter. Returns the id of the job row.
async fn seed_dead_job(pool: &PgPool, title: &str) -> Uuid {
    let manga_id = Uuid::new_v4();
    let chapter_id = Uuid::new_v4();
    let job_id = Uuid::new_v4();

    let manga_row = sqlx::query("INSERT INTO mangas (id, title) VALUES ($1, $2)")
        .bind(manga_id)
        .bind(title);
    manga_row.execute(pool).await.unwrap();

    let chapter_row = sqlx::query("INSERT INTO chapters (id, manga_id, number) VALUES ($1, $2, 1)")
        .bind(chapter_id)
        .bind(manga_id);
    chapter_row.execute(pool).await.unwrap();

    let payload = json!({
        "kind": "sync_chapter_content",
        "source_id": "target",
        "chapter_id": chapter_id,
        "source_chapter_key": "k",
    });
    sqlx::query(
        "INSERT INTO crawler_jobs (id, payload, state, attempts, last_error) \
         VALUES ($1, $2, 'dead', 5, 'boom')",
    )
    .bind(job_id)
    .bind(payload)
    .execute(pool)
    .await
    .unwrap();

    job_id
}
/// Inserts a manga named `title`, chapter number 1 under it, and a
/// `sync_chapter_content` job for that chapter in the given `state`
/// (the tests here use 'pending' and 'running').
async fn seed_job(pool: &PgPool, title: &str, state: &str) {
    let manga_id = Uuid::new_v4();
    let chapter_id = Uuid::new_v4();

    let manga_row = sqlx::query("INSERT INTO mangas (id, title) VALUES ($1, $2)")
        .bind(manga_id)
        .bind(title);
    manga_row.execute(pool).await.unwrap();

    let chapter_row = sqlx::query("INSERT INTO chapters (id, manga_id, number) VALUES ($1, $2, 1)")
        .bind(chapter_id)
        .bind(manga_id);
    chapter_row.execute(pool).await.unwrap();

    let payload = json!({
        "kind": "sync_chapter_content",
        "source_id": "target",
        "chapter_id": chapter_id,
        "source_chapter_key": "k",
    });
    sqlx::query("INSERT INTO crawler_jobs (id, payload, state) VALUES ($1, $2, $3)")
        .bind(Uuid::new_v4())
        .bind(payload)
        .bind(state)
        .execute(pool)
        .await
        .unwrap();
}
/// Inserts a manga named `title` with a NULL `cover_image_path`, makes sure
/// the `'target'` source row exists, and links the manga to that source
/// through a `manga_sources` row keyed `k-<manga_id>`.
async fn seed_missing_cover(pool: &PgPool, title: &str) {
    let manga_id = Uuid::new_v4();
    let source_manga_key = format!("k-{manga_id}");

    sqlx::query("INSERT INTO mangas (id, title, cover_image_path) VALUES ($1, $2, NULL)")
        .bind(manga_id)
        .bind(title)
        .execute(pool)
        .await
        .unwrap();

    // Idempotent: several seeded mangas may share the same source row.
    let ensure_source = sqlx::query("INSERT INTO sources (id, name, base_url) VALUES ('target','T','http://x') ON CONFLICT DO NOTHING");
    ensure_source.execute(pool).await.unwrap();

    sqlx::query(
        "INSERT INTO manga_sources (source_id, source_manga_key, manga_id, source_url) \
         VALUES ('target', $1, $2, 'http://x/m')",
    )
    .bind(source_manga_key)
    .bind(manga_id)
    .execute(pool)
    .await
    .unwrap();
}
/// With one pending job, one running job and one cover-less manga seeded,
/// the active-jobs list reports two entries, the covers list reports the
/// single manga, and a non-admin gets 403 on active-jobs.
#[sqlx::test(migrations = "./migrations")]
async fn active_jobs_and_covers_lists_over_http(pool: PgPool) {
    seed_job(&pool, "Naruto", "pending").await;
    seed_job(&pool, "Bleach", "running").await;
    seed_missing_cover(&pool, "One Piece").await;

    let ctx = harness(pool.clone());
    let admin_cookie = seed_admin(&pool, &ctx.app).await;

    // Queued/active chapters.
    let jobs_request = get_with_cookie("/api/v1/admin/crawler/active-jobs", &admin_cookie);
    let jobs_response = ctx.app.clone().oneshot(jobs_request).await.unwrap();
    assert_eq!(jobs_response.status(), StatusCode::OK);
    let jobs = body_json(jobs_response).await;
    assert_eq!(jobs["page"]["total"], 2);

    // Queued covers.
    let covers_request = get_with_cookie("/api/v1/admin/crawler/covers", &admin_cookie);
    let covers_response = ctx.app.clone().oneshot(covers_request).await.unwrap();
    assert_eq!(covers_response.status(), StatusCode::OK);
    let covers = body_json(covers_response).await;
    assert_eq!(covers["page"]["total"], 1);
    assert_eq!(covers["items"][0]["manga_title"], "One Piece");

    // Both are admin-gated.
    let (_username, plain_cookie) = register_user(&ctx.app).await;
    let forbidden_request = get_with_cookie("/api/v1/admin/crawler/active-jobs", &plain_cookie);
    let forbidden = ctx.app.clone().oneshot(forbidden_request).await.unwrap();
    assert_eq!(forbidden.status(), StatusCode::FORBIDDEN);
}
/// The crawler status endpoint answers 401 without a cookie and 403 for a
/// logged-in user who is not an admin.
#[sqlx::test(migrations = "./migrations")]
async fn get_status_requires_admin(pool: PgPool) {
    let ctx = harness(pool);

    // Unauthenticated → 401.
    let anonymous = ctx
        .app
        .clone()
        .oneshot(get("/api/v1/admin/crawler"))
        .await
        .unwrap();
    assert_eq!(anonymous.status(), StatusCode::UNAUTHORIZED);

    // Authenticated non-admin → 403.
    let (_username, cookie) = register_user(&ctx.app).await;
    let request = get_with_cookie("/api/v1/admin/crawler", &cookie);
    let non_admin = ctx.app.clone().oneshot(request).await.unwrap();
    assert_eq!(non_admin.status(), StatusCode::FORBIDDEN);
}
/// With one dead job seeded and the harness app, the status payload shows
/// the daemon as "disabled", the browser as "down", and a dead-queue count
/// of 1.
#[sqlx::test(migrations = "./migrations")]
async fn get_status_reports_disabled_daemon_with_queue_counts(pool: PgPool) {
    seed_dead_job(&pool, "Naruto").await;
    let ctx = harness(pool.clone());
    let cookie = seed_admin(&pool, &ctx.app).await;

    let request = get_with_cookie("/api/v1/admin/crawler", &cookie);
    let response = ctx.app.clone().oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let status = body_json(response).await;
    assert_eq!(status["daemon"], "disabled");
    assert_eq!(status["queue"]["dead"], 1);
    assert_eq!(status["browser"], "down");
}
/// Each daemon control endpoint answers 503 to an admin POST when served
/// by the harness app.
#[sqlx::test(migrations = "./migrations")]
async fn control_endpoints_return_503_when_daemon_disabled(pool: PgPool) {
    let ctx = harness(pool.clone());
    let cookie = seed_admin(&pool, &ctx.app).await;

    let control_uris = [
        "/api/v1/admin/crawler/run",
        "/api/v1/admin/crawler/browser/restart",
        "/api/v1/admin/crawler/session/clear-expired",
    ];
    for uri in control_uris {
        let request = post_json_with_cookie(uri, json!({}), &cookie);
        let response = ctx.app.clone().oneshot(request).await.unwrap();
        assert_eq!(
            response.status(),
            StatusCode::SERVICE_UNAVAILABLE,
            "{uri} should be 503 when daemon disabled"
        );
    }
}
/// The status stream endpoint answers 401 without a cookie and 403 for a
/// logged-in user who is not an admin.
#[sqlx::test(migrations = "./migrations")]
async fn status_stream_requires_admin(pool: PgPool) {
    let ctx = harness(pool);

    // Unauthenticated → 401.
    let anonymous_request = get("/api/v1/admin/crawler/stream");
    let anonymous = ctx.app.clone().oneshot(anonymous_request).await.unwrap();
    assert_eq!(anonymous.status(), StatusCode::UNAUTHORIZED);

    // Non-admin → 403.
    let (_username, cookie) = register_user(&ctx.app).await;
    let non_admin_request = get_with_cookie("/api/v1/admin/crawler/stream", &cookie);
    let non_admin = ctx.app.clone().oneshot(non_admin_request).await.unwrap();
    assert_eq!(non_admin.status(), StatusCode::FORBIDDEN);
}
/// An admin GET on the status stream answers 200 with a
/// `text/event-stream` content type, and a payload containing `"daemon"`
/// arrives within five seconds.
#[sqlx::test(migrations = "./migrations")]
async fn status_stream_emits_initial_event(pool: PgPool) {
let h = harness(pool.clone());
let cookie = seed_admin(&pool, &h.app).await;
let resp = h
.app
.clone()
.oneshot(get_with_cookie("/api/v1/admin/crawler/stream", &cookie))
.await
.unwrap();
assert_eq!(resp.status(), StatusCode::OK);
// A missing or non-UTF-8 header falls back to "", which then fails the
// prefix assertion below with the observed value in the message.
let ct = resp
.headers()
.get(axum::http::header::CONTENT_TYPE)
.and_then(|v| v.to_str().ok())
.unwrap_or_default()
.to_string();
assert!(ct.starts_with("text/event-stream"), "content-type was {ct:?}");
// Accumulate frames (the immediate snapshot may arrive split across
// frames) until the status payload appears, with an overall timeout so
// the never-ending stream can't hang the test.
let mut body = resp.into_body();
let mut acc = String::new();
// `deadline` holds the timeout's Result: Ok when the loop exited on its
// own, Err when five seconds elapsed first.
let deadline = tokio::time::timeout(Duration::from_secs(5), async {
loop {
// End of body: stop reading; the assertions below decide the outcome.
let Some(frame) = body.frame().await else { break };
// Frames that are not data frames are skipped.
if let Ok(data) = frame.expect("frame ok").into_data() {
acc.push_str(&String::from_utf8_lossy(&data));
if acc.contains("\"daemon\"") {
break;
}
}
}
})
.await;
assert!(deadline.is_ok(), "did not receive status within 5s; got: {acc:?}");
assert!(acc.contains("\"daemon\""), "missing status payload: {acc}");
assert!(acc.contains("status"), "missing SSE event name: {acc}");
}
/// A logged-in user without admin rights gets 403 from the dead-job
/// requeue endpoint.
#[sqlx::test(migrations = "./migrations")]
async fn mutating_endpoints_reject_non_admin(pool: PgPool) {
    let ctx = harness(pool);

    // A logged-in non-admin must be forbidden from a mutating endpoint.
    let (_username, cookie) = register_user(&ctx.app).await;
    let request = post_json_with_cookie(
        "/api/v1/admin/crawler/dead-jobs/requeue",
        json!({ "scope": "all" }),
        &cookie,
    );
    let response = ctx.app.clone().oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::FORBIDDEN);
}
/// A seeded dead job shows up in the dead-jobs list, can be requeued by
/// id, and ends up with state `pending` in `crawler_jobs`.
#[sqlx::test(migrations = "./migrations")]
async fn dead_jobs_list_and_requeue_over_http(pool: PgPool) {
    let job_id = seed_dead_job(&pool, "Bleach").await;
    let ctx = harness(pool.clone());
    let cookie = seed_admin(&pool, &ctx.app).await;

    // List.
    let list_request = get_with_cookie("/api/v1/admin/crawler/dead-jobs", &cookie);
    let list_response = ctx.app.clone().oneshot(list_request).await.unwrap();
    assert_eq!(list_response.status(), StatusCode::OK);
    let listing = body_json(list_response).await;
    assert_eq!(listing["page"]["total"], 1);
    assert_eq!(listing["items"][0]["manga_title"], "Bleach");

    // Requeue the single job.
    let requeue_request = post_json_with_cookie(
        "/api/v1/admin/crawler/dead-jobs/requeue",
        json!({ "scope": "job", "job_id": job_id }),
        &cookie,
    );
    let requeue_response = ctx.app.clone().oneshot(requeue_request).await.unwrap();
    assert_eq!(requeue_response.status(), StatusCode::OK);
    let outcome = body_json(requeue_response).await;
    assert_eq!(outcome["requeued"], 1);

    // The row itself must have moved back to the pending state.
    let stored_state: String =
        sqlx::query_scalar("SELECT state FROM crawler_jobs WHERE id = $1")
            .bind(job_id)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(stored_state, "pending");
}

View File

@@ -1,548 +0,0 @@
//! PR 3 (feat/admin-mangas-api) integration tests.
//!
//! Per-variant fixture tests for the derived sync-state SQL plus
//! happy-path E2E for the two admin endpoints. Auth-gate regression
//! (403/401) is covered by PR 1's `RequireAdmin` test matrix; the only
//! gate test here is one spot check per endpoint.
mod common;
use axum::http::StatusCode;
use axum::Router;
use serde_json::json;
use sqlx::PgPool;
use tower::ServiceExt;
use uuid::Uuid;
use mangalord::repo;
const SOURCE_ID: &str = "test-source";
/// Registers a user through the app, promotes the account to admin
/// directly in the database, and returns `(username, cookie)`.
async fn seed_admin(pool: &PgPool, app: &Router) -> (String, String) {
    let (username, cookie) = common::register_user(app).await;

    let account = repo::user::find_by_username(pool, &username)
        .await
        .unwrap()
        .unwrap();
    let promotion = repo::user::set_is_admin_unchecked(pool, account.id, true).await;
    promotion.unwrap();

    (username, cookie)
}
/// Ensures the `SOURCE_ID` source row exists via `repo::crawler::ensure_source`.
async fn seed_source(pool: &PgPool) {
    let ensured =
        repo::crawler::ensure_source(pool, SOURCE_ID, "Test", "https://example.test").await;
    ensured.unwrap();
}
/// Inserts an `'ongoing'` manga named `title` with no alternative titles
/// and returns the id produced by the INSERT.
async fn insert_manga(pool: &PgPool, title: &str) -> Uuid {
    let row: (Uuid,) = sqlx::query_as(
        "INSERT INTO mangas (title, status, alt_titles) VALUES ($1, 'ongoing', ARRAY[]::text[]) RETURNING id",
    )
    .bind(title)
    .fetch_one(pool)
    .await
    .unwrap();
    row.0
}
/// Links `manga_id` to the `SOURCE_ID` source under `source_manga_key`.
///
/// `dropped` selects the value of `dropped_at`: `now()` when true, NULL
/// otherwise. The flag is sent as a bound parameter and resolved by a
/// `CASE` in SQL, so the statement text is one static string instead of
/// being assembled with `format!`.
async fn insert_manga_source(
    pool: &PgPool,
    manga_id: Uuid,
    source_manga_key: &str,
    dropped: bool,
) {
    sqlx::query(
        "INSERT INTO manga_sources (source_id, source_manga_key, manga_id, source_url, dropped_at) \
         VALUES ($1, $2, $3, 'https://example.test/m', CASE WHEN $4 THEN now() ELSE NULL END)",
    )
    .bind(SOURCE_ID)
    .bind(source_manga_key)
    .bind(manga_id)
    .bind(dropped)
    .execute(pool)
    .await
    .unwrap();
}
/// Inserts an untitled chapter `number` for `manga_id` with the given
/// `page_count` and returns the id produced by the INSERT.
async fn insert_chapter(pool: &PgPool, manga_id: Uuid, number: i32, page_count: i32) -> Uuid {
    let row: (Uuid,) = sqlx::query_as(
        "INSERT INTO chapters (manga_id, number, title, page_count) VALUES ($1, $2, NULL, $3) RETURNING id",
    )
    .bind(manga_id)
    .bind(number)
    .bind(page_count)
    .fetch_one(pool)
    .await
    .unwrap();
    row.0
}
/// Links `chapter_id` to the `SOURCE_ID` source under `source_chapter_key`.
///
/// `dropped` selects the value of `dropped_at`: `now()` when true, NULL
/// otherwise. The flag is sent as a bound parameter and resolved by a
/// `CASE` in SQL, so the statement text is one static string instead of
/// being assembled with `format!`.
async fn insert_chapter_source(
    pool: &PgPool,
    chapter_id: Uuid,
    source_chapter_key: &str,
    dropped: bool,
) {
    sqlx::query(
        "INSERT INTO chapter_sources (source_id, source_chapter_key, chapter_id, source_url, dropped_at) \
         VALUES ($1, $2, $3, 'https://example.test/c', CASE WHEN $4 THEN now() ELSE NULL END)",
    )
    .bind(SOURCE_ID)
    .bind(source_chapter_key)
    .bind(chapter_id)
    .bind(dropped)
    .execute(pool)
    .await
    .unwrap();
}
/// Inserts one `crawler_jobs` row with the given JSON `payload` and `state`.
async fn insert_job(pool: &PgPool, payload: serde_json::Value, state: &str) {
    let job_row = sqlx::query("INSERT INTO crawler_jobs (payload, state) VALUES ($1, $2)")
        .bind(payload)
        .bind(state);
    job_row.execute(pool).await.unwrap();
}
/// Returns the admin chapter rows for `manga_id`, requesting a single page
/// of up to 500 rows from offset 0 and dropping the reported total — the
/// per-variant tests have no use for pagination.
async fn fetch_chapter_rows(
    pool: &PgPool,
    manga_id: Uuid,
) -> Vec<mangalord::repo::admin_view::AdminChapterRow> {
    let query = repo::admin_view::ListAdminChaptersQuery {
        manga_id,
        limit: 500,
        offset: 0,
    };
    let (rows, _total) = repo::admin_view::list_chapters_with_sync_state(pool, &query)
        .await
        .unwrap();
    rows
}
// ---- manga sync state ------------------------------------------------------
/// A manga with one non-dropped source and no jobs is listed as `Synced`.
#[sqlx::test(migrations = "./migrations")]
async fn manga_state_synced_for_fresh_source(pool: PgPool) {
    seed_source(&pool).await;
    let manga_id = insert_manga(&pool, "Synced Manga").await;
    insert_manga_source(&pool, manga_id, "smk-1", false).await;

    let query = repo::admin_view::ListAdminMangasQuery {
        limit: 50,
        ..Default::default()
    };
    let (rows, total) = repo::admin_view::list_mangas_with_sync_state(&pool, &query)
        .await
        .unwrap();

    assert_eq!(total, 1);
    assert_eq!(rows[0].id, manga_id);
    assert_eq!(rows[0].sync_state, mangalord::domain::MangaSyncState::Synced);
}
/// A manga that has no source rows at all is listed as `Synced`.
#[sqlx::test(migrations = "./migrations")]
async fn manga_state_synced_for_user_upload_without_sources(pool: PgPool) {
    let manga_id = insert_manga(&pool, "User Upload").await;

    let query = repo::admin_view::ListAdminMangasQuery {
        limit: 50,
        ..Default::default()
    };
    let (rows, _total) = repo::admin_view::list_mangas_with_sync_state(&pool, &query)
        .await
        .unwrap();

    assert_eq!(rows[0].id, manga_id);
    assert_eq!(rows[0].sync_state, mangalord::domain::MangaSyncState::Synced);
}
/// A manga whose only source row is dropped is listed as `Dropped`.
#[sqlx::test(migrations = "./migrations")]
async fn manga_state_dropped_when_all_sources_dropped(pool: PgPool) {
    seed_source(&pool).await;
    let manga_id = insert_manga(&pool, "Dropped Manga").await;
    insert_manga_source(&pool, manga_id, "smk-1", true).await;

    let query = repo::admin_view::ListAdminMangasQuery {
        limit: 50,
        ..Default::default()
    };
    let (rows, _total) = repo::admin_view::list_mangas_with_sync_state(&pool, &query)
        .await
        .unwrap();

    assert_eq!(rows[0].id, manga_id);
    assert_eq!(rows[0].sync_state, mangalord::domain::MangaSyncState::Dropped);
}
/// A pending `sync_chapter_list` job naming the manga by id makes the
/// manga list as `InProgress`.
#[sqlx::test(migrations = "./migrations")]
async fn manga_state_in_progress_via_sync_chapter_list_job(pool: PgPool) {
    seed_source(&pool).await;
    let manga_id = insert_manga(&pool, "Syncing Manga").await;
    insert_manga_source(&pool, manga_id, "smk-1", false).await;

    // sync_chapter_list payload carries manga_id directly.
    let payload = json!({
        "kind": "sync_chapter_list",
        "source_id": SOURCE_ID,
        "manga_id": manga_id.to_string(),
        "source_manga_key": "smk-1",
    });
    insert_job(&pool, payload, "pending").await;

    let query = repo::admin_view::ListAdminMangasQuery {
        limit: 50,
        ..Default::default()
    };
    let (rows, _total) = repo::admin_view::list_mangas_with_sync_state(&pool, &query)
        .await
        .unwrap();

    assert_eq!(rows[0].sync_state, mangalord::domain::MangaSyncState::InProgress);
}
/// A running `sync_manga` job that names the manga only by its source key
/// makes the manga list as `InProgress`.
#[sqlx::test(migrations = "./migrations")]
async fn manga_state_in_progress_via_sync_manga_job(pool: PgPool) {
    // The trickier branch: sync_manga payload is keyed by
    // source_manga_key, NOT manga_id — must join through manga_sources.
    seed_source(&pool).await;
    let manga_id = insert_manga(&pool, "Metadata-Refreshing Manga").await;
    insert_manga_source(&pool, manga_id, "smk-key-42", false).await;

    let payload = json!({
        "kind": "sync_manga",
        "source_id": SOURCE_ID,
        "source_manga_key": "smk-key-42",
    });
    insert_job(&pool, payload, "running").await;

    let query = repo::admin_view::ListAdminMangasQuery {
        limit: 50,
        ..Default::default()
    };
    let (rows, _total) = repo::admin_view::list_mangas_with_sync_state(&pool, &query)
        .await
        .unwrap();

    assert_eq!(rows[0].sync_state, mangalord::domain::MangaSyncState::InProgress);
}
/// Filtering the manga list by `Dropped` returns only the manga whose
/// source is dropped, with a total of 1.
#[sqlx::test(migrations = "./migrations")]
async fn manga_list_filters_by_sync_state(pool: PgPool) {
    seed_source(&pool).await;

    let synced_id = insert_manga(&pool, "AAA Synced").await;
    insert_manga_source(&pool, synced_id, "smk-a", false).await;
    let dropped_id = insert_manga(&pool, "BBB Dropped").await;
    insert_manga_source(&pool, dropped_id, "smk-b", true).await;

    let query = repo::admin_view::ListAdminMangasQuery {
        sync_state: Some(mangalord::domain::MangaSyncState::Dropped),
        limit: 50,
        ..Default::default()
    };
    let (rows, total) = repo::admin_view::list_mangas_with_sync_state(&pool, &query)
        .await
        .unwrap();

    assert_eq!(total, 1);
    assert_eq!(rows.len(), 1);
    assert_eq!(rows[0].id, dropped_id);
}
// ---- chapter sync state ----------------------------------------------------
/// A chapter with 12 pages and a non-dropped source is reported as `Synced`.
#[sqlx::test(migrations = "./migrations")]
async fn chapter_state_synced_when_pages_present(pool: PgPool) {
    seed_source(&pool).await;
    let manga_id = insert_manga(&pool, "M").await;
    insert_manga_source(&pool, manga_id, "smk", false).await;
    let chapter_id = insert_chapter(&pool, manga_id, 1, 12).await;
    insert_chapter_source(&pool, chapter_id, "ckey-1", false).await;

    let chapters = fetch_chapter_rows(&pool, manga_id).await;

    assert_eq!(chapters.len(), 1);
    assert_eq!(chapters[0].id, chapter_id);
    assert_eq!(chapters[0].sync_state, mangalord::domain::ChapterSyncState::Synced);
}
/// A chapter with zero pages, a non-dropped source and no jobs is reported
/// as `NotDownloaded`.
#[sqlx::test(migrations = "./migrations")]
async fn chapter_state_not_downloaded_when_page_count_zero(pool: PgPool) {
    seed_source(&pool).await;
    let manga_id = insert_manga(&pool, "M").await;
    let chapter_id = insert_chapter(&pool, manga_id, 1, 0).await;
    insert_chapter_source(&pool, chapter_id, "ckey-1", false).await;

    let chapters = fetch_chapter_rows(&pool, manga_id).await;

    assert_eq!(
        chapters[0].sync_state,
        mangalord::domain::ChapterSyncState::NotDownloaded
    );
}
/// A zero-page chapter with a running `sync_chapter_content` job is
/// reported as `Downloading`.
#[sqlx::test(migrations = "./migrations")]
async fn chapter_state_downloading_when_job_in_flight(pool: PgPool) {
    seed_source(&pool).await;
    let manga_id = insert_manga(&pool, "M").await;
    let chapter_id = insert_chapter(&pool, manga_id, 1, 0).await;
    insert_chapter_source(&pool, chapter_id, "ckey-1", false).await;

    let payload = json!({
        "kind": "sync_chapter_content",
        "source_id": SOURCE_ID,
        "chapter_id": chapter_id.to_string(),
        "source_chapter_key": "ckey-1",
    });
    insert_job(&pool, payload, "running").await;

    let chapters = fetch_chapter_rows(&pool, manga_id).await;

    assert_eq!(
        chapters[0].sync_state,
        mangalord::domain::ChapterSyncState::Downloading
    );
}
/// A chapter whose only source row is dropped is reported as `Dropped`.
#[sqlx::test(migrations = "./migrations")]
async fn chapter_state_dropped_when_all_sources_dropped(pool: PgPool) {
    seed_source(&pool).await;
    let manga_id = insert_manga(&pool, "M").await;
    let chapter_id = insert_chapter(&pool, manga_id, 1, 0).await;
    insert_chapter_source(&pool, chapter_id, "ckey-1", true).await;

    let chapters = fetch_chapter_rows(&pool, manga_id).await;

    assert_eq!(
        chapters[0].sync_state,
        mangalord::domain::ChapterSyncState::Dropped
    );
}
/// A zero-page chapter whose only `sync_chapter_content` job is dead is
/// reported as `Failed`.
#[sqlx::test(migrations = "./migrations")]
async fn chapter_state_failed_when_most_recent_job_dead(pool: PgPool) {
    seed_source(&pool).await;
    let manga_id = insert_manga(&pool, "M").await;
    let chapter_id = insert_chapter(&pool, manga_id, 1, 0).await;
    insert_chapter_source(&pool, chapter_id, "ckey-1", false).await;

    let payload = json!({
        "kind": "sync_chapter_content",
        "source_id": SOURCE_ID,
        "chapter_id": chapter_id.to_string(),
        "source_chapter_key": "ckey-1",
    });
    insert_job(&pool, payload, "dead").await;

    let chapters = fetch_chapter_rows(&pool, manga_id).await;

    assert_eq!(
        chapters[0].sync_state,
        mangalord::domain::ChapterSyncState::Failed
    );
}
// ---- HTTP-level happy-path + gate ------------------------------------------
/// The admin manga list returns the single seeded manga with its id,
/// `synced` state, zero chapters, and a page total of 1.
#[sqlx::test(migrations = "./migrations")]
async fn http_list_mangas_returns_paged_with_state(pool: PgPool) {
    let ctx = common::harness(pool.clone());
    let (_admin, cookie) = seed_admin(&pool, &ctx.app).await;
    seed_source(&pool).await;
    let manga_id = insert_manga(&pool, "Hello").await;
    insert_manga_source(&pool, manga_id, "smk", false).await;

    let request = common::get_with_cookie("/api/v1/admin/mangas?limit=50", &cookie);
    let response = ctx.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let payload = common::body_json(response).await;
    let items = payload["items"].as_array().unwrap();
    assert_eq!(items.len(), 1);
    let first = &items[0];
    assert_eq!(first["id"], manga_id.to_string());
    assert_eq!(first["sync_state"], "synced");
    assert_eq!(first["chapter_count"], 0);
    assert_eq!(payload["page"]["total"], 1);
}
/// An unrecognised `sync_state` query value yields 400.
#[sqlx::test(migrations = "./migrations")]
async fn http_list_mangas_rejects_unknown_sync_state(pool: PgPool) {
    let ctx = common::harness(pool.clone());
    let (_admin, cookie) = seed_admin(&pool, &ctx.app).await;

    let request = common::get_with_cookie("/api/v1/admin/mangas?sync_state=bogus", &cookie);
    let response = ctx.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::BAD_REQUEST);
}
/// The admin chapter list returns both chapters in order, the one with
/// pages as `synced` and the empty one as `not_downloaded`.
#[sqlx::test(migrations = "./migrations")]
async fn http_list_chapters_returns_per_chapter_state(pool: PgPool) {
    let ctx = common::harness(pool.clone());
    let (_admin, cookie) = seed_admin(&pool, &ctx.app).await;
    seed_source(&pool).await;
    let manga_id = insert_manga(&pool, "M").await;
    let with_pages = insert_chapter(&pool, manga_id, 1, 12).await;
    let without_pages = insert_chapter(&pool, manga_id, 2, 0).await;
    insert_chapter_source(&pool, with_pages, "ck1", false).await;
    insert_chapter_source(&pool, without_pages, "ck2", false).await;

    let uri = format!("/api/v1/admin/mangas/{manga_id}/chapters");
    let response = ctx
        .app
        .oneshot(common::get_with_cookie(&uri, &cookie))
        .await
        .unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let payload = common::body_json(response).await;
    let items = payload["items"].as_array().unwrap();
    assert_eq!(items.len(), 2);
    assert_eq!(items[0]["id"], with_pages.to_string());
    assert_eq!(items[0]["sync_state"], "synced");
    assert_eq!(items[1]["id"], without_pages.to_string());
    assert_eq!(items[1]["sync_state"], "not_downloaded");
    assert_eq!(payload["page"]["total"], 2);
}
/// Requesting `limit=999` is answered with a reported page limit of 500
/// while still returning all three seeded chapters.
#[sqlx::test(migrations = "./migrations")]
async fn http_list_chapters_caps_limit_at_500(pool: PgPool) {
    // The handler clamps limit to [1, 500] so a long-runner with
    // thousands of chapters can't be turned into a request-stall by an
    // admin (or by a curious admin tab) just clicking expand.
    let ctx = common::harness(pool.clone());
    let (_admin, cookie) = seed_admin(&pool, &ctx.app).await;
    seed_source(&pool).await;
    let manga_id = insert_manga(&pool, "M").await;
    for number in 1..=3 {
        let _chapter_id = insert_chapter(&pool, manga_id, number, 0).await;
    }

    let uri = format!("/api/v1/admin/mangas/{manga_id}/chapters?limit=999");
    let response = ctx
        .app
        .oneshot(common::get_with_cookie(&uri, &cookie))
        .await
        .unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let payload = common::body_json(response).await;
    assert_eq!(payload["page"]["limit"], 500, "limit must clamp to 500");
    assert_eq!(payload["items"].as_array().unwrap().len(), 3);
}
#[sqlx::test(migrations = "./migrations")]
async fn http_list_chapters_paginates(pool: PgPool) {
let h = common::harness(pool.clone());
let (_admin, cookie) = seed_admin(&pool, &h.app).await;
seed_source(&pool).await;
let m = insert_manga(&pool, "M").await;
for n in 1..=5 {
let _c = insert_chapter(&pool, m, n, 0).await;
}
let resp = h
.app
.clone()
.oneshot(common::get_with_cookie(
&format!("/api/v1/admin/mangas/{m}/chapters?limit=2&offset=2"),
&cookie,
))
.await
.unwrap();
let body = common::body_json(resp).await;
let items = body["items"].as_array().unwrap();
assert_eq!(items.len(), 2);
// Ordered by chapter number ascending; offset=2 skips chapters 1 & 2.
assert_eq!(items[0]["number"], 3);
assert_eq!(items[1]["number"], 4);
assert_eq!(body["page"]["total"], 5);
}
/// Listing chapters for a manga id that does not exist yields 404.
#[sqlx::test(migrations = "./migrations")]
async fn http_list_chapters_returns_404_for_unknown_manga(pool: PgPool) {
    // Regression: used to return 200 [] for a non-existent manga,
    // which silently rendered "No chapters." for a typo'd / deleted id.
    let ctx = common::harness(pool.clone());
    let (_admin, cookie) = seed_admin(&pool, &ctx.app).await;

    let uri = format!("/api/v1/admin/mangas/{}/chapters", Uuid::new_v4());
    let response = ctx
        .app
        .oneshot(common::get_with_cookie(&uri, &cookie))
        .await
        .unwrap();

    assert_eq!(response.status(), StatusCode::NOT_FOUND);
}
/// A chapter that has pages stays `Synced` even when a dead
/// `sync_chapter_content` job exists for it.
#[sqlx::test(migrations = "./migrations")]
async fn chapter_state_synced_when_pages_present_even_with_dead_job(pool: PgPool) {
    // Regression: the old CASE prioritised the dead-job branch above
    // the page_count check, so a chapter with pages on disk AND a
    // historical dead job (e.g. from a re-download attempt that
    // crashed) flipped to Failed — contradicting Synced's "downloaded
    // at some point" contract.
    seed_source(&pool).await;
    let manga_id = insert_manga(&pool, "M").await;
    let chapter_id = insert_chapter(&pool, manga_id, 1, 12).await; // pages present
    insert_chapter_source(&pool, chapter_id, "ckey-1", false).await;

    let payload = json!({
        "kind": "sync_chapter_content",
        "source_id": SOURCE_ID,
        "chapter_id": chapter_id.to_string(),
        "source_chapter_key": "ckey-1",
    });
    insert_job(&pool, payload, "dead").await;

    let chapters = fetch_chapter_rows(&pool, manga_id).await;

    assert_eq!(
        chapters[0].sync_state,
        mangalord::domain::ChapterSyncState::Synced,
        "pages on disk override historical dead-job noise"
    );
}
/// A logged-in user without admin rights gets 403 from the admin manga list.
#[sqlx::test(migrations = "./migrations")]
async fn http_list_mangas_requires_admin(pool: PgPool) {
    let ctx = common::harness(pool);
    let (_username, cookie) = common::register_user(&ctx.app).await;

    let request = common::get_with_cookie("/api/v1/admin/mangas", &cookie);
    let response = ctx.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::FORBIDDEN);
}

View File

@@ -1,350 +0,0 @@
//! Integration tests for the admin force-resync endpoints.
//!
//! Real resync work requires Chromium, so these tests swap in a stub
//! [`ResyncService`] to assert the handler-level contract: routing,
//! admin gate, 503 when the daemon is disabled, 404 / 422 mapping for
//! missing-resource / no-source cases, and the audit-log side effect.
mod common;
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
use async_trait::async_trait;
use axum::http::StatusCode;
use serde_json::json;
use sqlx::PgPool;
use tower::ServiceExt;
use uuid::Uuid;
use mangalord::crawler::resync::{
ChapterResyncOutcome, MangaResyncOutcome, ResyncError, ResyncService,
};
use mangalord::repo;
use mangalord::repo::crawler::UpsertStatus;
/// Stub that records call counts and returns a canned outcome.
struct StubResync {
/// Incremented once per `resync_manga` call.
manga_calls: AtomicUsize,
/// Incremented once per `resync_chapter` call.
chapter_calls: AtomicUsize,
/// When true, returns NoMangaSource / NoChapterSource.
no_source: bool,
}
impl StubResync {
    /// Shared constructor: both counters start at zero and `no_source`
    /// takes the given value.
    fn with_no_source(no_source: bool) -> Arc<Self> {
        let stub = Self {
            manga_calls: AtomicUsize::new(0),
            chapter_calls: AtomicUsize::new(0),
            no_source,
        };
        Arc::new(stub)
    }

    /// Stub whose resync calls succeed with the canned outcomes.
    fn new() -> Arc<Self> {
        Self::with_no_source(false)
    }

    /// Stub whose resync calls fail with the "no source" errors.
    fn no_source() -> Arc<Self> {
        Self::with_no_source(true)
    }
}
#[async_trait]
impl ResyncService for StubResync {
    /// Records the call, then yields either `NoMangaSource` or a fixed
    /// "updated, cover fetched" outcome echoing `manga_id`.
    async fn resync_manga(&self, manga_id: Uuid) -> anyhow::Result<MangaResyncOutcome> {
        self.manga_calls.fetch_add(1, Ordering::SeqCst);
        if self.no_source {
            Err(ResyncError::NoMangaSource.into())
        } else {
            Ok(MangaResyncOutcome {
                manga_id,
                metadata_status: UpsertStatus::Updated,
                cover_fetched: true,
            })
        }
    }

    /// Records the call, then yields either `NoChapterSource` or a fixed
    /// `Fetched` outcome of 7 pages echoing `chapter_id`.
    async fn resync_chapter(&self, chapter_id: Uuid) -> anyhow::Result<ChapterResyncOutcome> {
        self.chapter_calls.fetch_add(1, Ordering::SeqCst);
        if self.no_source {
            Err(ResyncError::NoChapterSource.into())
        } else {
            Ok(ChapterResyncOutcome::Fetched {
                chapter_id,
                pages: 7,
            })
        }
    }
}
/// Looks up `username` and sets its admin flag directly through the repo
/// layer. Panics if the lookup fails or the user does not exist.
async fn promote_admin(pool: &PgPool, username: &str) {
    let lookup = repo::user::find_by_username(pool, username).await;
    let user = lookup.unwrap().unwrap();
    let promotion = repo::user::set_is_admin_unchecked(pool, user.id, true).await;
    promotion.unwrap();
}
/// Inserts an 'ongoing' manga with the given title and no alternative
/// titles, returning the id reported by the database.
async fn insert_manga(pool: &PgPool, title: &str) -> Uuid {
    let row: (Uuid,) = sqlx::query_as(
        "INSERT INTO mangas (title, status, alt_titles) VALUES ($1, 'ongoing', ARRAY[]::text[]) RETURNING id",
    )
    .bind(title)
    .fetch_one(pool)
    .await
    .unwrap();
    row.0
}
/// Inserts an untitled chapter for `manga_id` with the given number and
/// page count, returning the id reported by the database.
async fn insert_chapter(pool: &PgPool, manga_id: Uuid, number: i32, pages: i32) -> Uuid {
    let row: (Uuid,) = sqlx::query_as(
        "INSERT INTO chapters (manga_id, number, title, page_count) VALUES ($1, $2, NULL, $3) RETURNING id",
    )
    .bind(manga_id)
    .bind(number)
    .bind(pages)
    .fetch_one(pool)
    .await
    .unwrap();
    row.0
}
// ----- manga resync ---------------------------------------------------------
// Happy path: an admin POST to the manga resync route returns 200, the
// stub's canned outcome plus the manga detail, and leaves one audit row.
#[sqlx::test(migrations = "./migrations")]
async fn manga_resync_calls_service_and_returns_refreshed_detail(pool: PgPool) {
    let resync = StubResync::new();
    let harness = common::harness_with_resync(pool.clone(), resync.clone());
    let (admin_name, session) = common::register_user(&harness.app).await;
    promote_admin(&pool, &admin_name).await;
    let manga_id = insert_manga(&pool, "Hello").await;

    let uri = format!("/api/v1/admin/mangas/{manga_id}/resync");
    let request = common::post_json_with_cookie(&uri, json!({}), &session);
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let payload = common::body_json(response).await;
    // Values hard-coded in the stub's outcome.
    assert_eq!(payload["metadata_status"], "updated");
    assert_eq!(payload["cover_fetched"], true);
    // The manga inserted above is echoed back in the response.
    assert_eq!(payload["manga"]["id"], manga_id.to_string());
    assert_eq!(payload["manga"]["title"], "Hello");
    assert_eq!(resync.manga_calls.load(Ordering::SeqCst), 1);

    // One audit entry recorded against this manga.
    let audit: (i64,) = sqlx::query_as(
        "SELECT count(*) FROM admin_audit WHERE action = 'manga_resync' AND target_id = $1",
    )
    .bind(manga_id)
    .fetch_one(&pool)
    .await
    .unwrap();
    assert_eq!(audit.0, 1);
}
// A random manga id yields 404, and the stub is never invoked.
#[sqlx::test(migrations = "./migrations")]
async fn manga_resync_returns_404_for_unknown_id(pool: PgPool) {
    let resync = StubResync::new();
    let harness = common::harness_with_resync(pool.clone(), resync.clone());
    let (admin_name, session) = common::register_user(&harness.app).await;
    promote_admin(&pool, &admin_name).await;

    let missing_id = Uuid::new_v4();
    let uri = format!("/api/v1/admin/mangas/{}/resync", missing_id);
    let request = common::post_json_with_cookie(&uri, json!({}), &session);
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::NOT_FOUND);
    // The call counter stays at zero for a manga that doesn't exist.
    assert_eq!(resync.manga_calls.load(Ordering::SeqCst), 0);
}
// When the service reports NoMangaSource the endpoint answers 422 with
// `error.details.manga == "no_source"`.
#[sqlx::test(migrations = "./migrations")]
async fn manga_resync_maps_no_source_to_422(pool: PgPool) {
    let failing_resync = StubResync::no_source();
    let harness = common::harness_with_resync(pool.clone(), failing_resync);
    let (admin_name, session) = common::register_user(&harness.app).await;
    promote_admin(&pool, &admin_name).await;
    let manga_id = insert_manga(&pool, "Manual upload, no crawler source").await;

    let uri = format!("/api/v1/admin/mangas/{manga_id}/resync");
    let request = common::post_json_with_cookie(&uri, json!({}), &session);
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::UNPROCESSABLE_ENTITY);

    let payload = common::body_json(response).await;
    assert_eq!(payload["error"]["details"]["manga"], "no_source");
}
// With the plain harness (no resync stub supplied) the manga resync
// route answers 503 with the `service_unavailable` error code.
#[sqlx::test(migrations = "./migrations")]
async fn manga_resync_returns_503_when_daemon_disabled(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (admin_name, session) = common::register_user(&harness.app).await;
    promote_admin(&pool, &admin_name).await;
    let manga_id = insert_manga(&pool, "Z").await;

    let uri = format!("/api/v1/admin/mangas/{manga_id}/resync");
    let request = common::post_json_with_cookie(&uri, json!({}), &session);
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::SERVICE_UNAVAILABLE);

    let payload = common::body_json(response).await;
    assert_eq!(payload["error"]["code"], "service_unavailable");
}
// A registered but never-promoted user gets 403 from the manga resync route.
#[sqlx::test(migrations = "./migrations")]
async fn manga_resync_requires_admin(pool: PgPool) {
    let resync = StubResync::new();
    let harness = common::harness_with_resync(pool.clone(), resync);
    // No promote_admin call here: the caller stays a regular user.
    let (_username, session) = common::register_user(&harness.app).await;
    let manga_id = insert_manga(&pool, "M").await;

    let uri = format!("/api/v1/admin/mangas/{manga_id}/resync");
    let request = common::post_json_with_cookie(&uri, json!({}), &session);
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::FORBIDDEN);
}
// ----- chapter resync -------------------------------------------------------
// Happy path: an admin POST to the chapter resync route returns 200, the
// stub's `fetched` / 7-page outcome plus the chapter, and one audit row.
#[sqlx::test(migrations = "./migrations")]
async fn chapter_resync_calls_service_and_returns_refreshed_chapter(pool: PgPool) {
    let resync = StubResync::new();
    let harness = common::harness_with_resync(pool.clone(), resync.clone());
    let (admin_name, session) = common::register_user(&harness.app).await;
    promote_admin(&pool, &admin_name).await;
    let manga_id = insert_manga(&pool, "M").await;
    let chapter_id = insert_chapter(&pool, manga_id, 1, 0).await;

    let uri = format!("/api/v1/admin/chapters/{chapter_id}/resync");
    let request = common::post_json_with_cookie(&uri, json!({}), &session);
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let payload = common::body_json(response).await;
    assert_eq!(payload["outcome"], "fetched");
    assert_eq!(payload["pages"], 7);
    assert_eq!(payload["chapter"]["id"], chapter_id.to_string());
    assert_eq!(resync.chapter_calls.load(Ordering::SeqCst), 1);

    // One audit entry recorded against this chapter.
    let audit: (i64,) = sqlx::query_as(
        "SELECT count(*) FROM admin_audit WHERE action = 'chapter_resync' AND target_id = $1",
    )
    .bind(chapter_id)
    .fetch_one(&pool)
    .await
    .unwrap();
    assert_eq!(audit.0, 1);
}
// A random chapter id yields 404, and the stub is never invoked.
#[sqlx::test(migrations = "./migrations")]
async fn chapter_resync_returns_404_for_unknown_id(pool: PgPool) {
    let resync = StubResync::new();
    let harness = common::harness_with_resync(pool.clone(), resync.clone());
    let (admin_name, session) = common::register_user(&harness.app).await;
    promote_admin(&pool, &admin_name).await;

    let missing_id = Uuid::new_v4();
    let uri = format!("/api/v1/admin/chapters/{}/resync", missing_id);
    let request = common::post_json_with_cookie(&uri, json!({}), &session);
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::NOT_FOUND);
    assert_eq!(resync.chapter_calls.load(Ordering::SeqCst), 0);
}
// When the service reports NoChapterSource the endpoint answers 422 with
// `error.details.chapter == "no_source"`.
#[sqlx::test(migrations = "./migrations")]
async fn chapter_resync_maps_no_source_to_422(pool: PgPool) {
    let failing_resync = StubResync::no_source();
    let harness = common::harness_with_resync(pool.clone(), failing_resync);
    let (admin_name, session) = common::register_user(&harness.app).await;
    promote_admin(&pool, &admin_name).await;
    let manga_id = insert_manga(&pool, "M").await;
    let chapter_id = insert_chapter(&pool, manga_id, 1, 0).await;

    let uri = format!("/api/v1/admin/chapters/{chapter_id}/resync");
    let request = common::post_json_with_cookie(&uri, json!({}), &session);
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::UNPROCESSABLE_ENTITY);

    let payload = common::body_json(response).await;
    assert_eq!(payload["error"]["details"]["chapter"], "no_source");
}
// With the plain harness (no resync stub supplied) the chapter resync
// route answers 503. The error code is pinned as well, so this test
// checks the same contract as its manga counterpart.
#[sqlx::test(migrations = "./migrations")]
async fn chapter_resync_returns_503_when_daemon_disabled(pool: PgPool) {
    let h = common::harness(pool.clone());
    let (username, cookie) = common::register_user(&h.app).await;
    promote_admin(&pool, &username).await;
    let manga_id = insert_manga(&pool, "M").await;
    let chapter_id = insert_chapter(&pool, manga_id, 1, 0).await;

    let resp = h
        .app
        .oneshot(common::post_json_with_cookie(
            &format!("/api/v1/admin/chapters/{chapter_id}/resync"),
            json!({}),
            &cookie,
        ))
        .await
        .unwrap();
    assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE);

    // Same envelope assertion the manga 503 test makes.
    let body = common::body_json(resp).await;
    assert_eq!(body["error"]["code"], "service_unavailable");
}
// A registered but never-promoted user gets 403 from the chapter resync route.
#[sqlx::test(migrations = "./migrations")]
async fn chapter_resync_requires_admin(pool: PgPool) {
    let resync = StubResync::new();
    let harness = common::harness_with_resync(pool.clone(), resync);
    let (_username, session) = common::register_user(&harness.app).await;
    let manga_id = insert_manga(&pool, "M").await;
    let chapter_id = insert_chapter(&pool, manga_id, 1, 0).await;

    let uri = format!("/api/v1/admin/chapters/{chapter_id}/resync");
    let request = common::post_json_with_cookie(&uri, json!({}), &session);
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::FORBIDDEN);
}

View File

@@ -1,259 +0,0 @@
//! PR 1 (feat/admin-role) integration tests.
//!
//! Covers: `bootstrap_admin` semantics, `is_admin` exposed on /auth/me,
//! and the `RequireAdmin` extractor's 401/403/200 matrix — including the
//! load-bearing decision that Bearer-authed callers can NEVER reach an
//! admin-guarded route, even when the underlying user IS admin.
mod common;
use std::sync::Arc;
use axum::http::StatusCode;
use axum::routing::get;
use axum::{Json, Router};
use serde_json::json;
use sqlx::PgPool;
use tempfile::TempDir;
use tower::ServiceExt;
use mangalord::api;
use mangalord::app::AppState;
use mangalord::auth::extractor::RequireAdmin;
use mangalord::auth::rate_limit::AuthRateLimiter;
use mangalord::config::{AuthConfig, UploadConfig};
use mangalord::repo;
use mangalord::storage::{LocalStorage, Storage};
/// Handler used only by this test file, gated by the `RequireAdmin`
/// extractor. It echoes the extracted user's name and admin flag so the
/// extractor can be exercised end-to-end without a real admin endpoint.
async fn admin_only_handler(admin: RequireAdmin) -> Json<serde_json::Value> {
    let RequireAdmin(user) = admin;
    let body = json!({ "username": user.username, "is_admin": user.is_admin });
    Json(body)
}
/// Assembles a router serving the production `/api/v1/*` routes plus a
/// test-only `/_test/admin_only` route behind `RequireAdmin`.
///
/// The pool is moved into the app state, so clone it first if the test
/// needs to query the DB afterwards. The returned `TempDir` is the
/// directory handed to `LocalStorage`.
fn admin_test_router(pool: PgPool) -> (Router, TempDir) {
    let storage_dir = tempfile::tempdir().expect("tempdir");
    let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(storage_dir.path()));

    // Default auth settings with `cookie_secure` switched off.
    let auth = AuthConfig {
        cookie_secure: false,
        ..AuthConfig::default()
    };
    let auth_limiter = Arc::new(AuthRateLimiter::new(auth.rate_limit));

    let state = AppState {
        db: pool,
        storage,
        auth,
        upload: UploadConfig::default(),
        auth_limiter,
        resync: None,
        crawler: None,
    };

    let router = Router::new()
        .nest("/api/v1", api::routes())
        .route("/_test/admin_only", get(admin_only_handler))
        .with_state(state);
    (router, storage_dir)
}
// ---- bootstrap_admin -------------------------------------------------------
// Bootstrapping against an empty users table creates the user as an
// admin whose stored hash verifies the supplied password.
#[sqlx::test(migrations = "./migrations")]
async fn bootstrap_creates_admin_when_user_missing(pool: PgPool) {
    let outcome = repo::user::bootstrap_admin(&pool, "root", "hunter2hunter2").await;
    outcome.expect("bootstrap on empty DB");

    let lookup = repo::user::find_by_username(&pool, "root").await;
    let created = lookup.unwrap().expect("root user exists after bootstrap");
    assert!(created.is_admin, "bootstrap must set is_admin = true on creation");

    // The stored hash has to accept the password that was passed in.
    let password_ok =
        mangalord::auth::password::verify_password("hunter2hunter2", &created.password_hash);
    assert!(
        password_ok,
        "bootstrap-created user must accept the env-supplied password"
    );
}
// Bootstrapping a username that already exists promotes that user but
// leaves the stored password hash untouched.
#[sqlx::test(migrations = "./migrations")]
async fn bootstrap_promotes_existing_user_without_touching_password(pool: PgPool) {
    // Create the user through the real register endpoint so the hash is
    // produced by the same code path as in production.
    let (app, _storage_dir) = admin_test_router(pool.clone());
    let register = common::post_json(
        "/api/v1/auth/register",
        json!({ "username": "preexisting", "password": "originalpw1234" }),
    );
    let response = app.oneshot(register).await.unwrap();
    assert_eq!(response.status(), StatusCode::CREATED);

    let user_before = repo::user::find_by_username(&pool, "preexisting")
        .await
        .unwrap()
        .unwrap();
    assert!(!user_before.is_admin);
    let original_hash = user_before.password_hash.clone();

    // Deliberately pass a different password: it must be ignored.
    let outcome =
        repo::user::bootstrap_admin(&pool, "preexisting", "envpw_should_be_ignored").await;
    outcome.expect("bootstrap on existing user");

    let user_after = repo::user::find_by_username(&pool, "preexisting")
        .await
        .unwrap()
        .unwrap();
    assert!(user_after.is_admin, "bootstrap must promote existing user");
    assert_eq!(
        user_after.password_hash, original_hash,
        "bootstrap must NOT overwrite the existing password hash"
    );
    let still_verifies =
        mangalord::auth::password::verify_password("originalpw1234", &user_after.password_hash);
    assert!(
        still_verifies,
        "original password must still verify after bootstrap"
    );
}
// Running bootstrap twice with the same credentials must succeed both
// times and leave exactly one `root` row that is still an admin.
#[sqlx::test(migrations = "./migrations")]
async fn bootstrap_is_idempotent(pool: PgPool) {
    repo::user::bootstrap_admin(&pool, "root", "hunter2hunter2")
        .await
        .expect("first bootstrap");
    repo::user::bootstrap_admin(&pool, "root", "hunter2hunter2")
        .await
        .expect("second bootstrap is no-op");

    // Exactly one row...
    let (count,): (i64,) = sqlx::query_as("SELECT COUNT(*) FROM users WHERE username = $1")
        .bind("root")
        .fetch_one(&pool)
        .await
        .unwrap();
    assert_eq!(count, 1);

    // ...still admin. Previously only the row count was checked, so a
    // second bootstrap that cleared the flag would have gone unnoticed.
    let user = repo::user::find_by_username(&pool, "root")
        .await
        .unwrap()
        .expect("root user exists after repeated bootstrap");
    assert!(
        user.is_admin,
        "repeated bootstrap must leave the user an admin"
    );
}
// ---- /api/v1/auth/me exposes is_admin --------------------------------------
// `/api/v1/auth/me` exposes `user.is_admin`, which is false for a user
// who has just registered.
#[sqlx::test(migrations = "./migrations")]
async fn auth_me_response_includes_is_admin(pool: PgPool) {
    let (app, _storage_dir) = admin_test_router(pool.clone());
    let (_name, session) = common::register_user(&app).await;

    let request = common::get_with_cookie("/api/v1/auth/me", &session);
    let response = app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let payload = common::body_json(response).await;
    assert_eq!(
        payload["user"]["is_admin"], false,
        "freshly-registered users default to is_admin=false"
    );
}
// ---- RequireAdmin: 401 / 403 / 200 matrix ----------------------------------
// A request with no credentials at all gets 401 from the guarded route.
#[sqlx::test(migrations = "./migrations")]
async fn require_admin_rejects_unauthenticated(pool: PgPool) {
    let (app, _storage_dir) = admin_test_router(pool);

    let request = common::get("/_test/admin_only");
    let response = app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::UNAUTHORIZED);
}
// A valid session belonging to a non-admin user gets 403 with the
// `forbidden` error code.
#[sqlx::test(migrations = "./migrations")]
async fn require_admin_rejects_non_admin_cookie(pool: PgPool) {
    let (app, _storage_dir) = admin_test_router(pool);
    let (_name, session) = common::register_user(&app).await;

    let request = common::get_with_cookie("/_test/admin_only", &session);
    let response = app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::FORBIDDEN);

    let payload = common::body_json(response).await;
    assert_eq!(payload["error"]["code"], "forbidden");
}
// A session cookie of a user promoted to admin passes the guard, and
// the handler echoes the user's name and admin flag.
#[sqlx::test(migrations = "./migrations")]
async fn require_admin_accepts_admin_cookie(pool: PgPool) {
    let (app, _storage_dir) = admin_test_router(pool.clone());
    let (username, session) = common::register_user(&app).await;

    // Flip the flag straight through the repo layer.
    let account = repo::user::find_by_username(&pool, &username)
        .await
        .unwrap()
        .unwrap();
    repo::user::set_is_admin_unchecked(&pool, account.id, true)
        .await
        .unwrap();

    let request = common::get_with_cookie("/_test/admin_only", &session);
    let response = app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let payload = common::body_json(response).await;
    assert_eq!(payload["username"], username);
    assert_eq!(payload["is_admin"], true);
}
// Privilege-escalation guard: an API token minted by an admin user must
// not open the RequireAdmin-guarded route. The token is shown to work on
// `/auth/me` first, so the final 401 can only come from the admin guard.
#[sqlx::test(migrations = "./migrations")]
async fn require_admin_rejects_bearer_token_even_for_admin_user(pool: PgPool) {
    let (app, _storage_dir) = admin_test_router(pool.clone());
    let (username, session) = common::register_user(&app).await;

    // Step 1: make the user an admin.
    let account = repo::user::find_by_username(&pool, &username)
        .await
        .unwrap()
        .unwrap();
    repo::user::set_is_admin_unchecked(&pool, account.id, true)
        .await
        .unwrap();

    // Step 2: mint an API token, authenticating with the session cookie.
    let mint_request = common::post_json_with_cookie(
        "/api/v1/auth/tokens",
        json!({ "name": "test-bot" }),
        &session,
    );
    let mint_response = app.clone().oneshot(mint_request).await.unwrap();
    assert_eq!(mint_response.status(), StatusCode::CREATED);
    let minted = common::body_json(mint_response).await;
    let token = minted["bearer"]
        .as_str()
        .expect("raw bearer token in response")
        .to_string();

    // Step 3 (sanity): the bearer token is accepted by a non-admin endpoint.
    let me_request = common::get_with_bearer("/api/v1/auth/me", &token);
    let me_response = app.clone().oneshot(me_request).await.unwrap();
    assert_eq!(me_response.status(), StatusCode::OK);

    // Step 4: the very same token is refused with 401 on the admin route.
    let admin_request = common::get_with_bearer("/_test/admin_only", &token);
    let admin_response = app.oneshot(admin_request).await.unwrap();
    assert_eq!(
        admin_response.status(),
        StatusCode::UNAUTHORIZED,
        "Bearer-authed admin must NOT pass the RequireAdmin guard"
    );
}

View File

@@ -1,96 +0,0 @@
//! PR 4 (feat/admin-system-api) integration tests.
//!
//! Shape-only assertions — we don't mock the system, just call the
//! endpoint and check the response envelope. Threshold-triggering of
//! alerts would require faking statvfs / sysinfo, which is more
//! plumbing than the test gives back.
mod common;
use axum::http::StatusCode;
use axum::Router;
use sqlx::PgPool;
use tower::ServiceExt;
use mangalord::repo;
/// Registers a fresh user through `app`, sets their admin flag via the
/// repo layer, and returns the session cookie from registration.
async fn seed_admin(pool: &PgPool, app: &Router) -> String {
    let (username, cookie) = common::register_user(app).await;
    let lookup = repo::user::find_by_username(pool, &username).await;
    let account = lookup.unwrap().unwrap();
    repo::user::set_is_admin_unchecked(pool, account.id, true)
        .await
        .unwrap();
    cookie
}
// A registered but never-promoted user gets 403 from `/admin/system`.
#[sqlx::test(migrations = "./migrations")]
async fn requires_admin(pool: PgPool) {
    let harness = common::harness(pool);
    let (_username, session) = common::register_user(&harness.app).await;

    let request = common::get_with_cookie("/api/v1/admin/system", &session);
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::FORBIDDEN);
}
// A request without any credentials gets 401 from `/admin/system`.
#[sqlx::test(migrations = "./migrations")]
async fn unauthenticated_request_is_rejected(pool: PgPool) {
    let harness = common::harness(pool);

    let request = common::get("/api/v1/admin/system");
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::UNAUTHORIZED);
}
// Shape check of the `/admin/system` response for an admin caller:
// `disk`, `memory` and `cpu` are objects with in-range percentages, and
// `alerts` is an array whose entries carry string `level` / `message`.
#[sqlx::test(migrations = "./migrations")]
async fn returns_disk_memory_cpu_alerts_shape(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let session = seed_admin(&pool, &harness.app).await;

    let request = common::get_with_cookie("/api/v1/admin/system", &session);
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);
    let body = common::body_json(response).await;

    // Disk section: expected to be populated as an object here.
    let disk_value = body.get("disk").expect("disk key present");
    let disk = disk_value
        .as_object()
        .expect("disk is an object (LocalStorage exposes a path)");
    assert!(disk["total_bytes"].as_u64().unwrap() > 0);
    let pct = disk["percent_used"].as_f64().unwrap();
    assert!(
        (0.0..=100.0).contains(&pct),
        "percent_used outside [0,100]: {pct}"
    );

    // Memory section.
    let memory_value = body.get("memory").expect("memory key");
    let memory = memory_value.as_object().unwrap();
    assert!(memory["total_bytes"].as_u64().unwrap() > 0);
    let memory_pct = memory["percent_used"].as_f64().unwrap();
    assert!((0.0..=100.0).contains(&memory_pct));

    // CPU section.
    let cpu_value = body.get("cpu").expect("cpu key");
    let cpu = cpu_value.as_object().unwrap();
    let cpu_pct = cpu["percent_used"].as_f64().unwrap();
    assert!(
        (0.0..=100.0).contains(&cpu_pct),
        "cpu out of range: {cpu_pct}"
    );

    // Alerts: the count depends on the machine running the test, so only
    // the shape of whatever entries are present is validated.
    let alerts = body.get("alerts").expect("alerts key").as_array().unwrap();
    for entry in alerts.iter() {
        assert!(entry["level"].is_string());
        assert!(entry["message"].is_string());
    }
}

View File

@@ -1,605 +0,0 @@
//! PR 2 (feat/admin-users-api) integration tests.
//!
//! Exercises list / delete / promote-demote on /api/v1/admin/users:
//! pagination + search, the RequireAdmin gate, self-protection,
//! last-admin invariant (including the parallel-demote race that
//! `pg_advisory_xact_lock` + recount-inside-tx guards against), and
//! that audit rows land in `admin_audit` only on successful commit.
//!
//! Note on the last-admin invariant: the *serial* path via HTTP is
//! structurally unreachable — the only configuration that would hit the
//! "would orphan admins" branch requires the actor to be the lone admin
//! demoting themselves, which the self-guard fires on first. So the
//! last-admin checks below call the repo directly to exercise the
//! invariant; the HTTP race scenario is covered by
//! `parallel_demotes_cannot_orphan_admins`.
mod common;
use axum::http::StatusCode;
use axum::Router;
use serde_json::json;
use sqlx::PgPool;
use tower::ServiceExt;
use uuid::Uuid;
use mangalord::error::AppError;
use mangalord::repo;
/// Registers a user through the public API, then sets their admin flag
/// via the repo layer. Yields `(username, session cookie, user_id)`,
/// the usual prelude for tests that need a logged-in admin.
async fn seed_admin(pool: &PgPool, app: &Router) -> (String, String, Uuid) {
    let (username, cookie) = common::register_user(app).await;
    let lookup = repo::user::find_by_username(pool, &username).await;
    let account = lookup.unwrap().unwrap();
    repo::user::set_is_admin_unchecked(pool, account.id, true)
        .await
        .unwrap();
    (username, cookie, account.id)
}
// ---- RequireAdmin gate -----------------------------------------------------
// Listing users as a non-admin yields 403.
#[sqlx::test(migrations = "./migrations")]
async fn list_requires_admin(pool: PgPool) {
    let harness = common::harness(pool);
    let (_name, session) = common::register_user(&harness.app).await;

    let request = common::get_with_cookie("/api/v1/admin/users", &session);
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::FORBIDDEN);
}
// DELETE on a user as a non-admin yields 403, here for a random target id.
#[sqlx::test(migrations = "./migrations")]
async fn delete_requires_admin(pool: PgPool) {
    let harness = common::harness(pool);
    let (_name, session) = common::register_user(&harness.app).await;

    let target_id = Uuid::new_v4();
    let uri = format!("/api/v1/admin/users/{}", target_id);
    let request = common::delete_with_cookie(&uri, &session);
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::FORBIDDEN);
}
// PATCH on a user as a non-admin yields 403, here for a random target id.
#[sqlx::test(migrations = "./migrations")]
async fn patch_requires_admin(pool: PgPool) {
    let harness = common::harness(pool);
    let (_name, session) = common::register_user(&harness.app).await;

    let target_id = Uuid::new_v4();
    let uri = format!("/api/v1/admin/users/{}", target_id);
    let request = common::patch_json_with_cookie(&uri, json!({ "is_admin": true }), &session);
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::FORBIDDEN);
}
// ---- list with search and pagination ---------------------------------------
// With one admin plus three regular users, `limit=2&offset=0` returns
// two items, reports total = 4, and never exposes `password_hash`.
#[sqlx::test(migrations = "./migrations")]
async fn list_returns_paginated_users(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (_admin_name, session, _) = seed_admin(&pool, &harness.app).await;
    // Three additional accounts; their credentials are not needed.
    for _ in 0..3 {
        common::register_user(&harness.app).await;
    }

    let request = common::get_with_cookie("/api/v1/admin/users?limit=2&offset=0", &session);
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let payload = common::body_json(response).await;
    let items = payload["items"].as_array().expect("items array");
    assert_eq!(items.len(), 2, "limit=2 should cap the page");
    assert_eq!(payload["page"]["limit"], 2);
    assert_eq!(payload["page"]["offset"], 0);
    assert_eq!(payload["page"]["total"], 4);

    let first = &items[0];
    assert!(first.get("is_admin").is_some());
    assert!(
        first.get("password_hash").is_none(),
        "password_hash must never leak even to other admins"
    );
}
// `?search=` narrows the listing to usernames containing the fragment,
// and `page.total` reflects the filtered count.
#[sqlx::test(migrations = "./migrations")]
async fn list_filters_by_substring_search(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (_admin_name, session, _) = seed_admin(&pool, &harness.app).await;

    // Register one user with a distinctive, known name.
    let register = common::post_json(
        "/api/v1/auth/register",
        json!({ "username": "zzzfindme01", "password": "hunter2hunter2" }),
    );
    let registered = harness.app.clone().oneshot(register).await.unwrap();
    assert_eq!(registered.status(), StatusCode::CREATED);

    let search = common::get_with_cookie("/api/v1/admin/users?search=zzzfindme", &session);
    let response = harness.app.oneshot(search).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let payload = common::body_json(response).await;
    let matches = payload["items"].as_array().unwrap();
    assert_eq!(matches.len(), 1, "search must narrow to the one match");
    assert_eq!(matches[0]["username"], "zzzfindme01");
    assert_eq!(payload["page"]["total"], 1);
}
// ---- self-protection -------------------------------------------------------
// An admin deleting their own account gets 409 with a message that
// mentions "yourself".
#[sqlx::test(migrations = "./migrations")]
async fn cannot_self_delete(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (_name, session, actor_id) = seed_admin(&pool, &harness.app).await;
    // A second admin exists, so the conflict can't be attributed to the
    // last-admin guard.
    let (_other, _, _) = seed_admin(&pool, &harness.app).await;

    let uri = format!("/api/v1/admin/users/{actor_id}");
    let request = common::delete_with_cookie(&uri, &session);
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::CONFLICT);

    let payload = common::body_json(response).await;
    assert_eq!(payload["error"]["code"], "conflict");
    let message = &payload["error"]["message"];
    assert!(
        message.as_str().unwrap().contains("yourself"),
        "message must call out the self-action; got {:?}",
        message
    );
}
// An admin demoting themselves gets 409 with a message that mentions
// "yourself".
#[sqlx::test(migrations = "./migrations")]
async fn cannot_self_demote(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (_name, session, actor_id) = seed_admin(&pool, &harness.app).await;
    // Second admin present, mirroring the self-delete test's setup.
    let (_other, _, _) = seed_admin(&pool, &harness.app).await;

    let uri = format!("/api/v1/admin/users/{actor_id}");
    let request = common::patch_json_with_cookie(&uri, json!({ "is_admin": false }), &session);
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::CONFLICT);

    let payload = common::body_json(response).await;
    let message = payload["error"]["message"].as_str().unwrap();
    assert!(message.contains("yourself"));
}
// ---- last-admin invariant (repo layer, see file header) --------------------
// Repo-level check: with two admins the first demotion succeeds, and
// demoting the remaining one is refused with a "last admin" conflict.
#[sqlx::test(migrations = "./migrations")]
async fn last_admin_demote_refused_at_repo(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (_a, _, a_id) = seed_admin(&pool, &harness.app).await;
    let (_b, _, b_id) = seed_admin(&pool, &harness.app).await;

    // Both are admins. B demotes A: one admin is still left, so it passes.
    let first = repo::user::admin_safe_set_is_admin(&pool, b_id, a_id, false).await;
    let demoted = first.expect("first demote succeeds");
    assert!(!demoted.is_admin);

    // Only B is left. Demoting B (with A passed as the actor) must fail.
    let second = repo::user::admin_safe_set_is_admin(&pool, a_id, b_id, false).await;
    let err = second.expect_err("second demote must be refused");
    match err {
        AppError::Conflict(m) => assert!(
            m.contains("last admin"),
            "expected last-admin conflict; got {m:?}"
        ),
        other => panic!("expected Conflict, got {other:?}"),
    }
}
// Repo-level check: with two admins the first deletion succeeds, and
// deleting the remaining one is refused with a "last admin" conflict.
#[sqlx::test(migrations = "./migrations")]
async fn last_admin_delete_refused_at_repo(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (_a, _, a_id) = seed_admin(&pool, &harness.app).await;
    let (_b, _, b_id) = seed_admin(&pool, &harness.app).await;

    // Both are admins. B deletes A: one admin is still left, so it passes.
    let first = repo::user::admin_safe_delete(&pool, b_id, a_id).await;
    first.expect("first delete succeeds");

    // Only B is left. A freshly registered non-admin acts as the actor
    // for the second attempt.
    let (actor_name, _actor_cookie) = common::register_user(&harness.app).await;
    let actor = repo::user::find_by_username(&pool, &actor_name)
        .await
        .unwrap()
        .unwrap();
    let c_id = actor.id;

    let second = repo::user::admin_safe_delete(&pool, c_id, b_id).await;
    let err = second.expect_err("second delete must be refused");
    match err {
        AppError::Conflict(m) => assert!(
            m.contains("last admin"),
            "expected last-admin conflict; got {m:?}"
        ),
        other => panic!("expected Conflict, got {other:?}"),
    }
}
// Race test for the last-admin invariant: two admins each try to demote
// the other concurrently. Without serialisation both would observe
// `count = 2`, both would commit, and the system would end up with zero
// admins. Exactly one demote may succeed, leaving exactly one admin.
#[sqlx::test(migrations = "./migrations")]
async fn parallel_demotes_cannot_orphan_admins(pool: PgPool) {
    let h = common::harness(pool.clone());
    let (_a, _, a_id) = seed_admin(&pool, &h.app).await;
    let (_b, _, b_id) = seed_admin(&pool, &h.app).await;

    // Each spawned task owns its own pool handle.
    let pool_x = pool.clone();
    let pool_y = pool.clone();
    let task_x = tokio::spawn(async move {
        repo::user::admin_safe_set_is_admin(&pool_x, a_id, b_id, false).await
    });
    let task_y = tokio::spawn(async move {
        repo::user::admin_safe_set_is_admin(&pool_y, b_id, a_id, false).await
    });
    // The outer unwrap only surfaces a panicked/cancelled task; the inner
    // Result is the demote outcome under test.
    let r_x = task_x.await.unwrap();
    let r_y = task_y.await.unwrap();

    let outcomes = (r_x.is_ok(), r_y.is_ok());
    assert!(
        outcomes == (true, false) || outcomes == (false, true),
        "exactly one of the two parallel demotes must succeed; got {outcomes:?}"
    );

    let (count,): (i64,) =
        sqlx::query_as("SELECT COUNT(*) FROM users WHERE is_admin = true")
            .fetch_one(&pool)
            .await
            .unwrap();
    // Two admins minus one successful demote: the assertion is an exact
    // equality, so the message now says so (it used to say "at least one").
    assert_eq!(count, 1, "exactly one admin must remain");
}
// ---- audit log -------------------------------------------------------------
// Promoting a regular user via PATCH writes exactly one `promote_user`
// audit row naming the acting admin and the target user.
#[sqlx::test(migrations = "./migrations")]
async fn promote_writes_audit_row(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (_admin_name, admin_session, a_id) = seed_admin(&pool, &harness.app).await;
    let (target_name, _target_session) = common::register_user(&harness.app).await;
    let b = repo::user::find_by_username(&pool, &target_name)
        .await
        .unwrap()
        .unwrap();

    let uri = format!("/api/v1/admin/users/{}", b.id);
    let request =
        common::patch_json_with_cookie(&uri, json!({ "is_admin": true }), &admin_session);
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let audit_rows: Vec<(Option<Uuid>, String, String, Option<Uuid>)> = sqlx::query_as(
        "SELECT actor_user_id, action, target_kind, target_id FROM admin_audit",
    )
    .fetch_all(&pool)
    .await
    .unwrap();
    assert_eq!(audit_rows.len(), 1);

    let (actor, action, kind, target) = audit_rows.into_iter().next().unwrap();
    assert_eq!(actor, Some(a_id));
    assert_eq!(action, "promote_user");
    assert_eq!(kind, "user");
    assert_eq!(target, Some(b.id));
}
// Regression: PATCH {is_admin: true} on someone who is already an admin
// used to run a no-op UPDATE and still INSERT a misleading
// "promote_user" audit row. The request must still return 200 but leave
// `admin_audit` empty.
#[sqlx::test(migrations = "./migrations")]
async fn redundant_promote_does_not_write_audit_row(pool: PgPool) {
    let h = common::harness(pool.clone());
    let (_a_name, a_cookie, _a_id) = seed_admin(&pool, &h.app).await;
    // `seed_admin` already returns the id of the user it promoted, so the
    // former second lookup by username was redundant.
    let (_b_name, _b_cookie, b_id) = seed_admin(&pool, &h.app).await; // already admin

    let resp = h
        .app
        .oneshot(common::patch_json_with_cookie(
            &format!("/api/v1/admin/users/{}", b_id),
            json!({ "is_admin": true }),
            &a_cookie,
        ))
        .await
        .unwrap();
    assert_eq!(resp.status(), StatusCode::OK);

    let (count,): (i64,) = sqlx::query_as("SELECT COUNT(*) FROM admin_audit")
        .fetch_one(&pool)
        .await
        .unwrap();
    assert_eq!(count, 0, "no-op promote must not write audit row");
}
// Deleting a regular user via the admin API returns 204 and writes one
// `delete_user` audit row whose payload records the deleted username
// and that the user was not an admin.
#[sqlx::test(migrations = "./migrations")]
async fn delete_writes_audit_row(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (_admin_name, admin_session, a_id) = seed_admin(&pool, &harness.app).await;
    let (b_name, _target_session) = common::register_user(&harness.app).await;
    let b = repo::user::find_by_username(&pool, &b_name)
        .await
        .unwrap()
        .unwrap();

    let uri = format!("/api/v1/admin/users/{}", b.id);
    let request = common::delete_with_cookie(&uri, &admin_session);
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::NO_CONTENT);

    let audit_rows: Vec<(Option<Uuid>, String, String, Option<Uuid>, serde_json::Value)> =
        sqlx::query_as(
            "SELECT actor_user_id, action, target_kind, target_id, payload FROM admin_audit",
        )
        .fetch_all(&pool)
        .await
        .unwrap();
    assert_eq!(audit_rows.len(), 1);

    let (actor, action, kind, target, payload) = audit_rows.into_iter().next().unwrap();
    assert_eq!(actor, Some(a_id));
    assert_eq!(action, "delete_user");
    assert_eq!(kind, "user");
    assert_eq!(target, Some(b.id));
    assert_eq!(payload["username"], b_name);
    assert_eq!(payload["was_admin"], false);
}
// ---- POST /admin/users (admin-create) --------------------------------------
/// POST /admin/users with the session of a user created through
/// `common::register_user` (rather than `seed_admin`) is answered with 403.
#[sqlx::test(migrations = "./migrations")]
async fn create_user_requires_admin(pool: PgPool) {
    let harness = common::harness(pool);
    let (_name, session_cookie) = common::register_user(&harness.app).await;

    let request = common::post_json_with_cookie(
        "/api/v1/admin/users",
        json!({ "username": "newbie", "password": "hunter2hunter2" }),
        &session_cookie,
    );
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::FORBIDDEN);
}
/// POST /admin/users without any credentials is answered with 401.
#[sqlx::test(migrations = "./migrations")]
async fn create_user_unauthenticated_is_rejected(pool: PgPool) {
    let harness = common::harness(pool);

    let request = common::post_json(
        "/api/v1/admin/users",
        json!({ "username": "newbie", "password": "hunter2hunter2" }),
    );
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::UNAUTHORIZED);
}
/// Happy path for POST /admin/users: the response describes the new
/// non-admin user without exposing `password_hash`, and the most recent
/// `admin_audit` row records a `create_user` action by the calling admin.
#[sqlx::test(migrations = "./migrations")]
async fn create_user_happy_path_creates_user_and_audit(pool: PgPool) {
    type AuditRow = (Option<Uuid>, String, String, Option<Uuid>, serde_json::Value);

    let harness = common::harness(pool.clone());
    let (_admin_name, admin_cookie, admin_id) = seed_admin(&pool, &harness.app).await;

    let request = common::post_json_with_cookie(
        "/api/v1/admin/users",
        json!({ "username": "invited01", "password": "freshpass1234" }),
        &admin_cookie,
    );
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::CREATED);

    // --- response body ---
    let created = common::body_json(response).await;
    assert_eq!(created["username"], "invited01");
    assert_eq!(created["is_admin"], false);
    assert!(created["id"].as_str().is_some());
    assert!(
        created.get("password_hash").is_none(),
        "password_hash must never appear in admin-create response"
    );
    let created_id = Uuid::parse_str(created["id"].as_str().unwrap()).unwrap();

    // --- audit trail ---
    let latest: AuditRow = sqlx::query_as(
        "SELECT actor_user_id, action, target_kind, target_id, payload \
         FROM admin_audit ORDER BY at DESC LIMIT 1",
    )
    .fetch_one(&pool)
    .await
    .unwrap();
    let (actor, action, kind, target, payload) = latest;
    assert_eq!(actor, Some(admin_id));
    assert_eq!(action, "create_user");
    assert_eq!(kind, "user");
    assert_eq!(target, Some(created_id));
    assert_eq!(payload["username"], "invited01");
    assert_eq!(payload["is_admin"], false);
}
/// Passing `"is_admin": true` to POST /admin/users creates the account as
/// an admin directly; the response reflects the flag.
#[sqlx::test(migrations = "./migrations")]
async fn create_user_can_mint_an_admin_in_one_call(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (_admin_name, admin_cookie, _) = seed_admin(&pool, &harness.app).await;

    let new_admin = json!({
        "username": "newadmin",
        "password": "freshpass1234",
        "is_admin": true
    });
    let request =
        common::post_json_with_cookie("/api/v1/admin/users", new_admin, &admin_cookie);
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::CREATED);

    let created = common::body_json(response).await;
    assert_eq!(created["is_admin"], true);
}
/// Admin-creating "Taken" after "taken" was registered must yield 409 with
/// the `conflict` error code, i.e. usernames collide case-insensitively.
#[sqlx::test(migrations = "./migrations")]
async fn create_user_returns_409_on_duplicate(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (_admin_name, admin_cookie, _) = seed_admin(&pool, &harness.app).await;

    // Seed the pre-existing account through the public register endpoint.
    let seed_request = common::post_json(
        "/api/v1/auth/register",
        json!({ "username": "taken", "password": "hunter2hunter2" }),
    );
    let seeded = harness.app.clone().oneshot(seed_request).await.unwrap();
    assert_eq!(seeded.status(), StatusCode::CREATED);

    // Same name, different case, via the admin endpoint.
    let duplicate_request = common::post_json_with_cookie(
        "/api/v1/admin/users",
        json!({ "username": "Taken", "password": "freshpass1234" }),
        &admin_cookie,
    );
    let response = harness.app.oneshot(duplicate_request).await.unwrap();
    assert_eq!(
        response.status(),
        StatusCode::CONFLICT,
        "case-insensitive collision via the lower(username) index"
    );

    let error_body = common::body_json(response).await;
    assert_eq!(error_body["error"]["code"], "conflict");
}
/// The password "short" is refused by POST /admin/users with 400 and the
/// `invalid_input` error code.
#[sqlx::test(migrations = "./migrations")]
async fn create_user_rejects_weak_password(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (_admin_name, admin_cookie, _) = seed_admin(&pool, &harness.app).await;

    let request = common::post_json_with_cookie(
        "/api/v1/admin/users",
        json!({ "username": "okayname", "password": "short" }),
        &admin_cookie,
    );
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::BAD_REQUEST);

    let error_body = common::body_json(response).await;
    assert_eq!(error_body["error"]["code"], "invalid_input");
}
/// The username "bad name!" is refused by POST /admin/users with 400.
#[sqlx::test(migrations = "./migrations")]
async fn create_user_rejects_invalid_username(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (_admin_name, admin_cookie, _) = seed_admin(&pool, &harness.app).await;

    let request = common::post_json_with_cookie(
        "/api/v1/admin/users",
        json!({ "username": "bad name!", "password": "freshpass1234" }),
        &admin_cookie,
    );
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::BAD_REQUEST);
}
/// The admin-create endpoint must keep working when self-registration is
/// switched off: providing a way in without ALLOW_SELF_REGISTER is the
/// whole reason the endpoint exists.
#[sqlx::test(migrations = "./migrations")]
async fn create_user_works_even_when_self_register_disabled(pool: PgPool) {
    let harness = common::harness_with_self_register_disabled(pool.clone());

    // Self-register would refuse, so the admin is bootstrapped directly
    // through the repo layer and then logged in over HTTP.
    repo::user::bootstrap_admin(&pool, "root", "hunter2hunter2")
        .await
        .unwrap();
    let login_request = common::post_json(
        "/api/v1/auth/login",
        json!({ "username": "root", "password": "hunter2hunter2" }),
    );
    let login = harness.app.clone().oneshot(login_request).await.unwrap();
    assert_eq!(login.status(), StatusCode::OK);
    let admin_cookie = common::extract_session_cookie(&login).unwrap();

    let create_request = common::post_json_with_cookie(
        "/api/v1/admin/users",
        json!({ "username": "invited01", "password": "freshpass1234" }),
        &admin_cookie,
    );
    let created = harness.app.oneshot(create_request).await.unwrap();
    assert_eq!(
        created.status(),
        StatusCode::CREATED,
        "admin must be able to mint users even with self-register off"
    );
}

View File

@@ -567,166 +567,6 @@ async fn user_a_cannot_delete_user_b_token(pool: PgPool) {
assert_eq!(resp.status(), StatusCode::NO_CONTENT);
}
/// Username enumeration via login response time: an attacker probes
/// for valid usernames by measuring how long /auth/login takes. Before
/// the equalisation fix, the no-user branch returned 401 in <1 ms
/// while the wrong-password branch took ~50-100 ms (the argon2 verify
/// cost). This test asserts the no-user branch now spends at least
/// some meaningful fraction of the wrong-password branch's time.
///
/// Tolerance is intentionally loose so CI variance doesn't flap the
/// test. The unequalised gap is large enough (~50x) that even a noisy
/// CI run with a 5x slack still catches it.
#[sqlx::test(migrations = "./migrations")]
async fn login_no_user_branch_runs_argon2_for_timing_equalisation(pool: PgPool) {
    use std::time::Instant;

    let h = common::harness(pool);

    // Register the victim user so the wrong-password branch has a real
    // argon2 hash to verify against.
    let registered = h
        .app
        .clone()
        .oneshot(common::post_json(
            "/api/v1/auth/register",
            json!({ "username": "victim", "password": "hunter2hunter2" }),
        ))
        .await
        .unwrap();
    // If registration silently failed, "victim" and "ghost" would both hit
    // the no-user branch and the timing comparison below would pass
    // vacuously. Fail loudly instead.
    assert_eq!(
        registered.status(),
        StatusCode::CREATED,
        "victim must exist for the wrong-password branch to be exercised"
    );

    // Warm-up: first login of the process initialises the dummy hash
    // lazily. Skip that cost when measuring. Both branches are warmed.
    for username in ["victim", "ghost"] {
        let _ = h
            .app
            .clone()
            .oneshot(common::post_json(
                "/api/v1/auth/login",
                json!({ "username": username, "password": "wrong" }),
            ))
            .await
            .unwrap();
    }

    /// Fires `n` wrong-password logins for `username` and returns the
    /// fastest observed round-trip. Every response must be 401.
    ///
    /// Min-of-N (not median) is used on purpose: the minimum approximates
    /// the floor that argon2 takes and is robust against unrelated stalls
    /// (DB connection acquisition, etc.).
    async fn sample_min(
        app: &axum::Router,
        username: &str,
        n: u32,
    ) -> std::time::Duration {
        let mut samples = Vec::with_capacity(n as usize);
        for _ in 0..n {
            let req = common::post_json(
                "/api/v1/auth/login",
                json!({ "username": username, "password": "wrong-guess" }),
            );
            let t = Instant::now();
            let resp = app.clone().oneshot(req).await.unwrap();
            let d = t.elapsed();
            assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
            samples.push(d);
        }
        *samples.iter().min().unwrap()
    }

    let wrong_pwd = sample_min(&h.app, "victim", 3).await;
    let no_user = sample_min(&h.app, "ghost", 3).await;

    // 5x slack: argon2 dominates both branches, so they should be
    // within an order of magnitude. Unequalised, no_user would be
    // ~50-100x faster. Asserting "no_user >= wrong_pwd / 5" catches
    // the bug without being flaky in CI.
    assert!(
        no_user * 5 >= wrong_pwd,
        "login timing leaks user existence: no_user={no_user:?}, wrong_pwd={wrong_pwd:?}"
    );
}
/// Brute-force / spray protection: with a tight auth rate limit
/// (`per_sec = 1`, `burst = 3`), a back-to-back loop of /auth/login
/// attempts must hit a 429 that carries a numeric `Retry-After` header and
/// the `too_many_requests` error code.
#[sqlx::test(migrations = "./migrations")]
async fn login_rate_limited_under_burst_pressure(pool: PgPool) {
    let harness = common::harness_with_auth_rate_limit(pool, 1, 3);

    // A real victim makes the wrong-password branch do real work.
    let _ = harness
        .app
        .clone()
        .oneshot(common::post_json("/api/v1/auth/register", creds("victim")))
        .await
        .unwrap();

    // The register call already spent one token of the burst-3 bucket.
    // Thirty immediate wrong-password logins outpace a 1/s refill, so at
    // least one of them has to be rejected with 429.
    let mut saw_429 = false;
    for _ in 0..30 {
        let attempt = common::post_json(
            "/api/v1/auth/login",
            json!({ "username": "victim", "password": "wrong" }),
        );
        let response = harness.app.clone().oneshot(attempt).await.unwrap();
        if response.status() != StatusCode::TOO_MANY_REQUESTS {
            continue;
        }

        // RFC 6585 §4: 429 SHOULD include a Retry-After header. The value
        // is in seconds; with per_sec=1 the bucket needs ~1s to refill, so
        // the header should be 1 or 2.
        let retry_after = response
            .headers()
            .get(axum::http::header::RETRY_AFTER)
            .and_then(|value| value.to_str().ok())
            .and_then(|text| text.parse::<u32>().ok())
            .expect("Retry-After header present and numeric");
        assert!(
            retry_after >= 1,
            "Retry-After must be at least 1s, got {retry_after}"
        );

        let error_body = common::body_json(response).await;
        assert_eq!(error_body["error"]["code"], "too_many_requests");

        saw_429 = true;
        break;
    }

    assert!(
        saw_429,
        "expected at least one 429 within 30 rapid login attempts"
    );
}
/// The default test harness runs with rate limiting disabled, so tests
/// that fire many auth requests keep seeing plain 401s rather than 429s.
#[sqlx::test(migrations = "./migrations")]
async fn default_test_harness_does_not_rate_limit(pool: PgPool) {
    let harness = common::harness(pool);

    for i in 0..50 {
        let attempt = common::post_json(
            "/api/v1/auth/login",
            json!({ "username": format!("nobody-{i}"), "password": "x" }),
        );
        let response = harness.app.clone().oneshot(attempt).await.unwrap();

        // Unknown user => 401 on every iteration; a 429 here would mean
        // the limiter is active.
        assert_eq!(response.status(), StatusCode::UNAUTHORIZED, "iter {i}");
    }
}
#[sqlx::test(migrations = "./migrations")]
async fn delete_unknown_token_is_404(pool: PgPool) {
let h = common::harness(pool);
@@ -741,68 +581,3 @@ async fn delete_unknown_token_is_404(pool: PgPool) {
.unwrap();
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
}
/// Bot token names are free-form user input and used to be unbounded (a
/// 10 MB name was accepted). They are capped at 64 chars, matching the
/// other free-form identifier caps (tags, collection names). The rejection
/// uses `ValidationFailed` (422 with per-field details), the same shape
/// clients already handle for `attach_tag`.
#[sqlx::test(migrations = "./migrations")]
async fn create_token_rejects_name_over_64_chars(pool: PgPool) {
    let harness = common::harness(pool);
    let (_, session_cookie) = common::register_user(&harness.app).await;

    // 65 characters: one past the cap.
    let oversized_name = "x".repeat(65);
    let request = common::post_json_with_cookie(
        "/api/v1/auth/tokens",
        json!({ "name": oversized_name }),
        &session_cookie,
    );
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::UNPROCESSABLE_ENTITY);

    let error_body = common::body_json(response).await;
    assert_eq!(error_body["error"]["code"], "validation_failed");
    assert!(error_body["error"]["details"]["name"].is_string());
}
// ---- self-register toggle + /auth/config -----------------------------------
/// With the default harness, GET /auth/config reports
/// `self_register_enabled: true`.
#[sqlx::test(migrations = "./migrations")]
async fn auth_config_reports_self_register_enabled_by_default(pool: PgPool) {
    let harness = common::harness(pool);

    let request = common::get("/api/v1/auth/config");
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let config = common::body_json(response).await;
    assert_eq!(config["self_register_enabled"], true);
}
/// With self-registration disabled in the harness, GET /auth/config
/// reports `self_register_enabled: false`.
#[sqlx::test(migrations = "./migrations")]
async fn auth_config_reflects_self_register_disabled(pool: PgPool) {
    let harness = common::harness_with_self_register_disabled(pool);

    let request = common::get("/api/v1/auth/config");
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let config = common::body_json(response).await;
    assert_eq!(config["self_register_enabled"], false);
}
/// With self-registration disabled, POST /auth/register is answered with
/// 403 and the `forbidden` error code.
#[sqlx::test(migrations = "./migrations")]
async fn register_returns_403_when_self_register_disabled(pool: PgPool) {
    let harness = common::harness_with_self_register_disabled(pool);

    let request = common::post_json("/api/v1/auth/register", creds("alice"));
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::FORBIDDEN);

    let error_body = common::body_json(response).await;
    assert_eq!(error_body["error"]["code"], "forbidden");
}

View File

@@ -410,53 +410,3 @@ async fn delete_cover_404_on_unknown_id(pool: PgPool) {
.unwrap();
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
}
/// Authz: PUT /mangas/:id/cover is uploader-only. A second registered user
/// trying to replace the cover of someone else's manga gets 403.
#[sqlx::test(migrations = "./migrations")]
async fn put_cover_forbidden_for_non_uploader(pool: PgPool) {
    let fixture = harness(pool);
    let (_, owner_cookie) = register_user(&fixture.app).await;
    let (_, intruder_cookie) = register_user(&fixture.app).await;

    // The manga belongs to the owner and starts without a cover.
    let manga = create_manga_with_cover(&fixture.app, &owner_cookie, "Mine", None).await;
    let id = id_of(&manga);

    let cover_url = format!("/api/v1/mangas/{id}/cover");
    let png = fake_png_bytes();
    let request = put_multipart_with_cookie(&cover_url, cover_form(&png), &intruder_cookie);
    let response = fixture.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::FORBIDDEN);
}
/// Authz: DELETE /mangas/:id/cover is uploader-only. A second registered
/// user trying to remove the cover of someone else's manga gets 403.
#[sqlx::test(migrations = "./migrations")]
async fn delete_cover_forbidden_for_non_uploader(pool: PgPool) {
    let fixture = harness(pool);
    let (_, owner_cookie) = register_user(&fixture.app).await;
    let (_, intruder_cookie) = register_user(&fixture.app).await;

    // The owner's manga is created with a JPEG cover already attached.
    let jpeg = fake_jpeg_bytes();
    let manga = create_manga_with_cover(
        &fixture.app,
        &owner_cookie,
        "Mine",
        Some(("image/jpeg", &jpeg)),
    )
    .await;
    let id = id_of(&manga);

    let cover_url = format!("/api/v1/mangas/{id}/cover");
    let request = delete_with_cookie(&cover_url, &intruder_cookie);
    let response = fixture.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::FORBIDDEN);
}

View File

@@ -566,78 +566,3 @@ async fn patch_requires_authentication(pool: PgPool) {
.unwrap();
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
}
/// A signed-in user who did not upload the manga must not be able to
/// PATCH it. Before the uploader gate this returned 200 — see REVIEW.md
/// "manga PATCH / cover endpoints don't check ownership".
#[sqlx::test(migrations = "./migrations")]
async fn patch_forbidden_for_non_uploader(pool: PgPool) {
    let harness = common::harness(pool);
    let (_, owner_cookie) = common::register_user(&harness.app).await;
    let (_, intruder_cookie) = common::register_user(&harness.app).await;

    let manga = create_manga(&harness.app, &owner_cookie, json!({ "title": "Mine" })).await;
    let id = id_of(&manga);

    let manga_url = format!("/api/v1/mangas/{id}");
    let request = common::patch_json_with_cookie(
        &manga_url,
        json!({ "status": "completed" }),
        &intruder_cookie,
    );
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::FORBIDDEN);
}
/// Regression guard for the authz fix: the uploader can still PATCH their
/// own manga and gets 200.
#[sqlx::test(migrations = "./migrations")]
async fn patch_allowed_for_uploader(pool: PgPool) {
    let harness = common::harness(pool);
    let (_, owner_cookie) = common::register_user(&harness.app).await;

    let manga = create_manga(&harness.app, &owner_cookie, json!({ "title": "Owned" })).await;
    let id = id_of(&manga);

    let manga_url = format!("/api/v1/mangas/{id}");
    let request = common::patch_json_with_cookie(
        &manga_url,
        json!({ "status": "completed" }),
        &owner_cookie,
    );
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::OK);
}
/// Legacy rows with `uploaded_by IS NULL` (created before migration 0011)
/// stay editable by any signed-in user; dropping this carve-out would
/// break the historical-data note in 0011.
#[sqlx::test(migrations = "./migrations")]
async fn patch_allowed_on_legacy_null_uploader(pool: PgPool) {
    let harness = common::harness(pool.clone());
    let (_, creator_cookie) = common::register_user(&harness.app).await;
    let manga = create_manga(&harness.app, &creator_cookie, json!({ "title": "Legacy" })).await;
    let id = id_of(&manga);

    // Emulate a pre-0011 row by nulling out uploaded_by straight in SQL.
    sqlx::query("UPDATE mangas SET uploaded_by = NULL WHERE id = $1")
        .bind(id)
        .execute(&pool)
        .await
        .unwrap();

    // A different user edits the now-ownerless manga.
    let (_, other_cookie) = common::register_user(&harness.app).await;
    let manga_url = format!("/api/v1/mangas/{id}");
    let request = common::patch_json_with_cookie(
        &manga_url,
        json!({ "status": "completed" }),
        &other_cookie,
    );
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::OK);
}

View File

@@ -1,189 +0,0 @@
//! Site-wide auth gate (`PRIVATE_MODE=true`).
//!
//! With private mode on, every API path except a small allowlist
//! (`/health`, `/auth/config`, `/auth/login`, `/auth/logout`) requires
//! a valid session cookie or bearer token, and `/auth/register` is
//! force-blocked regardless of `ALLOW_SELF_REGISTER`. With private mode
//! off (the default), nothing changes — the `public_mode_*` tests
//! pin that regression guard.
mod common;
use serde_json::json;
use sqlx::PgPool;
use tower::ServiceExt;
use axum::http::StatusCode;
/// In private mode an anonymous GET /mangas is answered with 401.
#[sqlx::test(migrations = "./migrations")]
async fn private_mode_blocks_anonymous_manga_list(pool: PgPool) {
    let harness = common::harness_with_private_mode(pool);

    let request = common::get("/api/v1/mangas");
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::UNAUTHORIZED);
}
/// In private mode an anonymous GET under /files is answered with 401
/// even for a path that does not exist.
#[sqlx::test(migrations = "./migrations")]
async fn private_mode_blocks_anonymous_files(pool: PgPool) {
    let harness = common::harness_with_private_mode(pool);

    // The guard runs before routing, so a made-up path yields 401 rather
    // than 404. That is the property pinned here: crafted URLs leak
    // nothing.
    let request = common::get("/api/v1/files/anything.png");
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::UNAUTHORIZED);
}
/// A valid session cookie passes the private-mode gate: GET /mangas
/// returns 200.
#[sqlx::test(migrations = "./migrations")]
async fn private_mode_allows_session_cookie_read(pool: PgPool) {
    // The session is created through a non-private harness on the same
    // pool so the session row exists; the gate is then exercised by a
    // fresh private-mode harness backed by that same database.
    let open_site = common::harness(pool.clone());
    let (_, session_cookie) = common::register_user(&open_site.app).await;

    let gated_site = common::harness_with_private_mode(pool);
    let request = common::get_with_cookie("/api/v1/mangas", &session_cookie);
    let response = gated_site.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::OK);
}
/// A valid bearer token passes the private-mode gate: GET /mangas
/// returns 200.
#[sqlx::test(migrations = "./migrations")]
async fn private_mode_allows_bearer_token_read(pool: PgPool) {
    // Mint the token through a non-private harness on the same pool.
    let open_site = common::harness(pool.clone());
    let (_, session_cookie) = common::register_user(&open_site.app).await;
    let mint_request = common::post_json_with_cookie(
        "/api/v1/auth/tokens",
        json!({ "name": "private-mode-bot" }),
        &session_cookie,
    );
    let minted = open_site.app.clone().oneshot(mint_request).await.unwrap();
    assert_eq!(minted.status(), StatusCode::CREATED);
    let token_body = common::body_json(minted).await;
    let bearer = token_body["bearer"].as_str().unwrap().to_string();

    // Present it to a private-mode harness backed by the same database.
    let gated_site = common::harness_with_private_mode(pool);
    let request = common::get_with_bearer("/api/v1/mangas", &bearer);
    let response = gated_site.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::OK);
}
/// In private mode, `/auth/login` must stay reachable without credentials:
/// an anonymous login with valid username/password reaches the handler and
/// returns 200 instead of being stopped by the gate with 401.
#[sqlx::test(migrations = "./migrations")]
async fn private_mode_allows_login_endpoint_anonymous(pool: PgPool) {
    // Seed a user via the public harness so login has credentials to
    // verify against.
    let public = common::harness(pool.clone());
    let seeded = public
        .app
        .clone()
        .oneshot(common::post_json(
            "/api/v1/auth/register",
            json!({ "username": "alice", "password": "hunter2hunter2" }),
        ))
        .await
        .unwrap();
    // Without this check a failed seed would show up below as a 401 from
    // the login handler and be misread as the gate blocking the request.
    assert_eq!(
        seeded.status(),
        StatusCode::CREATED,
        "seeding the login user through the public harness must succeed"
    );

    let private = common::harness_with_private_mode(pool);
    let resp = private
        .app
        .oneshot(common::post_json(
            "/api/v1/auth/login",
            json!({ "username": "alice", "password": "hunter2hunter2" }),
        ))
        .await
        .unwrap();
    // Reaches the login handler and succeeds — *not* 401 from the
    // gate. That's the property we're pinning.
    assert_eq!(resp.status(), StatusCode::OK);
}
/// `/health` and `/auth/config` stay anonymously reachable in private
/// mode: both return 200 without credentials.
#[sqlx::test(migrations = "./migrations")]
async fn private_mode_allows_health_and_config_anonymous(pool: PgPool) {
    let harness = common::harness_with_private_mode(pool);

    // Checked in the same order as before: health first, then config.
    for path in ["/api/v1/health", "/api/v1/auth/config"] {
        let response = harness
            .app
            .clone()
            .oneshot(common::get(path))
            .await
            .unwrap();
        assert_eq!(response.status(), StatusCode::OK);
    }
}
/// Private mode force-blocks POST /auth/register with 403 `forbidden`,
/// even though the harness leaves `allow_self_register` at its default of
/// `true`.
#[sqlx::test(migrations = "./migrations")]
async fn private_mode_blocks_register_even_when_self_register_enabled(pool: PgPool) {
    // harness_with_private_mode does not touch `allow_self_register`, so
    // the refusal asserted below comes from private mode alone.
    let harness = common::harness_with_private_mode(pool);

    let request = common::post_json(
        "/api/v1/auth/register",
        json!({ "username": "alice", "password": "hunter2hunter2" }),
    );
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::FORBIDDEN);

    let error_body = common::body_json(response).await;
    assert_eq!(error_body["error"]["code"], "forbidden");
}
/// In private mode GET /auth/config reports `private_mode: true` and the
/// *effective* self-register flag, which is `false`.
#[sqlx::test(migrations = "./migrations")]
async fn auth_config_reports_private_mode_and_effective_self_register(pool: PgPool) {
    let harness = common::harness_with_private_mode(pool);

    let request = common::get("/api/v1/auth/config");
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let config = common::body_json(response).await;
    assert_eq!(config["private_mode"], true);
    // Effective value is `allow_self_register && !private_mode`: false
    // here although the raw `allow_self_register` setting is true.
    assert_eq!(config["self_register_enabled"], false);
}
/// Regression guard: with private mode off (the default) the gate is a
/// no-op, so an anonymous GET /mangas still returns 200 and existing
/// public deployments stay public.
#[sqlx::test(migrations = "./migrations")]
async fn public_mode_does_not_gate_anonymous_reads(pool: PgPool) {
    let harness = common::harness(pool);

    let request = common::get("/api/v1/mangas");
    let response = harness.app.oneshot(request).await.unwrap();

    assert_eq!(response.status(), StatusCode::OK);
}
/// With the default harness GET /auth/config reports `private_mode: false`
/// and `self_register_enabled: true`.
#[sqlx::test(migrations = "./migrations")]
async fn public_mode_reports_private_mode_false(pool: PgPool) {
    let harness = common::harness(pool);

    let request = common::get("/api/v1/auth/config");
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::OK);

    let config = common::body_json(response).await;
    assert_eq!(config["private_mode"], false);
    assert_eq!(config["self_register_enabled"], true);
}

View File

@@ -59,31 +59,6 @@ async fn reattach_same_tag_is_idempotent_and_returns_200(pool: PgPool) {
assert_eq!(second.status(), StatusCode::OK);
}
/// Tag names longer than 64 chars are rejected at the handler boundary.
/// The repo enforces the same cap, but rejecting in the handler keeps the
/// error envelope consistent with the other validation paths (username,
/// collection name, etc.).
#[sqlx::test(migrations = "./migrations")]
async fn attach_rejects_tag_name_over_64_chars(pool: PgPool) {
    let harness = common::harness(pool);
    let (_, session_cookie) = common::register_user(&harness.app).await;
    let manga_id = common::seed_manga_via_api(&harness.app, &session_cookie, "Berserk").await;

    // 65 characters: one past the cap.
    let oversized: String = "x".repeat(65);
    let tags_url = format!("/api/v1/mangas/{manga_id}/tags");
    let request =
        common::post_json_with_cookie(&tags_url, json!({ "name": oversized }), &session_cookie);
    let response = harness.app.oneshot(request).await.unwrap();
    assert_eq!(response.status(), StatusCode::UNPROCESSABLE_ENTITY);

    let error_body = common::body_json(response).await;
    assert_eq!(error_body["error"]["code"], "validation_failed");
}
#[sqlx::test(migrations = "./migrations")]
async fn tag_names_dedup_case_insensitively(pool: PgPool) {
let h = common::harness(pool);

View File

@@ -15,7 +15,6 @@ use tempfile::TempDir;
use tower::ServiceExt;
use mangalord::app::{router, AppState};
use mangalord::auth::rate_limit::AuthRateLimiter;
use mangalord::config::{AuthConfig, UploadConfig};
use mangalord::storage::{LocalStorage, Storage, StorageError, StreamingFile};
@@ -50,117 +49,20 @@ fn harness_inner(
storage: Arc<dyn Storage>,
storage_dir: TempDir,
) -> Harness {
harness_with_auth_config(pool, storage, storage_dir, AuthConfig {
cookie_secure: false,
..AuthConfig::default()
})
}
fn harness_with_auth_config(
pool: PgPool,
storage: Arc<dyn Storage>,
storage_dir: TempDir,
auth: AuthConfig,
) -> Harness {
let auth_limiter = Arc::new(AuthRateLimiter::new(auth.rate_limit));
let state = AppState {
db: pool,
storage,
auth,
auth: AuthConfig { cookie_secure: false, ..AuthConfig::default() },
upload: UploadConfig {
// Keep file caps small in tests so the size-cap path is cheap to
// exercise without producing tens of MBs of bytes.
max_request_bytes: 4 * 1024 * 1024,
max_file_bytes: 256 * 1024,
},
auth_limiter,
// Default harness has no crawler daemon wired up; admin resync
// handlers return 503 in this config. Tests that need a stub
// resync service swap it in via `harness_with_resync`.
resync: None,
crawler: None,
};
Harness { app: router(state), _storage_dir: storage_dir }
}
/// Variant of [`harness`] with `allow_self_register` turned off, so tests
/// can reach the 403 branch in `api::auth::register`. Storage is a fresh
/// temp-dir-backed [`LocalStorage`]; cookies are non-secure.
pub fn harness_with_self_register_disabled(pool: PgPool) -> Harness {
    let tmp = tempfile::tempdir().expect("tempdir");
    let local_storage = Arc::new(LocalStorage::new(tmp.path()));
    harness_with_auth_config(
        pool,
        local_storage,
        tmp,
        AuthConfig {
            cookie_secure: false,
            allow_self_register: false,
            ..AuthConfig::default()
        },
    )
}
/// Variant of [`harness`] with `private_mode` turned on so the site-wide
/// auth gate is exercised. `allow_self_register` is deliberately left at
/// its default so tests can verify that private mode force-disables
/// self-registration regardless of what `ALLOW_SELF_REGISTER` says.
pub fn harness_with_private_mode(pool: PgPool) -> Harness {
    let tmp = tempfile::tempdir().expect("tempdir");
    let local_storage = Arc::new(LocalStorage::new(tmp.path()));
    harness_with_auth_config(
        pool,
        local_storage,
        tmp,
        AuthConfig {
            cookie_secure: false,
            private_mode: true,
            ..AuthConfig::default()
        },
    )
}
/// Variant of [`harness`] with an explicit auth rate limit, used by the
/// brute-force rate-limiting test.
///
/// `per_sec` and `burst` are forwarded unchanged into
/// `RateLimitConfig { per_sec, burst }`.
pub fn harness_with_auth_rate_limit(
    pool: PgPool,
    per_sec: u32,
    burst: u32,
) -> Harness {
    let tmp = tempfile::tempdir().expect("tempdir");
    let local_storage = Arc::new(LocalStorage::new(tmp.path()));
    let limits = mangalord::auth::rate_limit::RateLimitConfig { per_sec, burst };
    harness_with_auth_config(
        pool,
        local_storage,
        tmp,
        AuthConfig {
            cookie_secure: false,
            rate_limit: limits,
            ..AuthConfig::default()
        },
    )
}
/// Variant of [`harness`] that installs a caller-supplied
/// [`ResyncService`] stub as `AppState.resync`. The admin resync tests use
/// it to exercise the endpoint path without standing up a real Chromium.
///
/// Everything else mirrors the default test setup: temp-dir-backed local
/// storage, non-secure cookies, small upload caps and no crawler.
pub fn harness_with_resync(
    pool: PgPool,
    resync: Arc<dyn mangalord::crawler::resync::ResyncService>,
) -> Harness {
    let tmp = tempfile::tempdir().expect("tempdir");
    let local_storage = Arc::new(LocalStorage::new(tmp.path()));

    let auth = AuthConfig {
        cookie_secure: false,
        ..AuthConfig::default()
    };
    // Built before `auth` is moved into the state below.
    let auth_limiter = Arc::new(AuthRateLimiter::new(auth.rate_limit));
    let upload = UploadConfig {
        max_request_bytes: 4 * 1024 * 1024,
        max_file_bytes: 256 * 1024,
    };

    let app = router(AppState {
        db: pool,
        storage: local_storage,
        auth,
        upload,
        auth_limiter,
        resync: Some(resync),
        crawler: None,
    });
    Harness {
        app,
        _storage_dir: tmp,
    }
}
/// Wraps a real `Storage` and fails on the N-th `put` call so tests can
/// assert that handlers roll their DB writes back when storage errors
/// mid-upload. Reads and other operations delegate to `inner`.

View File

@@ -10,11 +10,6 @@
//!
//! Override the cache location with `CRAWLER_CHROMIUM_DIR=/some/path` if
//! `$HOME/.cache/mangalord/chromium` isn't writable.
//!
//! Set `CRAWLER_CHROMIUM_BINARY=/usr/bin/chromium-headless-shell` (or
//! another system chromium path) to exercise the system-chromium
//! launch path instead of the fetcher download — this is the path the
//! Raspberry Pi deployment takes.
use mangalord::crawler::browser::{self, LaunchOptions};

View File

@@ -40,8 +40,6 @@ fn make_cfg(
tz: Tz::UTC,
retention_days: 7,
session_expired,
status: mangalord::crawler::status::StatusHandle::new(workers),
job_timeout: Duration::from_secs(60),
extra_tasks: Vec::new(),
}
}
@@ -90,52 +88,6 @@ impl ChapterDispatcher for PanickingDispatcher {
}
}
/// Dispatcher whose `dispatch` never resolves; used to verify the worker's
/// outer dispatch timeout.
struct HangingDispatcher {
    /// Number of times `dispatch` has been entered.
    seen: AtomicUsize,
}

#[async_trait::async_trait]
impl ChapterDispatcher for HangingDispatcher {
    /// Records the call in `seen`, then awaits a future that is never
    /// ready, so the caller has to time out or cancel.
    async fn dispatch(&self, _payload: JobPayload) -> anyhow::Result<SyncOutcome> {
        let _previous_calls = self.seen.fetch_add(1, Ordering::AcqRel);
        let () = std::future::pending::<()>().await;
        unreachable!("hanging dispatcher never resolves");
    }
}
/// A dispatch that never returns must be cut off by the worker's job
/// timeout and recorded on the job as `last_error = 'dispatch timed out'`.
#[sqlx::test(migrations = "./migrations")]
async fn worker_times_out_a_hung_dispatch_and_acks_failed(pool: PgPool) {
    enqueue_chapter_job(&pool).await;

    let hanging = Arc::new(HangingDispatcher {
        seen: AtomicUsize::new(0),
    });
    let session_expired = Arc::new(std::sync::atomic::AtomicBool::new(false));
    let cancel = CancellationToken::new();

    // Shrink the timeout so the test does not wait for the 60 s default.
    let mut cfg = make_cfg(None, hanging.clone(), session_expired, 1);
    cfg.job_timeout = Duration::from_millis(300);
    let daemon_handle = daemon::spawn(pool.clone(), cancel.clone(), cfg);

    // The hung job should time out and go back to pending with backoff
    // (attempts=1 < max=5). Poll up to 40 x 50 ms for the recorded error.
    let mut timed_out = false;
    for _ in 0..40 {
        let matching: i64 = sqlx::query_scalar(
            "SELECT COUNT(*) FROM crawler_jobs WHERE last_error = 'dispatch timed out'",
        )
        .fetch_one(&pool)
        .await
        .unwrap();
        timed_out = matching == 1;
        if timed_out {
            break;
        }
        tokio::time::sleep(Duration::from_millis(50)).await;
    }

    // Shut the daemon down before asserting so a failure does not leave
    // it running.
    daemon_handle.shutdown().await;
    assert!(timed_out, "hung dispatch must be acked failed with a timeout error");
    assert!(hanging.seen.load(Ordering::Acquire) >= 1);
}
#[sqlx::test(migrations = "./migrations")]
async fn workers_drain_jobs_through_dispatcher(pool: PgPool) {
enqueue_chapter_job(&pool).await;
@@ -418,279 +370,3 @@ async fn enqueue_bookmarked_pending_skips_dropped_sources(pool: PgPool) {
);
}
/// A bookmarked chapter whose previous `sync_chapter_content` job died one
/// day ago must not be re-enqueued by `enqueue_bookmarked_pending`.
#[sqlx::test(migrations = "./migrations")]
async fn enqueue_bookmarked_pending_skips_recently_dead_chapters(pool: PgPool) {
    // Why the quarantine matters: without it the cron tick would spin on
    // such a chapter forever (re-enqueue -> max_attempts retries -> dies
    // again -> re-enqueue on the next tick).

    // --- fixture: user, manga, source, chapter, chapter_source, bookmark ---
    let reader_id: Uuid = sqlx::query_scalar(
        "INSERT INTO users (username, password_hash) VALUES ($1, $2) RETURNING id",
    )
    .bind("alice")
    .bind("not-a-real-hash")
    .fetch_one(&pool)
    .await
    .unwrap();

    let manga_id: Uuid =
        sqlx::query_scalar("INSERT INTO mangas (title) VALUES ($1) RETURNING id")
            .bind("Test")
            .fetch_one(&pool)
            .await
            .unwrap();

    sqlx::query(
        "INSERT INTO sources (id, name, base_url) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING",
    )
    .bind("target")
    .bind("Target")
    .bind("https://example.com")
    .execute(&pool)
    .await
    .unwrap();

    let chapter_id: Uuid = sqlx::query_scalar(
        "INSERT INTO chapters (manga_id, number, page_count) VALUES ($1, 1, 0) RETURNING id",
    )
    .bind(manga_id)
    .fetch_one(&pool)
    .await
    .unwrap();

    sqlx::query(
        "INSERT INTO chapter_sources (source_id, source_chapter_key, chapter_id, source_url) \
         VALUES ($1, $2, $3, $4)",
    )
    .bind("target")
    .bind("ch1")
    .bind(chapter_id)
    .bind("https://example.com/ch1")
    .execute(&pool)
    .await
    .unwrap();

    sqlx::query("INSERT INTO bookmarks (user_id, manga_id) VALUES ($1, $2)")
        .bind(reader_id)
        .bind(manga_id)
        .execute(&pool)
        .await
        .unwrap();

    // --- the dead job left behind by the prior tick ---
    // Updated one day ago, i.e. well inside the 7-day quarantine window.
    let dead_payload = serde_json::json!({
        "kind": "sync_chapter_content",
        "source_id": "target",
        "chapter_id": chapter_id.to_string(),
        "source_chapter_key": "ch1",
    });
    sqlx::query(
        "INSERT INTO crawler_jobs (payload, state, updated_at) \
         VALUES ($1::jsonb, 'dead', now() - interval '1 day')",
    )
    .bind(dead_payload)
    .execute(&pool)
    .await
    .unwrap();

    // --- exercise + verify ---
    let summary = pipeline::enqueue_bookmarked_pending(&pool).await.unwrap();
    assert_eq!(summary.inserted, 0, "recently dead chapter is quarantined");
    assert_eq!(summary.skipped, 0);
}
#[sqlx::test(migrations = "./migrations")]
async fn enqueue_bookmarked_pending_resumes_after_quarantine_expires(pool: PgPool) {
    // Counterpart of the quarantine test: here the dead job is 10 days old,
    // which is past the 7-day quarantine, so the chapter must be enqueued
    // again and get another chance to succeed.

    // Source row that the chapter_sources row below points at.
    sqlx::query(
        "INSERT INTO sources (id, name, base_url) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING",
    )
    .bind("target")
    .bind("Target")
    .bind("https://example.com")
    .execute(&pool)
    .await
    .unwrap();

    // A user who bookmarks the manga.
    let reader = sqlx::query_scalar::<_, Uuid>(
        "INSERT INTO users (username, password_hash) VALUES ($1, $2) RETURNING id",
    )
    .bind("alice")
    .bind("not-a-real-hash")
    .fetch_one(&pool)
    .await
    .unwrap();

    // One manga with a single page-less chapter (page_count = 0).
    let manga =
        sqlx::query_scalar::<_, Uuid>("INSERT INTO mangas (title) VALUES ($1) RETURNING id")
            .bind("Test")
            .fetch_one(&pool)
            .await
            .unwrap();
    let chapter = sqlx::query_scalar::<_, Uuid>(
        "INSERT INTO chapters (manga_id, number, page_count) VALUES ($1, 1, 0) RETURNING id",
    )
    .bind(manga)
    .fetch_one(&pool)
    .await
    .unwrap();
    sqlx::query(
        "INSERT INTO chapter_sources (source_id, source_chapter_key, chapter_id, source_url) \
         VALUES ($1, $2, $3, $4)",
    )
    .bind("target")
    .bind("ch1")
    .bind(chapter)
    .bind("https://example.com/ch1")
    .execute(&pool)
    .await
    .unwrap();

    sqlx::query("INSERT INTO bookmarks (user_id, manga_id) VALUES ($1, $2)")
        .bind(reader)
        .bind(manga)
        .execute(&pool)
        .await
        .unwrap();

    // Dead job whose `updated_at` is 10 days in the past.
    let dead_payload = serde_json::json!({
        "kind": "sync_chapter_content",
        "source_id": "target",
        "chapter_id": chapter.to_string(),
        "source_chapter_key": "ch1",
    });
    sqlx::query(
        "INSERT INTO crawler_jobs (payload, state, updated_at) \
         VALUES ($1::jsonb, 'dead', now() - interval '10 days')",
    )
    .bind(dead_payload)
    .execute(&pool)
    .await
    .unwrap();

    let outcome = pipeline::enqueue_bookmarked_pending(&pool).await.unwrap();
    assert_eq!(
        outcome.inserted, 1,
        "dead chapter is re-enqueued after quarantine expires"
    );
}
/// Test helper for the ordering tests: creates one chapter row (with
/// `page_count = 0`) carrying the given `number`, attaches a
/// `chapter_sources` row for source `"target"` keyed by
/// `source_chapter_key`, and returns the new chapter's id.
///
/// The `"target"` source row must already exist; this helper does not
/// create it.
async fn insert_pending_chapter(
    pool: &PgPool,
    manga_id: Uuid,
    number: i32,
    source_chapter_key: &str,
) -> Uuid {
    let new_chapter = sqlx::query_scalar::<_, Uuid>(
        "INSERT INTO chapters (manga_id, number, page_count) VALUES ($1, $2, 0) RETURNING id",
    )
    .bind(manga_id)
    .bind(number)
    .fetch_one(pool)
    .await
    .unwrap();

    // The source URL is derived from the key so each chapter gets its own.
    let source_url = format!("https://example.com/{source_chapter_key}");
    sqlx::query(
        "INSERT INTO chapter_sources (source_id, source_chapter_key, chapter_id, source_url) \
         VALUES ($1, $2, $3, $4)",
    )
    .bind("target")
    .bind(source_chapter_key)
    .bind(new_chapter)
    .bind(source_url)
    .execute(pool)
    .await
    .unwrap();

    new_chapter
}
#[sqlx::test(migrations = "./migrations")]
async fn enqueue_bookmarked_pending_queues_chapters_in_ascending_number_order(pool: PgPool) {
    // Chapters are inserted with numbers 3, 1, 2 (in that order), so row
    // insertion order — the previous `created_at` tiebreaker — disagrees
    // with chapter-number order. After enqueue + lease the chapters must
    // come back as 1, 2, 3.
    sqlx::query(
        "INSERT INTO sources (id, name, base_url) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING",
    )
    .bind("target")
    .bind("Target")
    .bind("https://example.com")
    .execute(&pool)
    .await
    .unwrap();
    let reader = sqlx::query_scalar::<_, Uuid>(
        "INSERT INTO users (username, password_hash) VALUES ($1, $2) RETURNING id",
    )
    .bind("alice")
    .bind("not-a-real-hash")
    .fetch_one(&pool)
    .await
    .unwrap();
    let manga =
        sqlx::query_scalar::<_, Uuid>("INSERT INTO mangas (title) VALUES ($1) RETURNING id")
            .bind("Test")
            .fetch_one(&pool)
            .await
            .unwrap();

    // Deliberately out of numeric order.
    let c3 = insert_pending_chapter(&pool, manga, 3, "ch3").await;
    let c1 = insert_pending_chapter(&pool, manga, 1, "ch1").await;
    let c2 = insert_pending_chapter(&pool, manga, 2, "ch2").await;

    sqlx::query("INSERT INTO bookmarks (user_id, manga_id) VALUES ($1, $2)")
        .bind(reader)
        .bind(manga)
        .execute(&pool)
        .await
        .unwrap();

    let outcome = pipeline::enqueue_bookmarked_pending(&pool).await.unwrap();
    assert_eq!(outcome.inserted, 3);

    let leases = jobs::lease(&pool, None, 10, std::time::Duration::from_secs(60))
        .await
        .unwrap();

    // Pull the chapter id out of every leased payload, preserving lease order.
    let mut leased_chapter_ids: Vec<Uuid> = Vec::with_capacity(leases.len());
    for lease in &leases {
        match &lease.payload {
            JobPayload::SyncChapterContent { chapter_id, .. } => {
                leased_chapter_ids.push(*chapter_id)
            }
            other => panic!("unexpected payload kind: {other:?}"),
        }
    }
    assert_eq!(
        leased_chapter_ids,
        vec![c1, c2, c3],
        "chapters must be leased in ascending chapter-number order, not insertion order"
    );
}
#[sqlx::test(migrations = "./migrations")]
async fn enqueue_pending_for_manga_queues_chapters_in_ascending_number_order(pool: PgPool) {
    // Same ordering scenario as the bookmarked-pending test, but driven
    // through `enqueue_pending_for_manga` (the bookmark-create hook path),
    // which has its own ORDER BY.
    sqlx::query(
        "INSERT INTO sources (id, name, base_url) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING",
    )
    .bind("target")
    .bind("Target")
    .bind("https://example.com")
    .execute(&pool)
    .await
    .unwrap();
    let manga =
        sqlx::query_scalar::<_, Uuid>("INSERT INTO mangas (title) VALUES ($1) RETURNING id")
            .bind("Test")
            .fetch_one(&pool)
            .await
            .unwrap();

    // Inserted as 3, 1, 2 so insertion order differs from number order.
    let c3 = insert_pending_chapter(&pool, manga, 3, "ch3").await;
    let c1 = insert_pending_chapter(&pool, manga, 1, "ch1").await;
    let c2 = insert_pending_chapter(&pool, manga, 2, "ch2").await;

    let outcome = pipeline::enqueue_pending_for_manga(&pool, manga)
        .await
        .unwrap();
    assert_eq!(outcome.inserted, 3);

    let leases = jobs::lease(&pool, None, 10, std::time::Duration::from_secs(60))
        .await
        .unwrap();

    // Pull the chapter id out of every leased payload, preserving lease order.
    let mut leased_chapter_ids: Vec<Uuid> = Vec::with_capacity(leases.len());
    for lease in &leases {
        match &lease.payload {
            JobPayload::SyncChapterContent { chapter_id, .. } => {
                leased_chapter_ids.push(*chapter_id)
            }
            other => panic!("unexpected payload kind: {other:?}"),
        }
    }
    assert_eq!(leased_chapter_ids, vec![c1, c2, c3]);
}

View File

@@ -1,304 +0,0 @@
//! Integration tests for the dead-letter admin queries in
//! `repo::crawler`: listing dead jobs with manga/chapter context and the
//! scoped requeue (all / per-manga / single) used by the admin dashboard.
use mangalord::repo::crawler::{self, RequeueScope};
use serde_json::json;
use sqlx::PgPool;
use uuid::Uuid;
/// Test fixture: inserts a manga whose `cover_image_path` is NULL together
/// with a `manga_sources` row on source `'target'` (creating that source
/// if it is absent), so the manga counts as one still waiting for a cover
/// fetch. Returns the id of the new manga.
async fn seed_missing_cover(pool: &PgPool, title: &str) -> Uuid {
    let id = Uuid::new_v4();

    // The source is shared between calls, hence ON CONFLICT DO NOTHING.
    sqlx::query("INSERT INTO sources (id, name, base_url) VALUES ('target', 'T', 'http://x') ON CONFLICT DO NOTHING")
        .execute(pool)
        .await
        .unwrap();

    sqlx::query("INSERT INTO mangas (id, title, cover_image_path) VALUES ($1, $2, NULL)")
        .bind(id)
        .bind(title)
        .execute(pool)
        .await
        .unwrap();

    // The source key embeds the manga id so repeated calls never collide.
    let source_key = format!("k-{id}");
    sqlx::query(
        "INSERT INTO manga_sources (source_id, source_manga_key, manga_id, source_url) \
         VALUES ('target', $1, $2, 'http://x/m')",
    )
    .bind(source_key)
    .bind(id)
    .execute(pool)
    .await
    .unwrap();

    id
}
/// Test fixture: inserts a fresh manga titled `title` plus one chapter
/// numbered `number` under it. Returns `(manga_id, chapter_id)`.
async fn seed_chapter(pool: &PgPool, title: &str, number: i32) -> (Uuid, Uuid) {
    // Ids are generated client-side so they can be returned without RETURNING.
    let ids = (Uuid::new_v4(), Uuid::new_v4());

    sqlx::query("INSERT INTO mangas (id, title) VALUES ($1, $2)")
        .bind(ids.0)
        .bind(title)
        .execute(pool)
        .await
        .unwrap();

    sqlx::query("INSERT INTO chapters (id, manga_id, number) VALUES ($1, $2, $3)")
        .bind(ids.1)
        .bind(ids.0)
        .bind(number)
        .execute(pool)
        .await
        .unwrap();

    ids
}
/// Test fixture: writes one `crawler_jobs` row carrying a
/// `sync_chapter_content` payload for `chapter_id`, with the given `state`
/// and `attempts` and a fixed `last_error` of `'boom'`. Returns the job id.
async fn insert_job(pool: &PgPool, chapter_id: Uuid, state: &str, attempts: i32) -> Uuid {
    let job_id = Uuid::new_v4();
    sqlx::query(
        "INSERT INTO crawler_jobs (id, payload, state, attempts, last_error) \
         VALUES ($1, $2, $3, $4, 'boom')",
    )
    .bind(job_id)
    .bind(json!({
        "kind": "sync_chapter_content",
        "source_id": "target",
        "chapter_id": chapter_id,
        "source_chapter_key": "k",
    }))
    .bind(state)
    .bind(attempts)
    .execute(pool)
    .await
    .unwrap();
    job_id
}
/// Reads back the `state` column of the `crawler_jobs` row with this `id`.
/// Panics if the row is missing or the query fails.
async fn state_of(pool: &PgPool, id: Uuid) -> String {
    let current: String = sqlx::query_scalar("SELECT state FROM crawler_jobs WHERE id = $1")
        .bind(id)
        .fetch_one(pool)
        .await
        .unwrap();
    current
}
#[sqlx::test(migrations = "./migrations")]
async fn list_dead_jobs_returns_context_and_total(pool: PgPool) {
    // One dead job (expected in the listing) ...
    let (_naruto, dead_chapter) = seed_chapter(&pool, "Naruto", 700).await;
    insert_job(&pool, dead_chapter, "dead", 5).await;
    // ... and one pending job, which must be filtered out.
    let (_bleach, pending_chapter) = seed_chapter(&pool, "Bleach", 1).await;
    insert_job(&pool, pending_chapter, "pending", 0).await;

    let (rows, count) = crawler::list_dead_jobs(&pool, None, 50, 0).await.unwrap();
    assert_eq!(count, 1);
    assert_eq!(rows.len(), 1);

    // The single row carries the manga/chapter context and failure details.
    let only = &rows[0];
    assert_eq!(only.manga_title.as_deref(), Some("Naruto"));
    assert_eq!(only.chapter_number, Some(700));
    assert_eq!(only.attempts, 5);
    assert_eq!(only.last_error.as_deref(), Some("boom"));
}
#[sqlx::test(migrations = "./migrations")]
async fn list_dead_jobs_filters_by_title_search(pool: PgPool) {
    // Two dead jobs under different titles; the search term "piece" should
    // match only "One Piece".
    let (_naruto, naruto_ch) = seed_chapter(&pool, "Naruto", 700).await;
    insert_job(&pool, naruto_ch, "dead", 5).await;
    let (_one_piece, one_piece_ch) = seed_chapter(&pool, "One Piece", 1).await;
    insert_job(&pool, one_piece_ch, "dead", 5).await;

    let listing = crawler::list_dead_jobs(&pool, Some("piece"), 50, 0)
        .await
        .unwrap();
    let (rows, count) = listing;
    assert_eq!(count, 1);
    assert_eq!(rows[0].manga_title.as_deref(), Some("One Piece"));
}
#[sqlx::test(migrations = "./migrations")]
async fn requeue_all_resets_dead_jobs_to_pending(pool: PgPool) {
    // Two dead jobs on two different mangas.
    let (_manga_a, chapter_a) = seed_chapter(&pool, "A", 1).await;
    let (_manga_b, chapter_b) = seed_chapter(&pool, "B", 1).await;
    let job_a = insert_job(&pool, chapter_a, "dead", 5).await;
    let job_b = insert_job(&pool, chapter_b, "dead", 5).await;

    let revived = crawler::requeue_dead_jobs(&pool, RequeueScope::All)
        .await
        .unwrap();
    assert_eq!(revived, 2);
    assert_eq!(state_of(&pool, job_a).await, "pending");
    assert_eq!(state_of(&pool, job_b).await, "pending");

    // Requeue must also zero the attempt counter.
    let attempts_after =
        sqlx::query_scalar::<_, i32>("SELECT attempts FROM crawler_jobs WHERE id = $1")
            .bind(job_a)
            .fetch_one(&pool)
            .await
            .unwrap();
    assert_eq!(attempts_after, 0, "attempts reset on requeue");
}
#[sqlx::test(migrations = "./migrations")]
async fn requeue_by_manga_scopes_to_that_manga(pool: PgPool) {
    // Dead jobs on two mangas; only manga "A" is requeued.
    let (target_manga, target_chapter) = seed_chapter(&pool, "A", 1).await;
    let (_other_manga, other_chapter) = seed_chapter(&pool, "B", 1).await;
    let target_job = insert_job(&pool, target_chapter, "dead", 5).await;
    let other_job = insert_job(&pool, other_chapter, "dead", 5).await;

    let revived = crawler::requeue_dead_jobs(&pool, RequeueScope::Manga(target_manga))
        .await
        .unwrap();

    assert_eq!(revived, 1);
    assert_eq!(state_of(&pool, target_job).await, "pending");
    assert_eq!(state_of(&pool, other_job).await, "dead", "other manga untouched");
}
#[sqlx::test(migrations = "./migrations")]
async fn requeue_by_chapter_scopes_to_that_chapter(pool: PgPool) {
    // Two dead jobs on two distinct chapters; only the first chapter is
    // named in the requeue scope.
    let (_manga_1, target_chapter) = seed_chapter(&pool, "A", 1).await;
    let (_manga_2, other_chapter) = seed_chapter(&pool, "A", 2).await;
    let target_job = insert_job(&pool, target_chapter, "dead", 5).await;
    let other_job = insert_job(&pool, other_chapter, "dead", 5).await;

    let revived = crawler::requeue_dead_jobs(&pool, RequeueScope::Chapter(target_chapter))
        .await
        .unwrap();

    assert_eq!(revived, 1);
    assert_eq!(state_of(&pool, target_job).await, "pending");
    assert_eq!(state_of(&pool, other_job).await, "dead", "other chapter untouched");
}
#[sqlx::test(migrations = "./migrations")]
async fn requeue_single_job(pool: PgPool) {
    // Two dead jobs; the scope names exactly one of them by job id.
    let (_manga_a, chapter_a) = seed_chapter(&pool, "A", 1).await;
    let (_manga_b, chapter_b) = seed_chapter(&pool, "B", 1).await;
    let chosen = insert_job(&pool, chapter_a, "dead", 5).await;
    let bystander = insert_job(&pool, chapter_b, "dead", 5).await;

    let revived = crawler::requeue_dead_jobs(&pool, RequeueScope::Job(chosen))
        .await
        .unwrap();

    assert_eq!(revived, 1);
    assert_eq!(state_of(&pool, chosen).await, "pending");
    assert_eq!(state_of(&pool, bystander).await, "dead");
}
#[sqlx::test(migrations = "./migrations")]
async fn requeue_skips_dead_when_live_job_exists_for_same_chapter(pool: PgPool) {
    let (_manga, chapter) = seed_chapter(&pool, "A", 1).await;
    let dead_job = insert_job(&pool, chapter, "dead", 5).await;
    // The SAME chapter already has a live (pending) job queued.
    insert_job(&pool, chapter, "pending", 0).await;

    let revived = crawler::requeue_dead_jobs(&pool, RequeueScope::All)
        .await
        .unwrap();

    assert_eq!(revived, 0, "must not resurrect a dead job that has a live counterpart");
    assert_eq!(state_of(&pool, dead_job).await, "dead");
}
#[sqlx::test(migrations = "./migrations")]
async fn requeue_with_two_dead_jobs_for_one_chapter_revives_one_not_500(pool: PgPool) {
    // Regression guard: when one chapter has two dead jobs, a requeue must
    // not flip both to pending in a single statement — that would collide
    // with the partial unique dedup index and abort the whole requeue.
    let (manga_id, chapter) = seed_chapter(&pool, "A", 1).await;
    let older = insert_job(&pool, chapter, "dead", 5).await;
    let newer = insert_job(&pool, chapter, "dead", 5).await;

    // Back-date `older` by an hour so `newer` is unambiguously the newer row.
    sqlx::query("UPDATE crawler_jobs SET updated_at = now() - interval '1 hour' WHERE id = $1")
        .bind(older)
        .execute(&pool)
        .await
        .unwrap();

    // Every scope variant that can cover both rows is exercised in turn.
    let scopes = [
        RequeueScope::All,
        RequeueScope::Manga(manga_id),
        RequeueScope::Chapter(chapter),
    ];
    for scope in scopes {
        // Put both rows back into 'dead' before trying this scope.
        sqlx::query("UPDATE crawler_jobs SET state = 'dead' WHERE id = ANY($1)")
            .bind(vec![older, newer])
            .execute(&pool)
            .await
            .unwrap();

        let revived = crawler::requeue_dead_jobs(&pool, scope)
            .await
            .expect("requeue must not error on duplicate dead jobs");
        assert_eq!(revived, 1, "exactly one dead job per chapter is revived");

        // The newest row is the one revived; the older one stays dead.
        assert_eq!(state_of(&pool, newer).await, "pending");
        assert_eq!(state_of(&pool, older).await, "dead");
    }
}
#[sqlx::test(migrations = "./migrations")]
async fn list_active_jobs_returns_pending_and_running_running_first(pool: PgPool) {
    // Two active jobs: one pending, one running.
    let (_m, c1) = seed_chapter(&pool, "Naruto", 700).await;
    let (_m2, c2) = seed_chapter(&pool, "Bleach", 10).await;
    insert_job(&pool, c1, "pending", 0).await;
    insert_job(&pool, c2, "running", 1).await;

    // A dead + a done job must NOT appear. Both terminal states are seeded
    // so the listing is checked against each of them, not just 'dead'.
    let (_m3, c3) = seed_chapter(&pool, "Gone", 1).await;
    insert_job(&pool, c3, "dead", 5).await;
    let (_m4, c4) = seed_chapter(&pool, "Finished", 1).await;
    insert_job(&pool, c4, "done", 1).await;

    let (items, total) = crawler::list_active_jobs(&pool, None, 50, 0).await.unwrap();
    assert_eq!(total, 2);
    assert_eq!(items.len(), 2);

    // Running first, with its manga/chapter context ...
    assert_eq!(items[0].state, "running");
    assert_eq!(items[0].manga_title.as_deref(), Some("Bleach"));
    assert_eq!(items[0].chapter_number, Some(10));
    // ... then the pending job.
    assert_eq!(items[1].state, "pending");
    assert_eq!(items[1].manga_title.as_deref(), Some("Naruto"));
    assert_eq!(items[1].chapter_number, Some(700));
}
#[sqlx::test(migrations = "./migrations")]
async fn list_active_jobs_filters_by_title(pool: PgPool) {
    // Two pending jobs under different titles; searching "piece" should
    // keep only the "One Piece" row.
    let (_naruto, naruto_ch) = seed_chapter(&pool, "Naruto", 1).await;
    let (_one_piece, one_piece_ch) = seed_chapter(&pool, "One Piece", 1).await;
    insert_job(&pool, naruto_ch, "pending", 0).await;
    insert_job(&pool, one_piece_ch, "pending", 0).await;

    let listing = crawler::list_active_jobs(&pool, Some("piece"), 50, 0)
        .await
        .unwrap();
    let (rows, count) = listing;
    assert_eq!(count, 1);
    assert_eq!(rows[0].manga_title.as_deref(), Some("One Piece"));
}
#[sqlx::test(migrations = "./migrations")]
async fn missing_covers_count_and_list(pool: PgPool) {
    // Two mangas without a cover ...
    seed_missing_cover(&pool, "Naruto").await;
    seed_missing_cover(&pool, "Bleach").await;
    // ... and one that already has a cover, which must not be counted.
    let covered_id = Uuid::new_v4();
    sqlx::query("INSERT INTO mangas (id, title, cover_image_path) VALUES ($1, 'Done', 'k.jpg')")
        .bind(covered_id)
        .execute(&pool)
        .await
        .unwrap();

    let missing = crawler::count_missing_covers(&pool).await.unwrap();
    assert_eq!(missing, 2);

    // Unfiltered listing sees both cover-less mangas.
    let (all_rows, all_total) = crawler::list_missing_cover_mangas(&pool, None, 50, 0)
        .await
        .unwrap();
    assert_eq!(all_total, 2);
    assert_eq!(all_rows.len(), 2);

    // Title search narrows the listing to one.
    let (hits, hit_total) = crawler::list_missing_cover_mangas(&pool, Some("naru"), 50, 0)
        .await
        .unwrap();
    assert_eq!(hit_total, 1);
    assert_eq!(hits[0].manga_title, "Naruto");
}
#[sqlx::test(migrations = "./migrations")]
async fn job_state_counts_groups_by_state(pool: PgPool) {
    // Seed one pending and two dead jobs, each on its own chapter.
    let (_manga_a, chapter_a) = seed_chapter(&pool, "A", 1).await;
    let (_manga_b, chapter_b) = seed_chapter(&pool, "B", 1).await;
    let (_manga_c, chapter_c) = seed_chapter(&pool, "C", 1).await;
    insert_job(&pool, chapter_a, "pending", 0).await;
    insert_job(&pool, chapter_b, "dead", 5).await;
    insert_job(&pool, chapter_c, "dead", 5).await;

    // The tuple is read as (pending, running, dead).
    let (n_pending, n_running, n_dead) = crawler::job_state_counts(&pool).await.unwrap();
    assert_eq!(n_pending, 1);
    assert_eq!(n_running, 0);
    assert_eq!(n_dead, 2);
}

View File

@@ -0,0 +1,85 @@
//! Integration tests for the incremental-mode coordination state:
//! `mark_seed_completed` / `seed_completed_at` round-trip via the
//! `crawler_state` table.
//!
//! End-to-end pipeline behavior (walker + stop-on-Unchanged) requires
//! a real `chromiumoxide::Browser` to construct a `FetchContext`, so
//! the live integration of that path is covered by
//! `crawler_browser_smoke.rs` instead. The pure stop logic itself is
//! unit-tested in `crawler::pipeline::tests`.
use chrono::Utc;
use mangalord::repo::crawler;
use sqlx::PgPool;
#[sqlx::test(migrations = "./migrations")]
async fn seed_completed_at_none_before_any_run(pool: PgPool) {
    // A source that was only registered, never marked, has no seed marker.
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();

    let marker = crawler::seed_completed_at(&pool, "target").await.unwrap();
    assert!(marker.is_none(), "fresh source has no seed marker");
}
#[sqlx::test(migrations = "./migrations")]
async fn mark_seed_completed_then_read_round_trips_timestamp(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();

    // Write a marker, then read it straight back.
    let written = Utc::now();
    crawler::mark_seed_completed(&pool, "target", written)
        .await
        .unwrap();
    let stored = crawler::seed_completed_at(&pool, "target")
        .await
        .unwrap();
    let read = stored.expect("marker present after mark");

    // The stored value may lose sub-millisecond precision on the way through
    // the database, so the comparison tolerates up to 1ms of difference.
    let at = written;
    let drift = (read - at).num_milliseconds().abs();
    assert!(drift <= 1, "round-trip drift: {drift}ms (at={at}, read={read})");
}
#[sqlx::test(migrations = "./migrations")]
async fn mark_seed_completed_overwrites_previous_value(pool: PgPool) {
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();

    // Two marks an hour apart, written oldest first.
    let earlier = Utc::now() - chrono::Duration::hours(1);
    let later = Utc::now();
    for stamp in [earlier, later] {
        crawler::mark_seed_completed(&pool, "target", stamp)
            .await
            .unwrap();
    }

    let stored = crawler::seed_completed_at(&pool, "target")
        .await
        .unwrap()
        .expect("marker present");

    // Only the second write may be visible (within the 1ms tolerance).
    let drift = (stored - later).num_milliseconds().abs();
    assert!(drift <= 1, "should reflect the latest mark, not the first");
}
#[sqlx::test(migrations = "./migrations")]
async fn seed_completed_is_per_source(pool: PgPool) {
    // Register two sources but mark only "target" as seeded. "other" must
    // keep reporting None, i.e. the marker is stored per source_id.
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    crawler::ensure_source(&pool, "other", "O", "https://y.example")
        .await
        .unwrap();

    crawler::mark_seed_completed(&pool, "target", Utc::now())
        .await
        .unwrap();

    let marked = crawler::seed_completed_at(&pool, "target").await.unwrap();
    assert!(marked.is_some());

    let unmarked = crawler::seed_completed_at(&pool, "other").await.unwrap();
    assert!(unmarked.is_none());
}

View File

@@ -9,6 +9,7 @@ use std::time::Duration;
use mangalord::crawler::jobs::{
self, EnqueueResult, JobPayload, KIND_SYNC_CHAPTER_CONTENT,
};
use mangalord::crawler::source::DiscoverMode;
use sqlx::PgPool;
use uuid::Uuid;
@@ -20,13 +21,10 @@ fn chapter_content_payload(chapter_id: Uuid) -> JobPayload {
}
}
/// A non-`SyncChapterContent` payload, used to assert that only the
/// chapter-content kind is deduplicated by the partial index and that
/// `lease`'s kind filter correctly excludes other kinds.
fn sync_manga_payload(key: &str) -> JobPayload {
JobPayload::SyncManga {
fn discover_payload() -> JobPayload {
JobPayload::Discover {
source_id: "target".into(),
source_manga_key: key.into(),
mode: DiscoverMode::Backfill,
}
}
@@ -143,7 +141,7 @@ async fn different_chapter_ids_can_coexist(pool: PgPool) {
#[sqlx::test(migrations = "./migrations")]
async fn non_chapter_content_payloads_are_never_deduped(pool: PgPool) {
let p = sync_manga_payload("foo");
let p = discover_payload();
assert!(matches!(
jobs::enqueue(&pool, &p).await.unwrap(),
EnqueueResult::Inserted(_)
@@ -185,74 +183,9 @@ async fn lease_marks_running_and_bumps_attempts_and_sets_leased_until(pool: PgPo
assert!(leased_until > chrono::Utc::now());
}
#[sqlx::test(migrations = "./migrations")]
async fn renew_extends_leased_until_while_running(pool: PgPool) {
    let enqueued = jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
        .await
        .unwrap();
    let EnqueueResult::Inserted(id) = enqueued else {
        unreachable!()
    };

    // Take a short lease, then force `leased_until` into the recent past so
    // that a successful renew is unmistakably an extension.
    let leases = jobs::lease(&pool, None, 1, Duration::from_secs(5))
        .await
        .unwrap();
    assert_eq!(leases.len(), 1);
    sqlx::query("UPDATE crawler_jobs SET leased_until = now() - interval '1 second' WHERE id = $1")
        .bind(id)
        .execute(&pool)
        .await
        .unwrap();

    let still_owned = jobs::renew(&pool, id, Duration::from_secs(120))
        .await
        .unwrap();
    assert!(still_owned, "renew on a running job returns true");

    // The new deadline must sit comfortably beyond "now + 60s".
    let new_deadline = sqlx::query_scalar::<_, chrono::DateTime<chrono::Utc>>(
        "SELECT leased_until FROM crawler_jobs WHERE id = $1",
    )
    .bind(id)
    .fetch_one(&pool)
    .await
    .unwrap();
    assert!(
        new_deadline > chrono::Utc::now() + chrono::Duration::seconds(60),
        "leased_until pushed ~120s into the future"
    );
    assert_eq!(job_state(&pool, id).await, "running");
}
#[sqlx::test(migrations = "./migrations")]
async fn renew_is_noop_once_job_no_longer_running(pool: PgPool) {
    let enqueued = jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
        .await
        .unwrap();
    let EnqueueResult::Inserted(id) = enqueued else {
        unreachable!()
    };

    let leases = jobs::lease(&pool, None, 1, Duration::from_secs(60))
        .await
        .unwrap();

    // Finish the job first; a heartbeat arriving afterwards should learn
    // that the job is no longer running.
    jobs::ack_done(&pool, leases[0].id).await.unwrap();

    let still_owned = jobs::renew(&pool, id, Duration::from_secs(120))
        .await
        .unwrap();
    assert!(!still_owned, "renew on a non-running job returns false");
    assert_eq!(job_state(&pool, id).await, "done");
}
#[sqlx::test(migrations = "./migrations")]
async fn lease_with_kind_filter_only_matches_that_kind(pool: PgPool) {
let manga_id = match jobs::enqueue(&pool, &sync_manga_payload("foo"))
.await
.unwrap()
{
let discover_id = match jobs::enqueue(&pool, &discover_payload()).await.unwrap() {
EnqueueResult::Inserted(id) => id,
_ => unreachable!(),
};
@@ -274,8 +207,8 @@ async fn lease_with_kind_filter_only_matches_that_kind(pool: PgPool) {
.unwrap();
assert_eq!(leases.len(), 1, "only chapter content payload leases");
assert_eq!(leases[0].id, chapter_id);
// sync_manga is still pending
assert_eq!(job_state(&pool, manga_id).await, "pending");
// discover is still pending
assert_eq!(job_state(&pool, discover_id).await, "pending");
}
#[sqlx::test(migrations = "./migrations")]
@@ -422,112 +355,6 @@ async fn ack_failed_at_max_marks_dead(pool: PgPool) {
assert_eq!(last_error.as_deref(), Some("final boom"));
}
#[sqlx::test(migrations = "./migrations")]
async fn ack_done_no_ops_when_lease_was_stolen(pool: PgPool) {
    // Scenario: worker A leases a job, A's lease expires, and worker B
    // re-leases the same row (attempts is bumped again). A then sends a late
    // ack_done, followed by B's own ack_done.
    //
    // NOTE(review): nothing is asserted between the two acks, so this test
    // only pins the END state (done, attempts == 2). Both workers ack with
    // the same job id, so the test cannot tell whose ack finished the job.
    let enqueued = jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
        .await
        .unwrap();
    let id = match enqueued {
        EnqueueResult::Inserted(id) => id,
        _ => unreachable!(),
    };

    // Worker A takes the lease; it is then expired by hand.
    let _a_leases = jobs::lease(&pool, None, 1, Duration::from_secs(60))
        .await
        .unwrap();
    sqlx::query("UPDATE crawler_jobs SET leased_until = now() - interval '1 minute' WHERE id = $1")
        .bind(id)
        .execute(&pool)
        .await
        .unwrap();

    // Worker B picks up the expired-but-still-running row.
    let b_leases = jobs::lease(&pool, None, 1, Duration::from_secs(60))
        .await
        .unwrap();
    assert_eq!(b_leases.len(), 1);
    assert_eq!(b_leases[0].attempts, 2, "re-lease bumps attempts");

    // Worker A's late ack_done, addressed by the shared job id.
    jobs::ack_done(&pool, id).await.unwrap();

    // Worker B finalizes.
    jobs::ack_done(&pool, b_leases[0].id).await.unwrap();

    assert_eq!(job_state(&pool, id).await, "done");
    assert_eq!(job_attempts(&pool, id).await, 2);
}
#[sqlx::test(migrations = "./migrations")]
async fn ack_failed_no_ops_when_state_is_not_running(pool: PgPool) {
    // Once a job is 'done', a stale ack_failed (for example from a worker
    // whose ack was queued but only ran later) must not move the row back
    // to 'pending' or on to 'dead'. The test name and original comments
    // attribute this to a `state = 'running'` predicate in ack_failed.
    let enqueued = jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
        .await
        .unwrap();
    let id = match enqueued {
        EnqueueResult::Inserted(id) => id,
        _ => unreachable!(),
    };

    let leases = jobs::lease(&pool, None, 1, Duration::from_secs(60))
        .await
        .unwrap();
    let leased_id = leases[0].id;
    jobs::ack_done(&pool, leased_id).await.unwrap();
    assert_eq!(job_state(&pool, id).await, "done");

    // The late failure ack must leave the row untouched.
    jobs::ack_failed(&pool, leased_id, "late", 1, 5)
        .await
        .unwrap();
    assert_eq!(
        job_state(&pool, id).await,
        "done",
        "late ack_failed must not resurrect a done job"
    );
}
#[sqlx::test(migrations = "./migrations")]
async fn release_no_ops_when_state_is_not_running(pool: PgPool) {
    // Companion to the ack_failed variant. `release` also decrements
    // `attempts`, so an unguarded late release could corrupt the attempt
    // count in addition to the state.
    let enqueued = jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
        .await
        .unwrap();
    let id = match enqueued {
        EnqueueResult::Inserted(id) => id,
        _ => unreachable!(),
    };

    let leases = jobs::lease(&pool, None, 1, Duration::from_secs(60))
        .await
        .unwrap();
    let leased_id = leases[0].id;
    jobs::ack_done(&pool, leased_id).await.unwrap();

    // Snapshot the attempt count before the stale release shows up.
    let attempts_before = job_attempts(&pool, id).await;
    jobs::release(&pool, leased_id).await.unwrap();

    assert_eq!(
        job_state(&pool, id).await,
        "done",
        "late release must not flip a done job back to pending"
    );
    assert_eq!(
        job_attempts(&pool, id).await,
        attempts_before,
        "late release must not decrement attempts of a non-running job"
    );
}
#[sqlx::test(migrations = "./migrations")]
async fn release_returns_to_pending_and_undoes_attempt_increment(pool: PgPool) {
let id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
@@ -593,89 +420,6 @@ async fn reap_done_deletes_old_rows_keeps_fresh(pool: PgPool) {
assert_eq!(remaining, vec![fresh_id], "only fresh row remains");
}
#[sqlx::test(migrations = "./migrations")]
async fn lease_ties_on_scheduled_at_break_by_created_at(pool: PgPool) {
// Locks in the tiebreaker that lets enqueue order survive the lease
// step: when many jobs share `scheduled_at` (the common cron-batch
// case), the worker must pick the earliest-inserted row, not whatever
// Postgres returns in heap order. The enqueue path inserts chapters
// in chapter-number order, so this tiebreaker is what makes "queue
// in rising order" observable at the dequeue side too.
let a = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
.await
.unwrap()
{
EnqueueResult::Inserted(id) => id,
_ => unreachable!(),
};
let b = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
.await
.unwrap()
{
EnqueueResult::Inserted(id) => id,
_ => unreachable!(),
};
let c = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
.await
.unwrap()
{
EnqueueResult::Inserted(id) => id,
_ => unreachable!(),
};
// Pin `scheduled_at` to a single literal instant (shared across all
// three rows — `now()` would yield a different microsecond per UPDATE
// and make scheduled_at the actual sort key). Reverse `created_at`
// against insertion order so heap order would give the wrong answer.
let shared_scheduled = chrono::Utc::now() - chrono::Duration::hours(1);
sqlx::query(
"UPDATE crawler_jobs \
SET scheduled_at = $2, \
created_at = $3 \
WHERE id = $1",
)
.bind(a)
.bind(shared_scheduled)
.bind(chrono::Utc::now() - chrono::Duration::seconds(10))
.execute(&pool)
.await
.unwrap();
sqlx::query(
"UPDATE crawler_jobs \
SET scheduled_at = $2, \
created_at = $3 \
WHERE id = $1",
)
.bind(b)
.bind(shared_scheduled)
.bind(chrono::Utc::now() - chrono::Duration::seconds(20))
.execute(&pool)
.await
.unwrap();
sqlx::query(
"UPDATE crawler_jobs \
SET scheduled_at = $2, \
created_at = $3 \
WHERE id = $1",
)
.bind(c)
.bind(shared_scheduled)
.bind(chrono::Utc::now() - chrono::Duration::seconds(30))
.execute(&pool)
.await
.unwrap();
let leases = jobs::lease(&pool, None, 10, Duration::from_secs(60))
.await
.unwrap();
let order: Vec<Uuid> = leases.iter().map(|l| l.id).collect();
assert_eq!(
order,
vec![c, b, a],
"lease must return jobs in created_at order when scheduled_at ties"
);
}
#[sqlx::test(migrations = "./migrations")]
async fn reap_done_zero_is_a_no_op(pool: PgPool) {
let id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))

View File

@@ -1,82 +0,0 @@
//! Integration tests for the per-source recovery flag:
//! `mark_run_started` / `mark_run_completed` / `last_run_completed_cleanly`
//! round-trip via the `crawler_state` table.
//!
//! End-to-end pipeline behavior (a crashed run forcing a recovery sweep
//! on the next tick) requires a real `chromiumoxide::Browser` to drive
//! the walker, so that path is covered by `crawler_browser_smoke.rs`.
//! The pure stop-condition logic itself is unit-tested in
//! `crawler::pipeline::tests`.
use mangalord::repo::crawler;
use sqlx::PgPool;
#[sqlx::test(migrations = "./migrations")]
async fn defaults_to_clean_when_no_marker(pool: PgPool) {
    // First-run semantics: with no key stored, the flag must read as
    // "previous run completed cleanly", so that a virgin database does not
    // trigger a recovery walk and the first tick can take the early-stop path.
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();

    let was_clean = crawler::last_run_completed_cleanly(&pool, "target")
        .await
        .unwrap();
    assert!(was_clean, "absent marker must read as clean");
}
#[sqlx::test(migrations = "./migrations")]
async fn mark_run_started_flips_to_false(pool: PgPool) {
    // Starting a run without completing it must leave the flag "not clean".
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    crawler::mark_run_started(&pool, "target").await.unwrap();

    let was_clean = crawler::last_run_completed_cleanly(&pool, "target")
        .await
        .unwrap();
    assert!(!was_clean, "after mark_run_started, flag must read false");
}
#[sqlx::test(migrations = "./migrations")]
async fn started_then_completed_round_trips_to_clean(pool: PgPool) {
    // Steady state: a run begins (flag goes false) and then finishes
    // cleanly (flag goes true), so the next tick sees "clean" and can apply
    // the normal stop condition.
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();

    crawler::mark_run_started(&pool, "target").await.unwrap();
    crawler::mark_run_completed(&pool, "target").await.unwrap();

    let was_clean = crawler::last_run_completed_cleanly(&pool, "target")
        .await
        .unwrap();
    assert!(
        was_clean,
        "after start → complete the flag must round-trip to clean"
    );
}
#[sqlx::test(migrations = "./migrations")]
async fn flag_is_per_source(pool: PgPool) {
    // Two sources, only "target" is mid-run. "other" must still read as
    // clean, i.e. the flag is stored per source_id.
    crawler::ensure_source(&pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    crawler::ensure_source(&pool, "other", "O", "https://y.example")
        .await
        .unwrap();

    crawler::mark_run_started(&pool, "target").await.unwrap();

    let target_clean = crawler::last_run_completed_cleanly(&pool, "target")
        .await
        .unwrap();
    assert!(!target_clean, "target is mid-run");

    let other_clean = crawler::last_run_completed_cleanly(&pool, "other")
        .await
        .unwrap();
    assert!(other_clean, "other source is untouched and reads clean");
}

View File

@@ -6,7 +6,6 @@
use mangalord::crawler::source::{SourceChapterRef, SourceManga};
use mangalord::repo::crawler::{self, ChapterDiff, UpsertStatus};
use mangalord::repo::chapter as chapter_repo;
use sqlx::PgPool;
use uuid::Uuid;
@@ -233,67 +232,6 @@ async fn sync_chapters_adds_new_refreshes_existing_and_drops_vanished(pool: PgPo
assert!(dropped.0.is_some(), "ch2 should be soft-dropped");
}
/// Counting live chapters for a source manga key that was never upserted
/// must succeed and yield zero.
#[sqlx::test(migrations = "./migrations")]
async fn live_chapter_count_returns_zero_for_unknown_source_key(pool: PgPool) {
    const SOURCE: &str = "target";

    crawler::ensure_source(&pool, SOURCE, "T", "https://x.example")
        .await
        .unwrap();

    // "nobody" has no manga_sources row, so this exercises the unknown-key
    // path. It has to report zero (not an error) so the partial-render
    // guard accepts a brand-new manga with no chapters as legitimate.
    let live = crawler::live_chapter_count_for_source_manga(&pool, SOURCE, "nobody")
        .await
        .unwrap();

    assert_eq!(live, 0);
}
/// Soft-dropped `chapter_sources` rows must be excluded from the live
/// chapter count while the rows themselves remain in place.
#[sqlx::test(migrations = "./migrations")]
async fn live_chapter_count_only_counts_live_sources(pool: PgPool) {
    const SOURCE: &str = "target";

    crawler::ensure_source(&pool, SOURCE, "T", "https://x.example")
        .await
        .unwrap();

    let manga = sample_manga("foo", "Foo Manga", "hash-1");
    let upserted =
        crawler::upsert_manga_from_source(&pool, SOURCE, "https://x.example/foo", &manga)
            .await
            .unwrap();

    // Two chapters, both live after the first sync.
    let listing = vec![
        SourceChapterRef {
            source_chapter_key: "1".into(),
            number: 1,
            title: Some("Ch.1".into()),
            url: "https://x.example/foo/1".into(),
        },
        SourceChapterRef {
            source_chapter_key: "2".into(),
            number: 2,
            title: Some("Ch.2".into()),
            url: "https://x.example/foo/2".into(),
        },
    ];
    crawler::sync_manga_chapters(&pool, SOURCE, upserted.manga_id, &listing)
        .await
        .unwrap();

    let before_drop = crawler::live_chapter_count_for_source_manga(&pool, SOURCE, "foo")
        .await
        .unwrap();
    assert_eq!(before_drop, 2);

    // Soft-drop one source row: the count goes down by one, but the row
    // is only stamped with `dropped_at`, not deleted.
    sqlx::query(
        "UPDATE chapter_sources SET dropped_at = NOW() WHERE source_chapter_key = '2'",
    )
    .execute(&pool)
    .await
    .unwrap();

    let after_drop = crawler::live_chapter_count_for_source_manga(&pool, SOURCE, "foo")
        .await
        .unwrap();
    assert_eq!(after_drop, 1);
}
/// Real-world sources publish multiple chapters at the same number
/// (different uploaders, translator notes, re-releases). After the
/// (manga_id, number) UNIQUE drop in 0013, each `SourceChapterRef`
@@ -371,223 +309,59 @@ async fn sync_chapters_keeps_duplicate_numbered_chapters_as_separate_rows(pool:
}
#[sqlx::test(migrations = "./migrations")]
async fn sync_chapters_isolates_colliding_keys_across_mangas(pool: PgPool) {
// Two mangas, both with a chapter whose source_chapter_key is
// "chapter-1". Pre-migration-0017 the PK enforced (source_id,
// source_chapter_key) globally and the lookup didn't filter by
// manga_id, so the second manga's sync would adopt the first manga's
// chapter_id (silent attribution corruption). After 0017 each manga
// owns its own row.
async fn mark_dropped_mangas_only_drops_unseen(pool: PgPool) {
crawler::ensure_source(&pool, "target", "T", "https://x.example")
.await
.unwrap();
let m1 = sample_manga("foo", "Manga Foo", "hash-foo");
let m2 = sample_manga("bar", "Manga Bar", "hash-bar");
let up1 = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m1)
.await
.unwrap();
let up2 = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/bar", &m2)
.await
.unwrap();
assert_ne!(up1.manga_id, up2.manga_id);
let shared = vec![SourceChapterRef {
source_chapter_key: "chapter-1".into(),
number: 1,
title: Some("Ch.1".into()),
url: "https://x.example/foo/chapter-1/".into(),
}];
let diff1 = crawler::sync_manga_chapters(&pool, "target", up1.manga_id, &shared)
.await
.unwrap();
assert_eq!(diff1.new, 1, "manga foo: chapter inserted fresh");
// Manga bar now syncs *the same key*. Under the old schema this would
// either fail on PK conflict or attribute the chapter to foo. Under
// the new schema bar gets its own chapter row.
let bar_chapters = vec![SourceChapterRef {
source_chapter_key: "chapter-1".into(),
number: 1,
title: Some("Ch.1 (bar)".into()),
url: "https://x.example/bar/chapter-1/".into(),
}];
let diff2 = crawler::sync_manga_chapters(&pool, "target", up2.manga_id, &bar_chapters)
.await
.unwrap();
assert_eq!(
diff2.new, 1,
"manga bar: same key resolved per-manga to a fresh row"
);
let foo_count: (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM chapters WHERE manga_id = $1",
// Seed two mangas before "now" so a later run_started_at sees them as stale.
let _ = crawler::upsert_manga_from_source(
&pool,
"target",
"https://x.example/foo",
&sample_manga("foo", "Foo", "hf"),
)
.bind(up1.manga_id)
.fetch_one(&pool)
.await
.unwrap();
let bar_count: (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM chapters WHERE manga_id = $1",
let _ = crawler::upsert_manga_from_source(
&pool,
"target",
"https://x.example/bar",
&sample_manga("bar", "Bar", "hb"),
)
.bind(up2.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(foo_count.0, 1);
assert_eq!(bar_count.0, 1);
let bar_title: (Option<String>,) = sqlx::query_as(
"SELECT title FROM chapters WHERE manga_id = $1 AND number = 1",
// Now mark a new "run" beginning. Re-upsert only `foo` — `bar`
// should be the one flagged dropped.
let run_started = chrono::Utc::now();
// Sleep briefly so the second upsert's NOW() > run_started_at.
tokio::time::sleep(std::time::Duration::from_millis(20)).await;
let _ = crawler::upsert_manga_from_source(
&pool,
"target",
"https://x.example/foo",
&sample_manga("foo", "Foo", "hf"),
)
.bind(up2.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(
bar_title.0.as_deref(),
Some("Ch.1 (bar)"),
"bar's chapter has bar's title, not foo's"
);
// A subsequent re-sync of foo with the same key correctly refreshes
// foo's row, not bar's.
let foo_resync = vec![SourceChapterRef {
source_chapter_key: "chapter-1".into(),
number: 1,
title: Some("Ch.1 (foo updated)".into()),
url: "https://x.example/foo/chapter-1/".into(),
}];
let diff_refresh = crawler::sync_manga_chapters(&pool, "target", up1.manga_id, &foo_resync)
let n = crawler::mark_dropped_mangas(&pool, "target", run_started)
.await
.unwrap();
assert_eq!(diff_refresh.refreshed, 1);
assert_eq!(diff_refresh.new, 0);
assert_eq!(n, 1, "only bar should have been dropped");
let foo_title: (Option<String>,) = sqlx::query_as(
"SELECT title FROM chapters WHERE manga_id = $1 AND number = 1",
)
.bind(up1.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(foo_title.0.as_deref(), Some("Ch.1 (foo updated)"));
let bar_title_after: (Option<String>,) = sqlx::query_as(
"SELECT title FROM chapters WHERE manga_id = $1 AND number = 1",
)
.bind(up2.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(
bar_title_after.0.as_deref(),
Some("Ch.1 (bar)"),
"bar's row is untouched by foo's refresh"
);
}
#[sqlx::test(migrations = "./migrations")]
async fn sync_chapters_serializes_concurrent_calls_for_same_manga(pool: PgPool) {
// Without the per-manga advisory lock, two concurrent calls would
// both read `seen_keys`, both run the drop UPDATE filtered on `NOT
// (key = ANY $3)`, and the later commit could soft-drop a chapter
// the earlier had just inserted. The lock makes the calls strictly
// sequential per-manga: whichever runs second sees the first one's
// committed chapters and treats their absence as a "dropped" signal
// only if the second list legitimately omits them.
//
// Concretely: pre-state [A]. Call X syncs [A, B]; call Y syncs
// [A, B, C]. Whatever the schedule, the final state must include
// *all three* chapters because neither call legitimately omits the
// other's contribution — both lists are supersets of each other's
// pre-existing rows.
crawler::ensure_source(&pool, "target", "T", "https://x.example")
.await
.unwrap();
let m = sample_manga("foo", "Foo Manga", "hash-1");
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
.await
.unwrap();
let manga_id = up.manga_id;
// Pre-state: [A].
let pre = vec![SourceChapterRef {
source_chapter_key: "A".into(),
number: 1,
title: Some("Ch.A".into()),
url: "https://x.example/foo/A".into(),
}];
crawler::sync_manga_chapters(&pool, "target", manga_id, &pre)
.await
.unwrap();
// Two concurrent calls. Call X adds B; call Y adds B + C. Both keep
// A. Their drop branches would otherwise race against each other.
let list_x = vec![
SourceChapterRef {
source_chapter_key: "A".into(),
number: 1,
title: Some("Ch.A".into()),
url: "https://x.example/foo/A".into(),
},
SourceChapterRef {
source_chapter_key: "B".into(),
number: 2,
title: Some("Ch.B".into()),
url: "https://x.example/foo/B".into(),
},
];
let list_y = vec![
SourceChapterRef {
source_chapter_key: "A".into(),
number: 1,
title: Some("Ch.A".into()),
url: "https://x.example/foo/A".into(),
},
SourceChapterRef {
source_chapter_key: "B".into(),
number: 2,
title: Some("Ch.B".into()),
url: "https://x.example/foo/B".into(),
},
SourceChapterRef {
source_chapter_key: "C".into(),
number: 3,
title: Some("Ch.C".into()),
url: "https://x.example/foo/C".into(),
},
];
let pool_x = pool.clone();
let pool_y = pool.clone();
let (rx, ry) = tokio::join!(
tokio::spawn(async move {
crawler::sync_manga_chapters(&pool_x, "target", manga_id, &list_x).await
}),
tokio::spawn(async move {
crawler::sync_manga_chapters(&pool_y, "target", manga_id, &list_y).await
}),
);
rx.unwrap().expect("call X");
ry.unwrap().expect("call Y");
// All three keys must survive with dropped_at NULL — the lock
// ensures the later call sees the earlier one's INSERTs and the
// drop UPDATE finds nothing to drop.
let alive: Vec<String> = sqlx::query_scalar(
"SELECT cs.source_chapter_key \
FROM chapter_sources cs \
JOIN chapters ch ON ch.id = cs.chapter_id \
WHERE ch.manga_id = $1 AND cs.dropped_at IS NULL \
ORDER BY cs.source_chapter_key",
)
.bind(manga_id)
.fetch_all(&pool)
.await
.unwrap();
assert_eq!(
alive,
vec!["A".to_string(), "B".to_string(), "C".to_string()],
"all chapters survive concurrent syncs that both contain them"
);
let foo_dropped: (Option<chrono::DateTime<chrono::Utc>>,) =
sqlx::query_as("SELECT dropped_at FROM manga_sources WHERE source_manga_key = 'foo'")
.fetch_one(&pool)
.await
.unwrap();
assert!(foo_dropped.0.is_none(), "foo seen this run, must not be dropped");
let bar_dropped: (Option<chrono::DateTime<chrono::Utc>>,) =
sqlx::query_as("SELECT dropped_at FROM manga_sources WHERE source_manga_key = 'bar'")
.fetch_one(&pool)
.await
.unwrap();
assert!(bar_dropped.0.is_some());
}
#[sqlx::test(migrations = "./migrations")]
@@ -666,271 +440,6 @@ async fn arbitrary_genres_from_source_get_inserted(pool: PgPool) {
assert_eq!(webtoons_count.0, 1, "case-insensitive lookup reuses the existing row");
}
/// User-attached tags (rows with non-NULL `added_by` in `manga_tags`)
/// must survive a crawler upsert. The crawler owns source-attached tags
/// (added_by IS NULL); user attachments are owned by the user who made
/// them and the recurring metadata pass must not delete them.
#[sqlx::test(migrations = "./migrations")]
async fn sync_tags_preserves_user_attached_tags(pool: PgPool) {
crawler::ensure_source(&pool, "target", "T", "https://x.example")
.await
.unwrap();
let m = sample_manga("foo", "Foo Manga", "hash-1");
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
.await
.unwrap();
// A real user attaches a personal tag.
let user = mangalord::repo::user::create(&pool, "alice", "phc-stub")
.await
.unwrap();
let outcome = mangalord::repo::tag::attach_to_manga(&pool, up.manga_id, "personal", user.id)
.await
.unwrap();
assert!(outcome.created_attachment);
// Second crawler pass. Use a different metadata_hash so the upsert
// takes the Updated branch, but the bug also fires on Unchanged
// ticks since sync_tags runs unconditionally.
let mut m2 = m.clone();
m2.metadata_hash = "hash-2".into();
m2.tags = vec!["popular".into(), "weekly".into()];
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m2)
.await
.unwrap();
// The user tag must still be attached.
let user_tag_rows: (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM manga_tags mt \
JOIN tags t ON t.id = mt.tag_id \
WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \
AND mt.added_by = $2",
)
.bind(up.manga_id)
.bind(user.id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(
user_tag_rows.0, 1,
"user-attached tag must survive a crawler upsert"
);
// The source's tags should still attach as well, as crawler-owned.
let source_tag_rows: (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM manga_tags mt \
JOIN tags t ON t.id = mt.tag_id \
WHERE mt.manga_id = $1 \
AND mt.added_by IS NULL \
AND lower(t.name) IN ('popular', 'weekly')",
)
.bind(up.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(source_tag_rows.0, 2, "source tags re-attach on each pass");
// A subsequent pass where the source drops a previously-seen tag
// must clear that crawler-owned attachment (otherwise crawler-tags
// would only ever accumulate).
let mut m3 = m2.clone();
m3.metadata_hash = "hash-3".into();
m3.tags = vec!["popular".into()];
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m3)
.await
.unwrap();
let weekly_rows: (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM manga_tags mt \
JOIN tags t ON t.id = mt.tag_id \
WHERE mt.manga_id = $1 AND lower(t.name) = 'weekly'",
)
.bind(up.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(weekly_rows.0, 0, "source-owned tag dropped by source goes away");
// And the user tag still survives that third pass.
let user_tag_rows: (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM manga_tags mt \
JOIN tags t ON t.id = mt.tag_id \
WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \
AND mt.added_by = $2",
)
.bind(up.manga_id)
.bind(user.id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(user_tag_rows.0, 1);
}
/// `manga_tags.added_by` is `ON DELETE SET NULL` on the user FK. When
/// the attaching user is deleted, their attachments become orphans
/// indistinguishable from crawler-owned rows — and the crawler should
/// reap them on the next pass. Pins the semantic so a future change
/// can't quietly leave orphan rows lying around.
#[sqlx::test(migrations = "./migrations")]
async fn sync_tags_garbage_collects_orphan_user_attachments(pool: PgPool) {
crawler::ensure_source(&pool, "target", "T", "https://x.example")
.await
.unwrap();
let m = sample_manga("foo", "Foo", "hash-1");
let up = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m)
.await
.unwrap();
// A user attaches "personal", then the user gets deleted. The
// attachment row stays (manga_tags.manga_id FK is CASCADE on
// mangas only; we never CASCADE-delete user attachments). The FK
// on added_by is `ON DELETE SET NULL`, so the row's owner column
// goes NULL — same shape as a crawler-owned row.
let user = mangalord::repo::user::create(&pool, "bob", "phc-stub")
.await
.unwrap();
let _ = mangalord::repo::tag::attach_to_manga(&pool, up.manga_id, "personal", user.id)
.await
.unwrap();
sqlx::query("DELETE FROM users WHERE id = $1")
.bind(user.id)
.execute(&pool)
.await
.unwrap();
// Sanity: the orphan still exists post-user-delete with added_by NULL.
let (orphan_rows,): (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM manga_tags mt \
JOIN tags t ON t.id = mt.tag_id \
WHERE mt.manga_id = $1 AND lower(t.name) = 'personal' \
AND mt.added_by IS NULL",
)
.bind(up.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(orphan_rows, 1);
// Next crawler pass — orphan should be reaped along with any
// other source-owned rows that aren't in the new tag list.
let mut m2 = m.clone();
m2.metadata_hash = "hash-2".into();
m2.tags = vec!["popular".into()];
let _ = crawler::upsert_manga_from_source(&pool, "target", "https://x.example/foo", &m2)
.await
.unwrap();
let (orphan_rows,): (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM manga_tags mt \
JOIN tags t ON t.id = mt.tag_id \
WHERE mt.manga_id = $1 AND lower(t.name) = 'personal'",
)
.bind(up.manga_id)
.fetch_one(&pool)
.await
.unwrap();
assert_eq!(orphan_rows, 0, "orphan user-attached tag should be reaped");
}
// ---- list_missing_covers ---------------------------------------------------
/// `list_missing_covers` must report only mangas with no cover path set.
#[sqlx::test(migrations = "./migrations")]
async fn list_missing_covers_only_returns_rows_without_cover(pool: PgPool) {
    const SOURCE: &str = "target";

    crawler::ensure_source(&pool, SOURCE, "T", "https://x.example")
        .await
        .unwrap();

    let covered_manga = sample_manga("with", "With Cover", "h1");
    let uncovered_manga = sample_manga("without", "No Cover", "h2");

    let covered =
        crawler::upsert_manga_from_source(&pool, SOURCE, "https://x.example/with", &covered_manga)
            .await
            .unwrap();
    let uncovered = crawler::upsert_manga_from_source(
        &pool,
        SOURCE,
        "https://x.example/without",
        &uncovered_manga,
    )
    .await
    .unwrap();

    // Give only the first manga a cover by writing the column directly.
    sqlx::query("UPDATE mangas SET cover_image_path = 'mangas/x/cover.jpg' WHERE id = $1")
        .bind(covered.manga_id)
        .execute(&pool)
        .await
        .unwrap();

    let missing = crawler::list_missing_covers(&pool, 50).await.unwrap();
    assert_eq!(missing.len(), 1, "exactly the manga without a cover");

    // The single entry must describe the cover-less manga and carry the
    // source key and URL it was upserted with.
    let only = &missing[0];
    assert_eq!(only.manga_id, uncovered.manga_id);
    assert_eq!(only.source_manga_key, "without");
    assert_eq!(only.source_url, "https://x.example/without");
}
/// A manga whose only source row is soft-dropped must not be offered for
/// cover backfill.
#[sqlx::test(migrations = "./migrations")]
async fn list_missing_covers_skips_dropped_source_rows(pool: PgPool) {
    const SOURCE: &str = "target";

    crawler::ensure_source(&pool, SOURCE, "T", "https://x.example")
        .await
        .unwrap();

    let manga = sample_manga("foo", "Foo", "h1");
    let upserted =
        crawler::upsert_manga_from_source(&pool, SOURCE, "https://x.example/foo", &manga)
            .await
            .unwrap();

    // Soft-drop the manga's source row.
    sqlx::query("UPDATE manga_sources SET dropped_at = NOW() WHERE manga_id = $1")
        .bind(upserted.manga_id)
        .execute(&pool)
        .await
        .unwrap();

    let missing = crawler::list_missing_covers(&pool, 50).await.unwrap();
    assert!(
        missing.is_empty(),
        "dropped-source mangas must not be backfilled — no live source to fetch from"
    );
}
/// The `limit` argument caps how many entries `list_missing_covers` returns.
#[sqlx::test(migrations = "./migrations")]
async fn list_missing_covers_respects_limit(pool: PgPool) {
    const SOURCE: &str = "target";

    crawler::ensure_source(&pool, SOURCE, "T", "https://x.example")
        .await
        .unwrap();

    // Seed five cover-less mangas: m0..m4.
    for idx in 0..5 {
        let key = format!("m{idx}");
        let title = format!("M{idx}");
        let hash = format!("h{idx}");
        let url = format!("https://x.example/{key}");
        let manga = sample_manga(&key, &title, &hash);
        let _ = crawler::upsert_manga_from_source(&pool, SOURCE, &url, &manga)
            .await
            .unwrap();
    }

    // Ask for fewer than exist.
    let missing = crawler::list_missing_covers(&pool, 3).await.unwrap();
    assert_eq!(missing.len(), 3, "limit caps the result set");
}
/// A manga reachable through two sources yields a single backfill entry.
///
/// Otherwise the per-tick cap could be eaten by duplicates and starve
/// other mangas.
#[sqlx::test(migrations = "./migrations")]
async fn list_missing_covers_deduplicates_per_manga(pool: PgPool) {
    const PRIMARY: &str = "src-a";
    const SECONDARY: &str = "src-b";

    crawler::ensure_source(&pool, PRIMARY, "A", "https://a.example")
        .await
        .unwrap();
    crawler::ensure_source(&pool, SECONDARY, "B", "https://b.example")
        .await
        .unwrap();

    let manga = sample_manga("foo", "Foo", "h1");
    let upserted =
        crawler::upsert_manga_from_source(&pool, PRIMARY, "https://a.example/foo", &manga)
            .await
            .unwrap();

    // Attach the second source to the SAME manga row by inserting a
    // manga_sources row directly.
    sqlx::query(
        "INSERT INTO manga_sources (source_id, source_manga_key, manga_id, source_url) \
         VALUES ($1, $2, $3, $4)",
    )
    .bind(SECONDARY)
    .bind("foo-on-b")
    .bind(upserted.manga_id)
    .bind("https://b.example/foo")
    .execute(&pool)
    .await
    .unwrap();

    let missing = crawler::list_missing_covers(&pool, 50).await.unwrap();
    assert_eq!(missing.len(), 1, "DISTINCT ON (m.id) collapses duplicate source rows");
}
#[sqlx::test(migrations = "./migrations")]
async fn re_appearing_manga_clears_dropped_at(pool: PgPool) {
crawler::ensure_source(&pool, "target", "T", "https://x.example")
@@ -962,261 +471,3 @@ async fn re_appearing_manga_clears_dropped_at(pool: PgPool) {
assert!(dropped.0.is_none());
assert_eq!(dropped.1, up.manga_id);
}
// ---- source_index: site-order preservation ----
//
// The user-facing chapter list reverses the source-site order so that
// the oldest chapter appears first. The crawler records each row's DOM
// position in `chapters.source_index` (0 = first in source DOM = newest
// on this site) on every sync; the list query orders by source_index
// DESC NULLS LAST, falling through to number/created_at for rows with
// no source row (e.g. user uploads).
/// On first sync every chapter's `source_index` equals its position in the
/// slice handed to `sync_manga_chapters`.
#[sqlx::test(migrations = "./migrations")]
async fn source_index_set_on_insert_matches_dom_order(pool: PgPool) {
    const SOURCE: &str = "target";
    // Returns (source key, stored source_index) pairs, ordered by key.
    const INDEX_BY_KEY_SQL: &str = "SELECT cs.source_chapter_key, c.source_index \
        FROM chapters c \
        JOIN chapter_sources cs ON cs.chapter_id = c.id \
        WHERE c.manga_id = $1 \
        ORDER BY cs.source_chapter_key";

    crawler::ensure_source(&pool, SOURCE, "T", "https://x.example")
        .await
        .unwrap();
    let manga = sample_manga("foo", "Foo Manga", "hash-1");
    let upserted =
        crawler::upsert_manga_from_source(&pool, SOURCE, "https://x.example/foo", &manga)
            .await
            .unwrap();

    // Input order is a, b, c with descending chapter numbers.
    let listing = vec![
        SourceChapterRef {
            source_chapter_key: "a".into(),
            number: 30,
            title: Some("Ch.30".into()),
            url: "https://x.example/foo/a".into(),
        },
        SourceChapterRef {
            source_chapter_key: "b".into(),
            number: 29,
            title: Some("Ch.29".into()),
            url: "https://x.example/foo/b".into(),
        },
        SourceChapterRef {
            source_chapter_key: "c".into(),
            number: 28,
            title: Some("Ch.28".into()),
            url: "https://x.example/foo/c".into(),
        },
    ];
    crawler::sync_manga_chapters(&pool, SOURCE, upserted.manga_id, &listing)
        .await
        .unwrap();

    let stored: Vec<(String, Option<i32>)> = sqlx::query_as(INDEX_BY_KEY_SQL)
        .bind(upserted.manga_id)
        .fetch_all(&pool)
        .await
        .unwrap();

    let expected = vec![
        ("a".to_owned(), Some(0)),
        ("b".to_owned(), Some(1)),
        ("c".to_owned(), Some(2)),
    ];
    assert_eq!(
        stored, expected,
        "source_index reflects enumerate() position in the input slice",
    );
}
/// A re-sync whose listing gains a chapter at the front must renumber the
/// `source_index` of the rows that already existed.
#[sqlx::test(migrations = "./migrations")]
async fn source_index_rewritten_on_resync_when_new_chapter_prepended(pool: PgPool) {
    const SOURCE: &str = "target";
    // Returns (source key, stored source_index) pairs, ordered by key.
    const INDEX_BY_KEY_SQL: &str = "SELECT cs.source_chapter_key, c.source_index \
        FROM chapters c \
        JOIN chapter_sources cs ON cs.chapter_id = c.id \
        WHERE c.manga_id = $1 \
        ORDER BY cs.source_chapter_key";

    crawler::ensure_source(&pool, SOURCE, "T", "https://x.example")
        .await
        .unwrap();
    let manga = sample_manga("foo", "Foo Manga", "hash-1");
    let upserted =
        crawler::upsert_manga_from_source(&pool, SOURCE, "https://x.example/foo", &manga)
            .await
            .unwrap();

    // First sync: just a and b.
    let initial_listing = vec![
        SourceChapterRef {
            source_chapter_key: "a".into(),
            number: 1,
            title: Some("Ch.1".into()),
            url: "https://x.example/foo/a".into(),
        },
        SourceChapterRef {
            source_chapter_key: "b".into(),
            number: 2,
            title: Some("Ch.2".into()),
            url: "https://x.example/foo/b".into(),
        },
    ];
    crawler::sync_manga_chapters(&pool, SOURCE, upserted.manga_id, &initial_listing)
        .await
        .unwrap();

    // Second sync: a brand-new chapter shows up at the top of the source
    // (the site lists newest first). Every existing row has to shift its
    // source_index down by one so the display order stays correct.
    let updated_listing = vec![
        SourceChapterRef {
            source_chapter_key: "new".into(),
            number: 3,
            title: Some("Ch.3".into()),
            url: "https://x.example/foo/new".into(),
        },
        SourceChapterRef {
            source_chapter_key: "a".into(),
            number: 1,
            title: Some("Ch.1".into()),
            url: "https://x.example/foo/a".into(),
        },
        SourceChapterRef {
            source_chapter_key: "b".into(),
            number: 2,
            title: Some("Ch.2".into()),
            url: "https://x.example/foo/b".into(),
        },
    ];
    crawler::sync_manga_chapters(&pool, SOURCE, upserted.manga_id, &updated_listing)
        .await
        .unwrap();

    let stored: Vec<(String, Option<i32>)> = sqlx::query_as(INDEX_BY_KEY_SQL)
        .bind(upserted.manga_id)
        .fetch_all(&pool)
        .await
        .unwrap();

    let expected = vec![
        ("a".to_owned(), Some(1)),
        ("b".to_owned(), Some(2)),
        ("new".to_owned(), Some(0)),
    ];
    assert_eq!(
        stored, expected,
        "new chapter takes index 0, existing rows shift down on UPDATE",
    );
}
/// `list_for_manga` returns crawled chapters in the reverse of the order
/// they were synced in, regardless of their chapter numbers.
#[sqlx::test(migrations = "./migrations")]
async fn list_for_manga_returns_source_order_reversed(pool: PgPool) {
    const SOURCE: &str = "target";

    crawler::ensure_source(&pool, SOURCE, "T", "https://x.example")
        .await
        .unwrap();
    let manga = sample_manga("foo", "Foo Manga", "hash-1");
    let upserted =
        crawler::upsert_manga_from_source(&pool, SOURCE, "https://x.example/foo", &manga)
            .await
            .unwrap();

    // Site DOM order, top-down (newest first):
    //   ch11   (number = 11)
    //   notice (number = 0, non-numeric label on the site)
    //   ch10   (number = 10)
    // The numbers deliberately disagree with the DOM order. Sorting by
    // number would put the notice first, whereas the site places it
    // between ch10 and ch11. Reversed-DOM display should therefore give
    // [ch10, notice, ch11].
    let dom_order = vec![
        SourceChapterRef {
            source_chapter_key: "ch11".into(),
            number: 11,
            title: Some("Ch.11 : Official".into()),
            url: "https://x.example/foo/11".into(),
        },
        SourceChapterRef {
            source_chapter_key: "notice".into(),
            number: 0,
            title: Some("notice. : Officials".into()),
            url: "https://x.example/foo/notice".into(),
        },
        SourceChapterRef {
            source_chapter_key: "ch10".into(),
            number: 10,
            title: Some("Ch.10 : Official".into()),
            url: "https://x.example/foo/10".into(),
        },
    ];
    crawler::sync_manga_chapters(&pool, SOURCE, upserted.manga_id, &dom_order)
        .await
        .unwrap();

    let listed = chapter_repo::list_for_manga(&pool, upserted.manga_id, 50, 0)
        .await
        .unwrap();

    // Compare by title; a missing title collapses to the empty string.
    let mut titles: Vec<String> = Vec::with_capacity(listed.len());
    for chapter in listed.iter() {
        titles.push(chapter.title.clone().unwrap_or_default());
    }

    let expected = vec![
        "Ch.10 : Official".to_owned(),
        "notice. : Officials".to_owned(),
        "Ch.11 : Official".to_owned(),
    ];
    assert_eq!(
        titles, expected,
        "list returns chapters in reversed source-DOM order, so the \
         oldest appears first and non-numeric entries land where the \
         site placed them",
    );
}
/// Chapters created outside the crawler are listed after all crawled
/// chapters, even when their number falls in between.
#[sqlx::test(migrations = "./migrations")]
async fn list_for_manga_places_null_source_index_last(pool: PgPool) {
    const SOURCE: &str = "target";

    crawler::ensure_source(&pool, SOURCE, "T", "https://x.example")
        .await
        .unwrap();
    let manga = sample_manga("foo", "Foo Manga", "hash-1");
    let upserted =
        crawler::upsert_manga_from_source(&pool, SOURCE, "https://x.example/foo", &manga)
            .await
            .unwrap();

    // The crawled chapters receive source_index 0 and 1, while the upload
    // path leaves it NULL. NULLS LAST plus the (number, created_at) tail
    // puts the upload after both crawled rows even though its number (2)
    // sits between theirs.
    let crawled_listing = vec![
        SourceChapterRef {
            source_chapter_key: "a".into(),
            number: 1,
            title: Some("Ch.1".into()),
            url: "https://x.example/foo/a".into(),
        },
        SourceChapterRef {
            source_chapter_key: "b".into(),
            number: 3,
            title: Some("Ch.3".into()),
            url: "https://x.example/foo/b".into(),
        },
    ];
    crawler::sync_manga_chapters(&pool, SOURCE, upserted.manga_id, &crawled_listing)
        .await
        .unwrap();

    // Created through the chapter repo rather than through a crawler sync.
    chapter_repo::create(&pool, upserted.manga_id, 2, Some("User upload Ch.2"), None)
        .await
        .unwrap();

    let listed = chapter_repo::list_for_manga(&pool, upserted.manga_id, 50, 0)
        .await
        .unwrap();

    // Compare by title; a missing title collapses to the empty string.
    let mut titles: Vec<String> = Vec::with_capacity(listed.len());
    for chapter in listed.iter() {
        titles.push(chapter.title.clone().unwrap_or_default());
    }

    let expected = vec![
        "Ch.3".to_owned(),
        "Ch.1".to_owned(),
        "User upload Ch.2".to_owned(),
    ];
    assert_eq!(
        titles, expected,
        "crawled rows ordered by reversed source_index; user upload \
         (NULL source_index) falls through to the end",
    );
}

View File

@@ -1,162 +0,0 @@
//! Integration tests for `repo::chapter` — focused on
//! `dispatch_target`, the resolver the daemon's chapter dispatcher
//! uses to look up the URL it needs to hand to
//! `content::sync_chapter_content`.
//!
//! The query must:
//! 1. Skip `chapter_sources` rows where `dropped_at IS NOT NULL` —
//! otherwise a soft-dropped source URL is dispatched as if live and
//! burns the chapter's retry budget against guaranteed transients.
//! 2. Order the remaining rows by `last_seen_at DESC` so the freshest
//! surviving source is the one we'll fetch from.
//!
//! The fix lives in `backend/src/repo/chapter.rs:dispatch_target`. The
//! enqueue queries at `pipeline.rs:381` and `:435` already filter on
//! `cs.dropped_at IS NULL`; this brings the resolver into line.
use mangalord::crawler::source::{SourceChapterRef, SourceManga};
use mangalord::repo::{
chapter::dispatch_target,
crawler::{ensure_source, sync_manga_chapters, upsert_manga_from_source},
};
use sqlx::PgPool;
use uuid::Uuid;
/// Builds a minimal `SourceManga` fixture.
///
/// Only the source key, title and metadata hash are taken from the
/// arguments; every list field is empty and every optional field is `None`.
fn sample_manga(key: &str, title: &str, hash: &str) -> SourceManga {
    SourceManga {
        // Caller-supplied identity fields.
        source_manga_key: key.to_owned(),
        title: title.to_owned(),
        metadata_hash: hash.to_owned(),
        // Everything else is left blank.
        alternative_titles: Vec::new(),
        authors: Vec::new(),
        genres: Vec::new(),
        tags: Vec::new(),
        chapters: Vec::new(),
        status: None,
        summary: None,
        cover_url: None,
    }
}
/// Seeds one manga with one chapter that is reachable through two live
/// `chapter_sources` rows, and returns `(chapter_id, old_url, new_url)`.
///
/// The "target" row is backdated by two days and the "mirror" row is
/// stamped `NOW()`, so the `ORDER BY cs.last_seen_at DESC` branch of the
/// fixed query can tell "freshest live source" apart from "any live
/// source". `old_url` is the target row's URL, `new_url` the mirror's.
async fn seed_chapter_with_two_live_sources(pool: &PgPool) -> (Uuid, String, String) {
    const TARGET_CHAPTER_URL: &str = "https://x.example/foo/1/old";
    const MIRROR_CHAPTER_URL: &str = "https://m.example/foo/1/mirror";

    // Two distinct sources pointing at the same chapter is the realistic
    // shape of the multi-source state: after migration 0017 each source
    // row is keyed (source_id, chapter_id).
    ensure_source(pool, "target", "T", "https://x.example")
        .await
        .unwrap();
    ensure_source(pool, "mirror", "Mirror", "https://m.example")
        .await
        .unwrap();

    let manga = sample_manga("foo", "Foo Manga", "hash-1");
    let upserted = upsert_manga_from_source(pool, "target", "https://x.example/foo", &manga)
        .await
        .unwrap();

    // Create the chapter through a normal sync against the target source.
    let first_listing = vec![SourceChapterRef {
        source_chapter_key: "1".into(),
        number: 1,
        title: Some("Ch.1".into()),
        url: TARGET_CHAPTER_URL.into(),
    }];
    sync_manga_chapters(pool, "target", upserted.manga_id, &first_listing)
        .await
        .unwrap();

    let (chapter_id,): (Uuid,) = sqlx::query_as(
        "SELECT c.id FROM chapters c \
         JOIN chapter_sources cs ON cs.chapter_id = c.id \
         WHERE cs.source_chapter_key = '1' AND cs.source_id = 'target'",
    )
    .fetch_one(pool)
    .await
    .unwrap();

    // Backdate the existing target row so the mirror row added next is
    // strictly fresher; `last_seen_at DESC` then breaks the tie
    // deterministically.
    sqlx::query(
        "UPDATE chapter_sources \
         SET last_seen_at = NOW() - INTERVAL '2 days' \
         WHERE chapter_id = $1 AND source_id = 'target'",
    )
    .bind(chapter_id)
    .execute(pool)
    .await
    .unwrap();

    let mirror_url = MIRROR_CHAPTER_URL.to_string();
    sqlx::query(
        "INSERT INTO chapter_sources \
         (source_id, chapter_id, source_chapter_key, source_url, last_seen_at) \
         VALUES ('mirror', $1, '1', $2, NOW())",
    )
    .bind(chapter_id)
    .bind(&mirror_url)
    .execute(pool)
    .await
    .unwrap();

    (chapter_id, TARGET_CHAPTER_URL.to_string(), mirror_url)
}
/// With two live sources, `dispatch_target` must return the URL of the one
/// seen most recently.
#[sqlx::test(migrations = "./migrations")]
async fn dispatch_target_prefers_most_recent_live_source(pool: PgPool) {
    // The seed backdates the target row and stamps the mirror row NOW(),
    // so the mirror URL is the freshest.
    let (chapter_id, _, freshest_url) = seed_chapter_with_two_live_sources(&pool).await;

    let resolved = dispatch_target(&pool, chapter_id).await.unwrap();
    let (_, source_url, _, _) =
        resolved.expect("two live sources should yield a dispatch target");

    assert_eq!(
        source_url, freshest_url,
        "ORDER BY last_seen_at DESC LIMIT 1 must return the freshest source"
    );
}
/// When the freshest source row is soft-dropped, `dispatch_target` must
/// fall back to the older row that is still live.
#[sqlx::test(migrations = "./migrations")]
async fn dispatch_target_skips_dropped_sources(pool: PgPool) {
    let (chapter_id, old_url, new_url) =
        seed_chapter_with_two_live_sources(&pool).await;

    // Soft-drop the fresher row. The dispatcher must now return the
    // *older* still-live row instead of the dropped one.
    sqlx::query(
        "UPDATE chapter_sources SET dropped_at = NOW() WHERE source_url = $1",
    )
    .bind(&new_url)
    .execute(&pool)
    .await
    .unwrap();

    let row = dispatch_target(&pool, chapter_id).await.unwrap();
    let (_manga_id, source_url, _title, _number) =
        row.expect("a single live source should still yield a dispatch target");

    assert_ne!(
        source_url, new_url,
        "dispatch_target must not return a dropped source"
    );
    // Not being the dropped URL is necessary but not sufficient: pin the
    // result to the one remaining live row so an unrelated or garbage URL
    // cannot slip through.
    assert_eq!(
        source_url, old_url,
        "dispatch_target must fall back to the surviving live source"
    );
}
/// If every source row of a chapter is soft-dropped, there is nothing to
/// dispatch and `dispatch_target` must yield `None`.
#[sqlx::test(migrations = "./migrations")]
async fn dispatch_target_returns_none_when_only_dropped_sources_remain(pool: PgPool) {
    let (chapter_id, _, _) = seed_chapter_with_two_live_sources(&pool).await;

    // Soft-drop both source rows of the chapter in one statement.
    sqlx::query("UPDATE chapter_sources SET dropped_at = NOW() WHERE chapter_id = $1")
        .bind(chapter_id)
        .execute(&pool)
        .await
        .unwrap();

    let resolved = dispatch_target(&pool, chapter_id).await.unwrap();
    assert!(
        resolved.is_none(),
        "every source is dropped — dispatch_target must return None"
    );
}

View File

@@ -17,28 +17,5 @@ services:
timeout: 5s
retries: 10
# Optional: TOR daemon for crawler dev. Ports bind to 127.0.0.1 only
# — never the LAN — so a native `cargo run` on the host can reach
# 127.0.0.1:9050 / 9051. Mirrors the prod tor service (see
# docker-compose.yml), just with host-loopback ports and a default
# password baked in for friction-free dev.
tor:
image: dockurr/tor:latest
entrypoint: ["/bin/sh", "/usr/local/bin/mangalord-entrypoint.sh"]
environment:
PASSWORD: ${TOR_CONTROL_PASSWORD:-dev-tor-password}
volumes:
- ./tor/torrc:/etc/tor/torrc:ro
- ./tor/entrypoint.sh:/usr/local/bin/mangalord-entrypoint.sh:ro
ports:
- "127.0.0.1:9050:9050"
- "127.0.0.1:9051:9051"
healthcheck:
test: ["CMD-SHELL", "nc -z 127.0.0.1 9050 && nc -z 127.0.0.1 9051"]
interval: 5s
timeout: 5s
retries: 20
start_period: 30s
volumes:
mangalord-postgres-dev:

View File

@@ -1,15 +1,9 @@
# Production-like compose. Requires a populated `.env` next to this
# file: at minimum POSTGRES_PASSWORD must be set to a non-default
# value (the `?required` form below fails fast otherwise). The
# frontend container expects HTTPS in front (Caddy/Traefik/nginx)
# because, with COOKIE_SECURE=true, browsers will refuse to send the
# session cookie over plain HTTP.
services:
postgres:
image: postgres:16-alpine
environment:
POSTGRES_USER: ${POSTGRES_USER:-mangalord}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set in .env}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-mangalord}
POSTGRES_DB: ${POSTGRES_DB:-mangalord}
volumes:
- postgres-data:/var/lib/postgresql/data
@@ -19,50 +13,13 @@ services:
timeout: 5s
retries: 10
tor:
# SOCKS5 proxy for the crawler, plus a control port so the backend
# can signal NEWNYM on bad pages. See tor/torrc for the daemon
# config; both ports are only `expose`d (compose-internal), never
# bound on the host.
#
# We bypass dockurr/tor's stock entrypoint because it binds the
# control port to localhost (unreachable from the backend
# container) and skips its own HashedControlPassword injection
# when the user's torrc declares a ControlPort. Our wrapper
# (tor/entrypoint.sh) generates the hash from $PASSWORD and execs
# tor with our torrc. Backend authenticates with the same plain
# string via CRAWLER_TOR_CONTROL_PASSWORD.
image: dockurr/tor:latest
entrypoint: ["/bin/sh", "/usr/local/bin/mangalord-entrypoint.sh"]
environment:
PASSWORD: ${TOR_CONTROL_PASSWORD:?TOR_CONTROL_PASSWORD must be set in .env}
volumes:
- ./tor/torrc:/etc/tor/torrc:ro
- ./tor/entrypoint.sh:/usr/local/bin/mangalord-entrypoint.sh:ro
expose:
- "9050"
- "9051"
# Wait for both control + SOCKS ports to listen before downstream
# services start. dockurr/tor's main process spawns before tor
# itself is bound, so `service_started` alone races the first
# NEWNYM call.
healthcheck:
test: ["CMD-SHELL", "nc -z 127.0.0.1 9050 && nc -z 127.0.0.1 9051"]
interval: 5s
timeout: 5s
retries: 20
start_period: 30s
restart: unless-stopped
backend:
build: ./backend
depends_on:
postgres:
condition: service_healthy
tor:
condition: service_healthy
environment:
DATABASE_URL: postgres://${POSTGRES_USER:-mangalord}:${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set in .env}@postgres:5432/${POSTGRES_DB:-mangalord}
DATABASE_URL: postgres://${POSTGRES_USER:-mangalord}:${POSTGRES_PASSWORD:-mangalord}@postgres:5432/${POSTGRES_DB:-mangalord}
BIND_ADDRESS: 0.0.0.0:8080
STORAGE_DIR: /var/lib/mangalord/storage
RUST_LOG: ${RUST_LOG:-info,mangalord=debug}
@@ -76,21 +33,6 @@ services:
# Upload limits.
MAX_REQUEST_BYTES: ${MAX_REQUEST_BYTES:-209715200}
MAX_FILE_BYTES: ${MAX_FILE_BYTES:-20971520}
# System-chromium override for the crawler. Leave blank to use the
# bundled fetcher; set to e.g. /usr/bin/chromium-headless-shell on
# arm64 deployments. Pair with `--build-arg INSTALL_CHROMIUM=true`
# so the image actually contains the binary.
CRAWLER_CHROMIUM_BINARY: ${CRAWLER_CHROMIUM_BINARY:-}
# TOR proxy + NEWNYM recircuit (see .env.example for details).
# Defaults assume the bundled `tor` service above; override
# CRAWLER_PROXY= and CRAWLER_TOR_CONTROL_URL= (both empty) in
# .env to disable. CRAWLER_TOR_CONTROL_PASSWORD MUST match the
# tor service's PASSWORD (both wired to the same TOR_CONTROL_PASSWORD
# .env var below).
CRAWLER_PROXY: ${CRAWLER_PROXY-socks5h://tor:9050}
CRAWLER_TOR_CONTROL_URL: ${CRAWLER_TOR_CONTROL_URL-tcp://tor:9051}
CRAWLER_TOR_CONTROL_PASSWORD: ${TOR_CONTROL_PASSWORD:?TOR_CONTROL_PASSWORD must be set in .env}
CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS: ${CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS:-3}
volumes:
- storage-data:/var/lib/mangalord/storage
# No host port mapping in the default setup — the frontend proxies

View File

@@ -1,11 +1,7 @@
FROM node:22-alpine AS builder
WORKDIR /app
COPY package.json package-lock.json* ./
# `npm ci` installs the locked versions exactly; `npm install` would
# silently rewrite package-lock.json mid-build. CI (.gitea/workflows)
# also uses `npm ci`, so this keeps the image build deterministic and
# matches what the test job validated.
RUN npm ci
RUN npm install
COPY . .
RUN npm run build
@@ -14,22 +10,8 @@ WORKDIR /app
ENV NODE_ENV=production
ENV HOST=0.0.0.0
ENV PORT=3000
# node:22-alpine ships a `node` user (UID 1000); use it instead of
# running the SvelteKit server as root.
COPY --from=builder --chown=node:node /app/build ./build
COPY --from=builder --chown=node:node /app/node_modules ./node_modules
COPY --from=builder --chown=node:node /app/package.json ./
USER node
COPY --from=builder /app/build ./build
COPY --from=builder /app/node_modules ./node_modules
COPY --from=builder /app/package.json ./
EXPOSE 3000
# Alpine's busybox `wget` is the canonical lightweight HTTP probe. Probe
# 127.0.0.1, not `localhost`: musl resolves `localhost` to IPv6 ::1 first,
# but the Node server binds IPv4 0.0.0.0 only, so a localhost probe gets
# "connection refused" and the container is wrongly marked unhealthy. Use a
# GET (`-O /dev/null`) since `node build` serves 200 on `/`.
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD wget -q -O /dev/null http://127.0.0.1:3000/ || exit 1
CMD ["node", "build"]

View File

@@ -10,15 +10,6 @@ import { test, expect, type Page } from '@playwright/test';
const emptyPage = { items: [], page: { limit: 50, offset: 0, total: null } };
async function mockAnonymous(page: Page) {
// Force public mode so the root +layout.ts doesn't bounce us to /login
// (a dev backend with PRIVATE_MODE=true must not leak into E2E runs).
await page.route('**/api/v1/auth/config', async (route) => {
await route.fulfill({
status: 200,
contentType: 'application/json',
body: JSON.stringify({ self_register_enabled: true, private_mode: false })
});
});
await page.route('**/api/v1/auth/me', async (route) => {
await route.fulfill({
status: 401,
@@ -78,53 +69,3 @@ test('search updates the manga list', async ({ page }) => {
await expect(page.getByTestId('manga-list')).toContainText('Berserk');
expect(lastSearch).toBe('berserk');
});
/**
 * Pagination e2e: with 75 mocked mangas and a page size of 50, clicking
 * "Next" must load the second page, reflect it in the URL (`page=2`) and
 * update the "Showing … of 75" summary.
 */
test('clicking Next paginates to page 2 and updates the URL', async ({ page }) => {
  await mockAnonymous(page);

  // Fake a catalogue of 75 mangas; page 1 is ids 1..50, page 2 is ids 51..75.
  const TOTAL = 75;
  function mangaAt(i: number) {
    return {
      id: `m${i}`,
      title: `Manga ${i}`,
      author: 'Test',
      description: null,
      cover_image_path: null,
      created_at: '2026-01-01T00:00:00Z',
      updated_at: '2026-01-01T00:00:00Z',
      authors: [],
      genres: []
    };
  }

  // Serve the slice selected by the request's limit/offset query params.
  await page.route('**/api/v1/mangas*', async (route) => {
    const url = new URL(route.request().url());
    const limit = Number(url.searchParams.get('limit') ?? '50');
    const offset = Number(url.searchParams.get('offset') ?? '0');
    const items: ReturnType<typeof mangaAt>[] = [];
    for (let i = offset + 1; i <= Math.min(offset + limit, TOTAL); i++) {
      items.push(mangaAt(i));
    }
    await route.fulfill({
      status: 200,
      contentType: 'application/json',
      body: JSON.stringify({
        items,
        page: { limit, offset, total: TOTAL }
      })
    });
  });

  await page.goto('/');
  // The summary shows a range ("1<sep>50 of 75"). Match the separator
  // loosely: a literal without it ("150 of 75") can never be a valid
  // range, and the exact dash character used by the UI is not pinned here.
  await expect(page.getByTestId('manga-total')).toContainText(/Showing 1\D+50 of 75/);
  await expect(page.getByTestId('manga-list')).toContainText('Manga 1');
  await expect(page.getByTestId('manga-list')).not.toContainText('Manga 75');

  await page.getByTestId('manga-pager').getByRole('button', { name: /next/i }).click();

  await expect(page).toHaveURL(/[?&]page=2(&|$)/);
  await expect(page.getByTestId('manga-total')).toContainText(/Showing 51\D+75 of 75/);
  await expect(page.getByTestId('manga-list')).toContainText('Manga 75');
  await expect(page.getByTestId('manga-list')).not.toContainText('Manga 1');
});

View File

@@ -1,67 +0,0 @@
import { test, expect, type Page } from '@playwright/test';
// Guards the title-on-nav behavior: without this, a stale title from
// the last manga / author page lingers when the user navigates to a
// generic page like /upload.
/**
 * Installs network stubs for an anonymous visitor: the auth config reports
 * `private_mode: false`, `/auth/me` answers 401, and the manga listing is
 * empty.
 */
async function mockAnonymous(page: Page) {
  // Builds the fulfill options for a JSON response.
  const jsonReply = (status: number, payload: unknown) => ({
    status,
    contentType: 'application/json',
    body: JSON.stringify(payload)
  });

  await page.route('**/api/v1/auth/config', (route) =>
    route.fulfill(jsonReply(200, { self_register_enabled: true, private_mode: false }))
  );
  await page.route('**/api/v1/auth/me', (route) =>
    route.fulfill(
      jsonReply(401, { error: { code: 'unauthenticated', message: 'unauthenticated' } })
    )
  );
  await page.route('**/api/v1/mangas*', (route) =>
    route.fulfill(jsonReply(200, { items: [], page: { limit: 50, offset: 0, total: 0 } }))
  );
}
// Each static route must render its mapped document title.
test('static route titles use the brand-first layout map', async ({ page }) => {
  await mockAnonymous(page);

  // [route, expected document title] pairs, visited in this order.
  const expectedTitles: ReadonlyArray<readonly [string, string]> = [
    ['/', 'Mangalord'],
    ['/upload', 'Mangalord | Upload'],
    ['/login', 'Mangalord | Login'],
    ['/bookmarks', 'Mangalord | Bookmarks'],
    ['/collections', 'Mangalord | Collections']
  ];

  for (const [route, title] of expectedTitles) {
    await page.goto(route);
    await expect(page).toHaveTitle(title);
  }
});
// Regression guard: a title left behind by a content page must be replaced
// once the user reaches a generic page such as /upload.
test('title updates when navigating away from a content page', async ({ page }) => {
  await mockAnonymous(page);
  await page.goto('/');

  // The manga detail page is not mocked here, so plant the title it would
  // have set directly on the document.
  const staleTitle = 'Mangalord | Berserk';
  await page.evaluate((title) => {
    document.title = title;
  }, staleTitle);
  const plantedTitle = await page.title();
  expect(plantedTitle).toBe(staleTitle);

  // NOTE(review): `page.goto` looks like a full document load rather than
  // a client-side navigation — confirm this still exercises the layout
  // path that is supposed to reassert the mapped title.
  await page.goto('/upload');
  await expect(page).toHaveTitle('Mangalord | Upload');
});

View File

@@ -1,101 +0,0 @@
import { test, expect, type Page } from '@playwright/test';
// Network-level mocks for the private-mode UX. The backend integration
// tests (api_private_mode.rs) cover the actual gate; here we only
// verify that the SvelteKit universal load redirects anonymous
// visitors to /login and then back to where they were going.
// Non-admin user returned by the stubbed `/auth/me` (once logged in) and
// `/auth/login` routes below.
const userFixture = {
id: 'user-1',
username: 'alice',
created_at: '2026-01-01T00:00:00Z',
is_admin: false
};
// Empty paged-list envelope served by the stubbed manga listing route.
const emptyPage = { items: [], page: { limit: 50, offset: 0, total: null } };
/**
 * Stubs the API of an instance running in private mode.
 *
 * - `/auth/config` reports `private_mode: true` with self-registration off.
 * - `/auth/me` answers 401 until the login route has been hit, then returns
 *   `userFixture`.
 * - `/auth/login` always succeeds and flips the logged-in state.
 * - The manga listing returns `emptyPage`.
 */
async function stubPrivateInstance(page: Page) {
  // Flipped by the login stub; read by the `/auth/me` stub.
  let authenticated = false;

  const jsonReply = (status: number, payload: unknown) => ({
    status,
    contentType: 'application/json',
    body: JSON.stringify(payload)
  });

  // The flag that turns the gate on; the frontend reads it in `+layout.ts`
  // to decide whether to redirect.
  await page.route('**/api/v1/auth/config', (route) =>
    route.fulfill(jsonReply(200, { self_register_enabled: false, private_mode: true }))
  );

  await page.route('**/api/v1/auth/me', (route) =>
    route.fulfill(
      authenticated
        ? jsonReply(200, { user: userFixture })
        : jsonReply(401, {
            error: { code: 'unauthenticated', message: 'unauthenticated' }
          })
    )
  );

  await page.route('**/api/v1/auth/login', (route) => {
    authenticated = true;
    return route.fulfill(jsonReply(200, { user: userFixture }));
  });

  // A real private-mode backend would 401 this for anonymous callers too;
  // it is stubbed as a success so the post-login home page can render
  // without another redirect cycle.
  await page.route('**/api/v1/mangas*', (route) => route.fulfill(jsonReply(200, emptyPage)));
}
// Anonymous visitors to the root must be bounced to the login form, with the
// original destination preserved in `next`.
test('private mode: anonymous visit to / redirects to /login?next=%2F', async ({ page }) => {
  await stubPrivateInstance(page);

  await page.goto('/');

  const loginWithNext = /\/login\?next=%2F$/;
  await expect(page).toHaveURL(loginWithNext);
  const usernameField = page.getByTestId('login-username');
  await expect(usernameField).toBeVisible();
});
// With self-registration reported as disabled, the navbar offers login only.
test('private mode: register link is hidden', async ({ page }) => {
  await stubPrivateInstance(page);
  await page.goto('/login');

  const loginLink = page.getByTestId('nav-login');
  const registerLink = page.getByTestId('nav-register');

  await expect(loginLink).toBeVisible();
  // The stub reports self_register_enabled as false (the effective value
  // in private mode regardless of ALLOW_SELF_REGISTER), so the register
  // affordance must not be rendered at all.
  await expect(registerLink).toHaveCount(0);
});
// Full round trip: bounce to /login, submit credentials, end up authenticated.
test('private mode: after login the user lands back on the requested page', async ({ page }) => {
  await stubPrivateInstance(page);

  // Visiting the root gets redirected to /login with `next` preserved.
  await page.goto('/');
  await expect(page).toHaveURL(/\/login\?next=%2F$/);

  const credentials = { username: 'alice', password: 'hunter2hunter2' };
  await page.getByTestId('login-username').fill(credentials.username);
  await page.getByTestId('login-password').fill(credentials.password);
  await page.getByTestId('login-submit').click();

  // Once authenticated the session widget shows the user's name.
  const sessionUser = page.getByTestId('session-user');
  await expect(sessionUser).toContainText('alice');
});

View File

@@ -1,167 +0,0 @@
import { test, expect, type Page } from '@playwright/test';
// Fixed ids shared by the fixtures, route stubs and assertions below.
const mangaId = '33333333-3333-3333-3333-333333333333';
const chapter1Id = 'c1111111-3333-3333-3333-333333333333';
const chapter2Id = 'c2222222-3333-3333-3333-333333333333';
const chapter3Id = 'c3333333-3333-3333-3333-333333333333';
// Manga payload served by the stubbed manga detail route.
const mangaFixture = {
id: mangaId,
title: 'Vinland Saga',
author: 'Makoto Yukimura',
description: null,
cover_image_path: null,
created_at: '2026-01-01T00:00:00Z',
updated_at: '2026-01-01T00:00:00Z'
};
// Three single-page chapters in ascending number order; chapter 2 has a
// null title so the "Ch. N" label without a title suffix gets exercised.
const chaptersFixture = [
{
id: chapter1Id,
manga_id: mangaId,
number: 1,
title: 'Somewhere, Not Here',
page_count: 1,
created_at: '2026-01-01T00:00:00Z'
},
{
id: chapter2Id,
manga_id: mangaId,
number: 2,
title: null,
page_count: 1,
created_at: '2026-01-02T00:00:00Z'
},
{
id: chapter3Id,
manga_id: mangaId,
number: 3,
title: 'Sword Dance',
page_count: 1,
created_at: '2026-01-03T00:00:00Z'
}
];
/**
 * Builds the page list for a chapter: a single PNG page whose id and storage
 * key are derived from `chapterId`.
 */
function pageFixture(chapterId: string) {
  const idFragment = chapterId.slice(1, 8);
  const onlyPage = {
    id: `p1111111-${idFragment}-3333-3333-333333333333`,
    chapter_id: chapterId,
    page_number: 1,
    storage_key: `mangas/${mangaId}/chapters/${chapterId}/pages/0001.png`,
    content_type: 'image/png'
  };
  return [onlyPage];
}
/**
 * Stubs every API route the reader page touches for an anonymous visitor:
 * public-mode auth config, 401 for the session / preferences / bookmarks
 * endpoints, the manga and chapter fixtures, one page per chapter, and a
 * tiny PNG for any file request.
 */
async function mockReaderApis(page: Page) {
  const jsonReply = (status: number, payload: unknown) => ({
    status,
    contentType: 'application/json',
    body: JSON.stringify(payload)
  });
  // Fresh options object per response, mirroring one literal per route.
  const unauthenticated = () =>
    jsonReply(401, { error: { code: 'unauthenticated', message: '' } });

  // Force public mode so the layout does not bounce anonymous visitors to
  // /login when the dev backend runs with PRIVATE_MODE=true (the layout's
  // universal load respects that flag).
  await page.route('**/api/v1/auth/config', (route) =>
    route.fulfill(jsonReply(200, { self_register_enabled: true, private_mode: false }))
  );

  // Endpoints that require a session all answer 401.
  const sessionOnlyRoutes = [
    '**/api/v1/auth/me',
    '**/api/v1/auth/me/preferences',
    '**/api/v1/me/bookmarks*'
  ];
  for (const pattern of sessionOnlyRoutes) {
    await page.route(pattern, (route) => route.fulfill(unauthenticated()));
  }

  await page.route(`**/api/v1/mangas/${mangaId}`, (route) =>
    route.fulfill(jsonReply(200, mangaFixture))
  );

  const chapterListUrl = new RegExp(`/api/v1/mangas/${mangaId}/chapters(\\?.*)?$`);
  await page.route(chapterListUrl, (route) =>
    route.fulfill(
      jsonReply(200, {
        items: chaptersFixture,
        page: { limit: 200, offset: 0, total: chaptersFixture.length }
      })
    )
  );

  // Per-chapter detail and page-list routes.
  for (const chapter of chaptersFixture) {
    const chapterUrl = `**/api/v1/mangas/${mangaId}/chapters/${chapter.id}`;
    await page.route(chapterUrl, (route) => route.fulfill(jsonReply(200, chapter)));
    await page.route(`**/api/v1/mangas/${mangaId}/chapters/${chapter.id}/pages`, (route) =>
      route.fulfill(jsonReply(200, { pages: pageFixture(chapter.id) }))
    );
  }

  // Hex-encoded PNG bytes served for every file request.
  const pngBytes = Buffer.from(
    '89504e470d0a1a0a0000000d49484452000000010000000108060000001f15c4890000000d49444154789c63000100000005000158a3b62a0000000049454e44ae426082',
    'hex'
  );
  await page.route('**/api/v1/files/**', (route) =>
    route.fulfill({ status: 200, contentType: 'image/png', body: pngBytes })
  );
}
// The reader's chapter dropdown must list all chapters, preselect the current
// one and label each entry "Ch. N — Title" (or just "Ch. N" without a title).
test('reader chapter select lists every chapter with the manga-detail-style label', async ({
  page
}) => {
  await mockReaderApis(page);
  await page.goto(`/manga/${mangaId}/chapter/${chapter2Id}`);

  const chapterSelect = page.getByTestId('reader-chapter-select');
  await expect(chapterSelect).toBeVisible();
  // The chapter being read is the selected option.
  await expect(chapterSelect).toHaveValue(chapter2Id);

  // Options appear in ascending chapter-number order.
  const rawLabels = await chapterSelect.locator('option').allTextContents();
  const trimmedLabels = rawLabels.map((label) => label.trim());
  expect(trimmedLabels).toEqual([
    'Ch. 1 — Somewhere, Not Here',
    'Ch. 2',
    'Ch. 3 — Sword Dance'
  ]);
});
// Picking another chapter in the dropdown must navigate to it and keep the
// dropdown in sync with the new URL.
test('choosing a chapter from the select navigates to that chapter', async ({ page }) => {
  await mockReaderApis(page);
  await page.goto(`/manga/${mangaId}/chapter/${chapter1Id}`);

  const chapterSelect = page.getByTestId('reader-chapter-select');
  await expect(chapterSelect).toHaveValue(chapter1Id);

  await chapterSelect.selectOption(chapter3Id);

  const chapter3Url = new RegExp(`/manga/${mangaId}/chapter/${chapter3Id}$`);
  await expect(page).toHaveURL(chapter3Url);
  await expect(chapterSelect).toHaveValue(chapter3Id);
});

View File

@@ -120,7 +120,7 @@ test('manga overview shows title, cover, and a chapter list', async ({ page }) =
await expect(page.getByTestId('manga-title')).toHaveText('Berserk');
await expect(page.getByTestId('manga-author')).toContainText('Kentaro Miura');
await expect(page.getByTestId('manga-cover')).toBeVisible();
await expect(page.getByTestId('chapter-list')).toContainText('The Brand');
await expect(page.getByTestId('chapter-list')).toContainText('Chapter 1');
await expect(page.getByTestId('bookmark-signin')).toBeVisible();
});

View File

@@ -1,6 +1,6 @@
{
"name": "mangalord-frontend",
"version": "0.54.0",
"version": "0.35.0",
"private": true,
"type": "module",
"scripts": {

View File

@@ -118,77 +118,4 @@ describe('hooks.server proxy', () => {
expect(body.error.code).toBe('upstream_unavailable');
expect(errSpy).toHaveBeenCalled();
});
it('strips every hop-by-hop header listed in RFC 7230 §6.1', async () => {
  // Defence in depth: axum doesn't emit these, but a future middleware
  // that did would otherwise leak per-connection state across the proxy.
  fetchSpy.mockResolvedValueOnce(new Response('[]', { status: 200 }));

  // Incoming headers that the proxy is expected to drop.
  const mustBeStripped = {
    host: 'app.example.com',
    'content-length': '0',
    connection: 'keep-alive',
    'keep-alive': 'timeout=5',
    'proxy-authenticate': 'Basic realm=x',
    'proxy-authorization': 'Basic xyz',
    te: 'trailers',
    trailer: 'Expires',
    'transfer-encoding': 'chunked',
    upgrade: 'websocket'
  };

  await handle({
    event: makeEvent('/api/v1/health', {
      headers: {
        ...mustBeStripped,
        // A non-hop-by-hop header, to prove unrelated headers survive.
        'x-custom': 'pass-through'
      }
    }),
    resolve: vi.fn()
  });

  const upstreamInit = fetchSpy.mock.calls[0][1] as RequestInit;
  const forwarded = upstreamInit.headers as Headers;
  for (const name of Object.keys(mustBeStripped)) {
    expect(forwarded.get(name), `${name} should be stripped`).toBeNull();
  }
  expect(forwarded.get('x-custom')).toBe('pass-through');
});
it('aborts and returns 502 when the upstream stalls past the timeout', async () => {
  const errorLog = vi.spyOn(console, 'error').mockImplementation(() => {});

  // Stand in for a timed-out request: the mocked fetch rejects with a
  // DOMException named 'AbortError'. The handler must map it to the same
  // upstream_unavailable 502 it uses for other network failures.
  fetchSpy.mockRejectedValueOnce(new DOMException('aborted', 'AbortError'));

  const response = await handle({ event: makeEvent('/api/v1/slow'), resolve: vi.fn() });

  expect(response.status).toBe(502);
  const { error } = await response.json();
  expect(error.code).toBe('upstream_unavailable');
  expect(errorLog).toHaveBeenCalled();
});
it('attaches an AbortSignal to the upstream fetch so it can time out', async () => {
  fetchSpy.mockResolvedValueOnce(new Response('[]', { status: 200 }));

  await handle({ event: makeEvent('/api/v1/health'), resolve: vi.fn() });

  const { signal } = fetchSpy.mock.calls[0][1] as RequestInit;
  // The contract pinned here is that a signal is passed at all ...
  expect(signal).toBeInstanceOf(AbortSignal);
  // ... and that it has not fired, because the handler returned in time.
  expect(signal?.aborted).toBe(false);
});
});

View File

@@ -12,66 +12,20 @@ import type { Handle } from '@sveltejs/kit';
const BACKEND_URL = process.env.BACKEND_URL ?? 'http://localhost:8080';
/**
 * Request headers the proxy removes before forwarding to the backend.
 *
 * These are the hop-by-hop headers per RFC 7230 §6.1, which are scoped to a
 * single transport-level connection and must not be forwarded by a proxy,
 * plus two more:
 * - `host` would mislead the backend about its origin;
 * - `content-length` is recomputed by the upstream fetch from the body
 *   stream.
 *
 * Typed `readonly` so the strip list cannot be mutated by accident.
 */
const HOP_BY_HOP_HEADERS: readonly string[] = [
  'host',
  'content-length',
  'connection',
  'keep-alive',
  'proxy-authenticate',
  'proxy-authorization',
  'te',
  'trailer',
  'transfer-encoding',
  'upgrade'
];
/**
 * Wall-clock budget, in milliseconds, for a single proxied request.
 *
 * Defaults to 300 000 ms (5 minutes) and can be overridden through the
 * `BACKEND_PROXY_TIMEOUT_MS` environment variable. An unset, empty,
 * non-numeric, non-finite or non-positive value falls back to the default.
 *
 * The limit turns a wedged backend (stuck query, deadlock, ...) into a 502
 * instead of a browser request that hangs indefinitely. The default is
 * deliberately generous so large chapter uploads on ordinary broadband
 * finish in time. Operators serving very slow clients should raise it;
 * those behind a stricter upstream proxy may lower it. It is a total
 * budget, not an idle timeout that resets per chunk.
 */
const PROXY_TIMEOUT_MS = (() => {
  const fallbackMs = 300_000;
  const configured = process.env.BACKEND_PROXY_TIMEOUT_MS;
  if (!configured) return fallbackMs;
  const parsed = Number(configured);
  const usable = Number.isFinite(parsed) && parsed > 0;
  return usable ? parsed : fallbackMs;
})();
export const handle: Handle = async ({ event, resolve }) => {
if (event.url.pathname.startsWith('/api/')) {
const target = `${BACKEND_URL}${event.url.pathname}${event.url.search}`;
// Strip hop-by-hop headers — `host` would mislead the backend
// about the origin, and `content-length` will be recomputed.
const headers = new Headers(event.request.headers);
for (const h of HOP_BY_HOP_HEADERS) headers.delete(h);
// AbortController times the upstream fetch out so a backend
// wedged on a slow DB query doesn't keep the browser request
// hanging forever. The `signal` is also wired into the
// RequestInit so the body stream is cancelled cleanly.
const ctrl = new AbortController();
const timeoutHandle = setTimeout(() => ctrl.abort(), PROXY_TIMEOUT_MS);
headers.delete('host');
headers.delete('content-length');
const init: RequestInit & { duplex?: 'half' } = {
method: event.request.method,
headers,
redirect: 'manual',
signal: ctrl.signal
redirect: 'manual'
};
if (event.request.method !== 'GET' && event.request.method !== 'HEAD') {
init.body = event.request.body;
@@ -85,13 +39,11 @@ export const handle: Handle = async ({ event, resolve }) => {
upstream = await fetch(target, init);
} catch (e) {
// Network-layer failure (DNS / connection refused / TLS
// handshake / abort by timeout) — most commonly "backend
// container restarting". SvelteKit's default 500 would be
// an HTML page that client.ts can't .json(), which masks
// the real cause. Emit the standard envelope with a
// dedicated code instead.
// handshake) — most commonly "backend container restarting".
// SvelteKit's default 500 would be an HTML page that
// client.ts can't .json(), which masks the real cause. Emit
// the standard envelope with a dedicated code instead.
console.error('Proxy to backend failed:', e);
clearTimeout(timeoutHandle);
return new Response(
JSON.stringify({
error: {
@@ -106,7 +58,6 @@ export const handle: Handle = async ({ event, resolve }) => {
);
}
clearTimeout(timeoutHandle);
return new Response(upstream.body, {
status: upstream.status,
statusText: upstream.statusText,

View File

@@ -1,464 +0,0 @@
import {
describe,
it,
expect,
vi,
beforeEach,
afterEach,
type MockInstance
} from 'vitest';
import {
listAdminUsers,
deleteAdminUser,
setUserAdmin,
createAdminUser,
listAdminMangas,
listAdminChapters,
getSystemStats,
resyncManga,
resyncChapter,
getCrawlerStatus,
crawlerStatusStreamUrl,
runCrawlerPass,
restartCrawlerBrowser,
updateCrawlerSession,
clearCrawlerSessionExpired,
listDeadJobs,
requeueDeadJobs,
listActiveJobs,
listMissingCovers
} from './admin';
/**
 * Builds a JSON `Response` carrying `body`, with status 200 unless another
 * status is given.
 */
function ok(body: unknown, status = 200): Response {
  const headers = { 'content-type': 'application/json' };
  const payload = JSON.stringify(body);
  return new Response(payload, { status, headers });
}
/** Builds an empty `204 No Content` response. */
function noContent(): Response {
  const init: ResponseInit = { status: 204 };
  return new Response(null, init);
}
/**
 * Builds a JSON error response with the given status, wrapping `code` and
 * `message` as `{ error: { code, message } }`.
 */
function envelope(status: number, code: string, message: string): Response {
  const payload = { error: { code, message } };
  const headers = { 'content-type': 'application/json' };
  return new Response(JSON.stringify(payload), { status, headers });
}
// Non-admin user payload reused by the user-management tests.
const userFixture = {
id: 'u-1',
username: 'alice',
created_at: '2026-01-01T00:00:00Z',
is_admin: false
};
// Manga row used by the admin manga-listing test; `as const` keeps
// `sync_state` as a literal type.
const mangaFixture = {
id: 'm-1',
title: 'Test',
status: 'ongoing',
cover_image_path: null,
created_at: '2026-01-01T00:00:00Z',
updated_at: '2026-01-01T00:00:00Z',
sync_state: 'synced' as const,
chapter_count: 3,
latest_seen_at: '2026-01-02T00:00:00Z'
};
// System stats payload with the four top-level keys the stats tests read:
// disk, memory, cpu and alerts.
const systemFixture = {
disk: {
total_bytes: 1_000_000,
used_bytes: 500_000,
free_bytes: 500_000,
percent_used: 50.0
},
memory: { total_bytes: 8_000_000, used_bytes: 4_000_000, percent_used: 50.0 },
cpu: { percent_used: 12.3 },
alerts: []
};
describe('admin api client', () => {
// Spy on the global `fetch`, re-created before every test so each test can
// queue its own canned response and inspect the recorded calls.
let fetchSpy: MockInstance<typeof globalThis.fetch>;
beforeEach(() => {
fetchSpy = vi.spyOn(globalThis, 'fetch');
});
// Restore the original `fetch` (and any other mocks) after each test.
afterEach(() => {
vi.restoreAllMocks();
});
// ---- users ----
it('listAdminUsers GETs /v1/admin/users and parses the paged envelope', async () => {
  const cannedBody = { items: [userFixture], page: { limit: 50, offset: 0, total: 1 } };
  fetchSpy.mockResolvedValueOnce(ok(cannedBody));

  const result = await listAdminUsers({ limit: 50 });

  // The parsed page carries the single user and the total.
  expect(result.items).toHaveLength(1);
  expect(result.items[0]).toEqual(userFixture);
  expect(result.page.total).toBe(1);
  // The limit is forwarded as the only query parameter.
  const requestedUrl = fetchSpy.mock.calls[0][0] as string;
  expect(requestedUrl).toMatch(/\/v1\/admin\/users\?limit=50$/);
});
it('listAdminUsers forwards search + offset query params', async () => {
  const emptyResult = { items: [], page: { limit: 50, offset: 10, total: 0 } };
  fetchSpy.mockResolvedValueOnce(ok(emptyResult));

  await listAdminUsers({ search: 'al', offset: 10 });

  const requestedUrl = fetchSpy.mock.calls[0][0] as string;
  for (const fragment of ['search=al', 'offset=10']) {
    expect(requestedUrl).toContain(fragment);
  }
});
it('listAdminUsers surfaces 403 forbidden via ApiError.code', async () => {
  const forbidden = envelope(403, 'forbidden', 'forbidden');
  fetchSpy.mockResolvedValueOnce(forbidden);

  // The rejection must expose both the HTTP status and the envelope code.
  const expectedError = { status: 403, code: 'forbidden' };
  await expect(listAdminUsers()).rejects.toMatchObject(expectedError);
});
it('deleteAdminUser DELETEs to /v1/admin/users/{id} and handles 204', async () => {
  fetchSpy.mockResolvedValueOnce(noContent());

  // A bodiless 204 must resolve to undefined instead of failing to parse.
  await expect(deleteAdminUser('u-1')).resolves.toBeUndefined();

  const recordedCall = fetchSpy.mock.calls[0];
  const requestedUrl = recordedCall[0] as string;
  expect(requestedUrl).toMatch(/\/v1\/admin\/users\/u-1$/);
  const requestInit = recordedCall[1] as RequestInit;
  expect(requestInit.method).toBe('DELETE');
});
it('deleteAdminUser surfaces 409 conflict (self-delete / last-admin)', async () => {
  const conflict = envelope(409, 'conflict', 'cannot delete yourself; ask another admin');
  fetchSpy.mockResolvedValueOnce(conflict);

  const expectedError = { status: 409, code: 'conflict' };
  await expect(deleteAdminUser('u-1')).rejects.toMatchObject(expectedError);
});
it('createAdminUser POSTs to /v1/admin/users with body and returns the created user', async () => {
  const createdUser = { ...userFixture, username: 'invited01' };
  fetchSpy.mockResolvedValueOnce(ok(createdUser, 201));

  const credentials = { username: 'invited01', password: 'freshpass1234' };
  // Pass a copy so the expectation below compares against untouched data.
  const returned = await createAdminUser({ ...credentials });

  expect(returned).toEqual(createdUser);
  const recordedCall = fetchSpy.mock.calls[0];
  expect(recordedCall[0] as string).toMatch(/\/v1\/admin\/users$/);
  const requestInit = recordedCall[1] as RequestInit;
  expect(requestInit.method).toBe('POST');
  const sentBody = JSON.parse(requestInit.body as string);
  expect(sentBody).toEqual(credentials);
});
it('createAdminUser forwards is_admin when provided', async () => {
  const createdAdmin = { ...userFixture, username: 'coadmin', is_admin: true };
  fetchSpy.mockResolvedValueOnce(ok(createdAdmin, 201));

  const request = { username: 'coadmin', password: 'freshpass1234', is_admin: true };
  // Pass a copy so the expectation below compares against untouched data.
  await createAdminUser({ ...request });

  const requestInit = fetchSpy.mock.calls[0][1] as RequestInit;
  const sentBody = JSON.parse(requestInit.body as string);
  expect(sentBody).toEqual(request);
});
it('createAdminUser surfaces 409 conflict on duplicate username', async () => {
  const conflict = envelope(409, 'conflict', 'username is already taken');
  fetchSpy.mockResolvedValueOnce(conflict);

  const attempt = createAdminUser({ username: 'taken', password: 'freshpass1234' });

  await expect(attempt).rejects.toMatchObject({ status: 409, code: 'conflict' });
});
it('setUserAdmin PATCHes is_admin and returns the updated user', async () => {
  const promotedUser = { ...userFixture, is_admin: true };
  fetchSpy.mockResolvedValueOnce(ok(promotedUser));

  const returned = await setUserAdmin('u-1', true);

  expect(returned).toEqual(promotedUser);
  const requestInit = fetchSpy.mock.calls[0][1] as RequestInit;
  expect(requestInit.method).toBe('PATCH');
  // Only the flag travels in the request body.
  const sentBody = JSON.parse(requestInit.body as string);
  expect(sentBody).toEqual({ is_admin: true });
});
// ---- mangas + chapters ----
it('listAdminMangas GETs /v1/admin/mangas and forwards sync_state filter', async () => {
  const cannedBody = { items: [mangaFixture], page: { limit: 100, offset: 0, total: 1 } };
  fetchSpy.mockResolvedValueOnce(ok(cannedBody));

  const result = await listAdminMangas({ syncState: 'in_progress', limit: 100 });

  // The canned item is passed through as-is (its state is the fixture's).
  expect(result.items[0].sync_state).toBe('synced');
  // The camelCase option is sent as the snake_case query parameter.
  const requestedUrl = fetchSpy.mock.calls[0][0] as string;
  for (const fragment of ['sync_state=in_progress', 'limit=100']) {
    expect(requestedUrl).toContain(fragment);
  }
});
it('listAdminChapters GETs the nested chapter route and parses the paged envelope', async () => {
  const chapterRow = {
    id: 'c-1',
    manga_id: 'm-1',
    number: 1,
    title: null,
    page_count: 12,
    created_at: '2026-01-01T00:00:00Z',
    sync_state: 'synced' as const,
    latest_seen_at: null
  };
  const cannedBody = { items: [chapterRow], page: { limit: 200, offset: 0, total: 1 } };
  fetchSpy.mockResolvedValueOnce(ok(cannedBody));

  const result = await listAdminChapters('m-1');

  expect(result.items).toEqual([chapterRow]);
  expect(result.page.total).toBe(1);
  // Without options no query string is appended.
  const requestedUrl = fetchSpy.mock.calls[0][0] as string;
  expect(requestedUrl).toMatch(/\/v1\/admin\/mangas\/m-1\/chapters$/);
});
it('listAdminChapters forwards limit + offset query params', async () => {
  const emptyResult = { items: [], page: { limit: 50, offset: 100, total: 0 } };
  fetchSpy.mockResolvedValueOnce(ok(emptyResult));

  await listAdminChapters('m-1', { limit: 50, offset: 100 });

  const requestedUrl = fetchSpy.mock.calls[0][0] as string;
  for (const fragment of ['limit=50', 'offset=100']) {
    expect(requestedUrl).toContain(fragment);
  }
});
// ---- system ----
it('getSystemStats GETs /v1/admin/system and parses the four-key envelope', async () => {
	fetchSpy.mockResolvedValueOnce(ok(systemFixture));

	const { disk, memory, cpu, alerts } = await getSystemStats();

	// Each of the four top-level keys is read back from the fixture.
	expect(disk?.percent_used).toBe(50);
	expect(memory.percent_used).toBe(50);
	expect(cpu.percent_used).toBe(12.3);
	expect(alerts).toEqual([]);

	const requested = fetchSpy.mock.calls[0][0] as string;
	expect(requested).toMatch(/\/v1\/admin\/system$/);
});
it('getSystemStats keeps disk null when backend reports a non-local store', async () => {
	// Same fixture, but with the disk section nulled out.
	const withoutDisk = { ...systemFixture, disk: null };
	fetchSpy.mockResolvedValueOnce(ok(withoutDisk));

	const { disk } = await getSystemStats();

	expect(disk).toBeNull();
});
// ---- force resync ----
it('resyncManga POSTs to /v1/admin/mangas/{id}/resync and returns the envelope', async () => {
	const manga = {
		id: 'm-1',
		title: 'T',
		status: 'ongoing',
		alt_titles: [],
		description: null,
		cover_image_path: 'mangas/m-1/cover.jpg',
		created_at: '2026-01-01T00:00:00Z',
		updated_at: '2026-01-02T00:00:00Z',
		authors: [],
		genres: [],
		tags: []
	};
	const envelopeBody = { manga, metadata_status: 'updated', cover_fetched: true };
	fetchSpy.mockResolvedValueOnce(ok(envelopeBody));

	const result = await resyncManga('m-1');

	// All three envelope members come back as sent.
	expect(result.metadata_status).toBe('updated');
	expect(result.cover_fetched).toBe(true);
	expect(result.manga.id).toBe('m-1');

	// Route and verb of the outgoing request.
	const requested = fetchSpy.mock.calls[0][0] as string;
	expect(requested).toMatch(/\/v1\/admin\/mangas\/m-1\/resync$/);
	const sentInit = fetchSpy.mock.calls[0][1] as RequestInit;
	expect(sentInit.method).toBe('POST');
});
it('resyncManga surfaces 503 service_unavailable when the daemon is off', async () => {
	const unavailable = envelope(503, 'service_unavailable', 'crawler daemon is disabled');
	fetchSpy.mockResolvedValueOnce(unavailable);

	const attempt = resyncManga('m-1');

	// Status and envelope code must both reach the caller.
	await expect(attempt).rejects.toMatchObject({
		status: 503,
		code: 'service_unavailable'
	});
});
it('resyncChapter POSTs to /v1/admin/chapters/{id}/resync and returns the envelope', async () => {
	const chapter = {
		id: 'c-1',
		manga_id: 'm-1',
		number: 1,
		title: 'Foo',
		page_count: 7,
		created_at: '2026-01-01T00:00:00Z'
	};
	fetchSpy.mockResolvedValueOnce(ok({ chapter, outcome: 'fetched', pages: 7 }));

	const result = await resyncChapter('c-1');

	expect(result.outcome).toBe('fetched');
	expect(result.pages).toBe(7);
	expect(result.chapter.page_count).toBe(7);

	// Route and verb of the outgoing request.
	const requested = fetchSpy.mock.calls[0][0] as string;
	expect(requested).toMatch(/\/v1\/admin\/chapters\/c-1\/resync$/);
	const sentInit = fetchSpy.mock.calls[0][1] as RequestInit;
	expect(sentInit.method).toBe('POST');
});
it('resyncChapter handles the "skipped" outcome envelope', async () => {
	const chapter = {
		id: 'c-1',
		manga_id: 'm-1',
		number: 1,
		title: null,
		page_count: 7,
		created_at: '2026-01-01T00:00:00Z'
	};
	// A skipped resync carries no page count.
	fetchSpy.mockResolvedValueOnce(ok({ chapter, outcome: 'skipped', pages: null }));

	const result = await resyncChapter('c-1');

	expect(result.outcome).toBe('skipped');
	expect(result.pages).toBeNull();
});
});
// Specs for the crawler observability/control helpers of the admin client.
// `fetch` is replaced by a spy per test; each spec feeds one canned response
// and then inspects the URL (first fetch argument) and the init object
// (second fetch argument) the helper produced.
describe('admin crawler api client', () => {
	let fetchSpy: MockInstance<typeof globalThis.fetch>;

	beforeEach(() => {
		fetchSpy = vi.spyOn(globalThis, 'fetch');
	});

	afterEach(() => {
		vi.restoreAllMocks();
	});

	const statusFixture = {
		daemon: 'running',
		phase: { state: 'fetching_metadata', index: 3, total: 10, title: 'One Piece' },
		worker_count: 2,
		active_chapters: [
			{
				manga_id: 'm-1',
				manga_title: 'Bleach',
				chapter_id: 'c-1',
				chapter_number: 12,
				pages_done: 4,
				pages_total: 20
			}
		],
		current_cover: { manga_id: 'm-2', manga_title: 'Naruto' },
		covers_queued: 7,
		last_pass: { at: null, discovered: 0, upserted: 0, covers_fetched: 0, mangas_failed: 0 },
		session: { expired: false, configured: true },
		browser: 'healthy',
		queue: { pending: 2, running: 1, dead: 4 }
	};

	it('crawlerStatusStreamUrl points at the SSE endpoint under the API base', () => {
		expect(crawlerStatusStreamUrl()).toMatch(/\/v1\/admin\/crawler\/stream$/);
	});

	it('getCrawlerStatus GETs /v1/admin/crawler with live chapter/cover fields', async () => {
		fetchSpy.mockResolvedValueOnce(ok(statusFixture));
		const s = await getCrawlerStatus();
		expect(s.queue.dead).toBe(4);
		expect(s.phase?.state).toBe('fetching_metadata');
		expect(s.active_chapters[0].pages_done).toBe(4);
		expect(s.active_chapters[0].pages_total).toBe(20);
		expect(s.current_cover?.manga_title).toBe('Naruto');
		expect(s.covers_queued).toBe(7);
		const url = fetchSpy.mock.calls[0][0] as string;
		expect(url).toMatch(/\/v1\/admin\/crawler$/);
	});

	it('listActiveJobs GETs /v1/admin/crawler/active-jobs with search', async () => {
		fetchSpy.mockResolvedValueOnce(
			ok({ items: [], page: { limit: 20, offset: 0, total: 0 } })
		);
		await listActiveJobs({ search: 'bleach' });
		const url = fetchSpy.mock.calls[0][0] as string;
		expect(url).toMatch(/\/v1\/admin\/crawler\/active-jobs\?/);
		expect(url).toContain('search=bleach');
	});

	it('listMissingCovers GETs /v1/admin/crawler/covers', async () => {
		fetchSpy.mockResolvedValueOnce(
			ok({ items: [{ manga_id: 'm-1', manga_title: 'X' }], page: { limit: 20, offset: 0, total: 1 } })
		);
		const r = await listMissingCovers();
		expect(r.items[0].manga_title).toBe('X');
		// No options were passed, so the URL ends right after the route.
		expect(fetchSpy.mock.calls[0][0]).toMatch(/\/v1\/admin\/crawler\/covers$/);
	});

	it('runCrawlerPass POSTs /v1/admin/crawler/run', async () => {
		fetchSpy.mockResolvedValueOnce(ok({ started: true }));
		const r = await runCrawlerPass();
		expect(r.started).toBe(true);
		const init = fetchSpy.mock.calls[0][1] as RequestInit;
		expect(init.method).toBe('POST');
		expect(fetchSpy.mock.calls[0][0]).toMatch(/\/v1\/admin\/crawler\/run$/);
	});

	it('restartCrawlerBrowser POSTs the restart endpoint', async () => {
		fetchSpy.mockResolvedValueOnce(ok({ ok: true, error: null }));
		const r = await restartCrawlerBrowser();
		expect(r.ok).toBe(true);
		expect(fetchSpy.mock.calls[0][0]).toMatch(/\/v1\/admin\/crawler\/browser\/restart$/);
		// The title promises a POST, so pin the verb as well as the route.
		const init = fetchSpy.mock.calls[0][1] as RequestInit;
		expect(init.method).toBe('POST');
	});

	it('updateCrawlerSession POSTs the phpsessid body', async () => {
		fetchSpy.mockResolvedValueOnce(ok({ valid: true, error: null }));
		const r = await updateCrawlerSession('abc123');
		expect(r.valid).toBe(true);
		// Pin the route too, so the body cannot silently go to another endpoint.
		expect(fetchSpy.mock.calls[0][0]).toMatch(/\/v1\/admin\/crawler\/session$/);
		const init = fetchSpy.mock.calls[0][1] as RequestInit;
		expect(init.method).toBe('POST');
		expect(JSON.parse(init.body as string)).toEqual({ phpsessid: 'abc123' });
	});

	it('clearCrawlerSessionExpired POSTs clear-expired', async () => {
		fetchSpy.mockResolvedValueOnce(ok({ cleared: true }));
		const r = await clearCrawlerSessionExpired();
		expect(r.cleared).toBe(true);
		expect(fetchSpy.mock.calls[0][0]).toMatch(/\/v1\/admin\/crawler\/session\/clear-expired$/);
		// The title promises a POST, so pin the verb as well as the route.
		const init = fetchSpy.mock.calls[0][1] as RequestInit;
		expect(init.method).toBe('POST');
	});

	it('listDeadJobs forwards search + pagination', async () => {
		fetchSpy.mockResolvedValueOnce(
			ok({ items: [], page: { limit: 20, offset: 20, total: 0 } })
		);
		await listDeadJobs({ search: 'naruto', limit: 20, offset: 20 });
		const url = fetchSpy.mock.calls[0][0] as string;
		expect(url).toMatch(/\/v1\/admin\/crawler\/dead-jobs\?/);
		expect(url).toContain('search=naruto');
		// `limit` is part of the pagination the title refers to; check it too.
		expect(url).toContain('limit=20');
		expect(url).toContain('offset=20');
	});

	it('requeueDeadJobs POSTs the scope body', async () => {
		fetchSpy.mockResolvedValueOnce(ok({ requeued: 3 }));
		const r = await requeueDeadJobs({ scope: 'manga', manga_id: 'm-9' });
		expect(r.requeued).toBe(3);
		expect(fetchSpy.mock.calls[0][0]).toMatch(/\/v1\/admin\/crawler\/dead-jobs\/requeue$/);
		const init = fetchSpy.mock.calls[0][1] as RequestInit;
		// The title promises a POST; the verb was previously unchecked.
		expect(init.method).toBe('POST');
		expect(JSON.parse(init.body as string)).toEqual({ scope: 'manga', manga_id: 'm-9' });
	});

	it('surfaces a 503 as ApiError', async () => {
		fetchSpy.mockResolvedValueOnce(envelope(503, 'service_unavailable', 'disabled'));
		// Check the envelope code as well as the status, as the resync 503 spec does.
		await expect(runCrawlerPass()).rejects.toMatchObject({
			status: 503,
			code: 'service_unavailable'
		});
	});
});

View File

@@ -1,385 +0,0 @@
// Admin-only API client. Every endpoint here is guarded by
// RequireAdmin on the backend (session cookie only — bearer tokens
// won't reach these routes). 403s thrown here propagate up to the
// /admin layout, which renders the framework error page.
import { request, apiUrl, type Page } from './client';
import type { User } from './auth';
import type { MangaDetail } from './mangas';
import type { Chapter } from './chapters';
// ---- users -----------------------------------------------------------------
/** One page of users, as resolved by `listAdminUsers`. */
export type AdminUsersPage = {
items: User[];
page: Page;
};
/** Optional filters for `listAdminUsers`; fields left unset are not added
 * to the query string. */
export type ListAdminUsersOptions = {
search?: string;
limit?: number;
offset?: number;
};
/**
 * GET /v1/admin/users — fetches one page of users.
 *
 * `search` is forwarded only when non-empty; `limit` and `offset` are
 * forwarded whenever they are not null/undefined (so `0` is still sent).
 * The `?` is appended only when at least one parameter was set.
 */
export async function listAdminUsers(
	opts: ListAdminUsersOptions = {}
): Promise<AdminUsersPage> {
	const { search, limit, offset } = opts;
	const query = new URLSearchParams();
	if (search) query.set('search', search);
	if (limit != null) query.set('limit', String(limit));
	if (offset != null) query.set('offset', String(offset));

	const encoded = query.toString();
	const suffix = encoded ? `?${encoded}` : '';
	return request<AdminUsersPage>(`/v1/admin/users${suffix}`);
}
/** DELETE /v1/admin/users/:id — the id is URL-encoded before being placed
 * in the path. Resolves with no value. */
export async function deleteAdminUser(id: string): Promise<void> {
	const path = `/v1/admin/users/${encodeURIComponent(id)}`;
	await request<void>(path, { method: 'DELETE' });
}
/** PATCH /v1/admin/users/:id — sends `{ is_admin }` as a JSON body and
 * resolves with the user returned by the backend. */
export async function setUserAdmin(id: string, isAdmin: boolean): Promise<User> {
	const path = `/v1/admin/users/${encodeURIComponent(id)}`;
	const payload = JSON.stringify({ is_admin: isAdmin });
	return request<User>(path, {
		method: 'PATCH',
		headers: { 'content-type': 'application/json' },
		body: payload
	});
}
/** Request body for `createAdminUser`; it is serialized as-is with
 * `JSON.stringify`. `is_admin` may be omitted. */
export type CreateAdminUserInput = {
username: string;
password: string;
is_admin?: boolean;
};
/**
 * POST /v1/admin/users — account creation initiated by an admin.
 *
 * Independent of the ALLOW_SELF_REGISTER toggle: this route exists so an
 * admin can enroll someone while self-registration is switched off.
 * The input object is sent unchanged as the JSON body.
 */
export async function createAdminUser(input: CreateAdminUserInput): Promise<User> {
	const payload = JSON.stringify(input);
	return request<User>('/v1/admin/users', {
		method: 'POST',
		headers: { 'content-type': 'application/json' },
		body: payload
	});
}
// ---- mangas / chapters with sync state -------------------------------------
/** Sync states a manga row can report. */
export type MangaSyncState = 'in_progress' | 'dropped' | 'synced';
/** One manga entry of an `AdminMangasPage`, including its sync state. */
export type AdminMangaRow = {
id: string;
title: string;
status: string;
cover_image_path: string | null;
created_at: string;
updated_at: string;
sync_state: MangaSyncState;
chapter_count: number;
latest_seen_at: string | null;
};
/** One page of manga rows, as resolved by `listAdminMangas`. */
export type AdminMangasPage = {
items: AdminMangaRow[];
page: Page;
};
/** Optional filters for `listAdminMangas`. `syncState` is sent as the
 * `sync_state` query parameter. */
export type ListAdminMangasOptions = {
search?: string;
syncState?: MangaSyncState;
limit?: number;
offset?: number;
};
/**
 * GET /v1/admin/mangas — fetches one page of manga rows with sync state.
 *
 * Query parameters are added in the order search, sync_state, limit,
 * offset. `search` and `syncState` are skipped when empty; `limit` and
 * `offset` are skipped only when null/undefined.
 */
export async function listAdminMangas(
	opts: ListAdminMangasOptions = {}
): Promise<AdminMangasPage> {
	const { search, syncState, limit, offset } = opts;
	const query = new URLSearchParams();
	if (search) query.set('search', search);
	if (syncState) query.set('sync_state', syncState);
	if (limit != null) query.set('limit', String(limit));
	if (offset != null) query.set('offset', String(offset));

	const encoded = query.toString();
	const suffix = encoded ? `?${encoded}` : '';
	return request<AdminMangasPage>(`/v1/admin/mangas${suffix}`);
}
/** Sync states a chapter row can report. */
export type ChapterSyncState =
| 'downloading'
| 'dropped'
| 'failed'
| 'not_downloaded'
| 'synced';
/** One chapter entry of an `AdminChaptersPage`, including its sync state. */
export type AdminChapterRow = {
id: string;
manga_id: string;
number: number;
title: string | null;
page_count: number;
created_at: string;
sync_state: ChapterSyncState;
latest_seen_at: string | null;
};
/** One page of chapter rows, as resolved by `listAdminChapters`. */
export type AdminChaptersPage = {
items: AdminChapterRow[];
page: Page;
};
/** Optional paging for `listAdminChapters`; unset fields are not sent. */
export type ListAdminChaptersOptions = {
limit?: number;
offset?: number;
};
/**
 * GET /v1/admin/mangas/:id/chapters — fetches one page of a manga's
 * chapter rows.
 *
 * The manga id is URL-encoded into the path. `limit` and `offset` are
 * forwarded whenever they are not null/undefined.
 */
export async function listAdminChapters(
	mangaId: string,
	opts: ListAdminChaptersOptions = {}
): Promise<AdminChaptersPage> {
	const { limit, offset } = opts;
	const query = new URLSearchParams();
	if (limit != null) query.set('limit', String(limit));
	if (offset != null) query.set('offset', String(offset));

	const encoded = query.toString();
	const suffix = encoded ? `?${encoded}` : '';
	const route = `/v1/admin/mangas/${encodeURIComponent(mangaId)}/chapters`;
	return request<AdminChaptersPage>(`${route}${suffix}`);
}
// ---- system ----------------------------------------------------------------
/** Disk usage figures inside `SystemStats`. */
export type DiskStats = {
total_bytes: number;
used_bytes: number;
free_bytes: number;
// NOTE(review): presumably on a 0-100 scale (client tests use 50) — confirm against the backend.
percent_used: number;
};
/** Memory usage figures inside `SystemStats`. */
export type MemoryStats = {
total_bytes: number;
used_bytes: number;
percent_used: number;
};
/** CPU usage figure inside `SystemStats`. */
export type CpuStats = {
percent_used: number;
};
/** One entry of `SystemStats.alerts`; only the 'warning' level is modelled here. */
export type Alert = {
level: 'warning';
message: string;
};
/** Response shape of `getSystemStats`. `disk` may be null; the client tests
 * describe that case as the backend reporting a non-local store. */
export type SystemStats = {
disk: DiskStats | null;
memory: MemoryStats;
cpu: CpuStats;
alerts: Alert[];
};
/** GET /v1/admin/system — resolves with the disk/memory/cpu/alerts envelope. */
export async function getSystemStats(): Promise<SystemStats> {
	const path = '/v1/admin/system';
	return request<SystemStats>(path);
}
// ---- force resync ----------------------------------------------------------
/** Response shape of `resyncManga`: the manga plus what the resync changed. */
export type MangaResyncResponse = {
manga: MangaDetail;
metadata_status: 'new' | 'updated' | 'unchanged';
cover_fetched: boolean;
};
/** Response shape of `resyncChapter`. */
export type ChapterResyncResponse = {
chapter: Chapter;
outcome: 'fetched' | 'skipped';
/** Page count when `outcome === 'fetched'`; null when skipped. */
pages: number | null;
};
/**
 * POST /v1/admin/mangas/:id/resync — re-fetches metadata and cover from
 * the manga's live crawler source.
 *
 * This is long-running (one HTTP request per Chromium nav + image
 * download), so the UI should disable the trigger and surface progress.
 */
export async function resyncManga(id: string): Promise<MangaResyncResponse> {
	const path = `/v1/admin/mangas/${encodeURIComponent(id)}/resync`;
	return request<MangaResyncResponse>(path, { method: 'POST' });
}
/**
 * POST /v1/admin/chapters/:id/resync — force-refetches a chapter's pages
 * even if `page_count > 0`. Long-running, with the same caveat as
 * `resyncManga`.
 */
export async function resyncChapter(id: string): Promise<ChapterResyncResponse> {
	const path = `/v1/admin/chapters/${encodeURIComponent(id)}/resync`;
	return request<ChapterResyncResponse>(path, { method: 'POST' });
}
// ---- crawler observability + control ---------------------------------------
/** Current daemon activity. Discriminated on `state`; each variant carries
 * only the fields listed for it. */
export type CrawlerPhase =
| { state: 'idle'; next_fire: string | null }
| { state: 'walking_list' }
| { state: 'fetching_metadata'; index: number; total: number | null; title: string }
| { state: 'cover_backfill'; index: number; total: number };
/** A chapter being crawled right now, with a live page count.
 * `pages_total` may be null. */
export type ActiveChapter = {
manga_id: string;
manga_title: string;
chapter_id: string;
chapter_number: number;
pages_done: number;
pages_total: number | null;
};
/** Counters of the most recent crawler pass; `at` may be null. */
export type CrawlerLastPass = {
at: string | null;
discovered: number;
upserted: number;
covers_fetched: number;
mangas_failed: number;
};
/** Response shape of `getCrawlerStatus`; also the payload of each event on
 * the stream described at `crawlerStatusStreamUrl`. */
export type CrawlerStatus = {
daemon: 'running' | 'disabled';
phase: CrawlerPhase | null;
worker_count: number;
active_chapters: ActiveChapter[];
current_cover: { manga_id: string; manga_title: string } | null;
covers_queued: number;
last_pass: CrawlerLastPass;
session: { expired: boolean; configured: boolean };
browser: 'healthy' | 'draining' | 'restarting' | 'down';
queue: { pending: number; running: number; dead: number };
};
/** GET /v1/admin/crawler — resolves with the crawler status snapshot. */
export async function getCrawlerStatus(): Promise<CrawlerStatus> {
	const path = '/v1/admin/crawler';
	return request<CrawlerStatus>(path);
}
/**
 * Builds the URL of the Server-Sent Events live-status stream.
 *
 * Open it with `new EventSource(...)` while the crawler page is mounted and
 * close it on navigate-away, so the subscription is scoped to the active
 * page. Each message is a named `status` event whose `data` is a
 * {@link CrawlerStatus}.
 */
export function crawlerStatusStreamUrl(): string {
	const streamPath = '/v1/admin/crawler/stream';
	return apiUrl(streamPath);
}
/** POST /v1/admin/crawler/run — triggers an out-of-cycle metadata pass and
 * resolves with the `started` flag from the response. */
export async function runCrawlerPass(): Promise<{ started: boolean }> {
	const init = { method: 'POST' };
	return request<{ started: boolean }>('/v1/admin/crawler/run', init);
}
/** POST /v1/admin/crawler/browser/restart — coordinated Chromium restart.
 * Resolves with an `ok` flag and an `error` string that may be null. */
export async function restartCrawlerBrowser(): Promise<{ ok: boolean; error: string | null }> {
	const init = { method: 'POST' };
	return request<{ ok: boolean; error: string | null }>(
		'/v1/admin/crawler/browser/restart',
		init
	);
}
/**
 * POST /v1/admin/crawler/session — refreshes PHPSESSID and re-probes.
 *
 * The value is sent as `{ phpsessid }` in a JSON body. Resolves with a
 * `valid` flag and an `error` string that may be null.
 */
export async function updateCrawlerSession(
	phpsessid: string
): Promise<{ valid: boolean; error: string | null }> {
	const payload = JSON.stringify({ phpsessid });
	return request<{ valid: boolean; error: string | null }>('/v1/admin/crawler/session', {
		method: 'POST',
		headers: { 'content-type': 'application/json' },
		body: payload
	});
}
/** POST /v1/admin/crawler/session/clear-expired — resumes idled workers.
 * Resolves with the `cleared` flag from the response. */
export async function clearCrawlerSessionExpired(): Promise<{ cleared: boolean }> {
	const init = { method: 'POST' };
	return request<{ cleared: boolean }>('/v1/admin/crawler/session/clear-expired', init);
}
/** One entry of a `DeadJobsPage`. The chapter/manga references and
 * `last_error` may be null. */
export type DeadJob = {
id: string;
kind: string;
chapter_id: string | null;
manga_id: string | null;
manga_title: string | null;
chapter_number: number | null;
attempts: number;
max_attempts: number;
last_error: string | null;
updated_at: string;
};
/** One page of dead jobs, as resolved by `listDeadJobs`. */
export type DeadJobsPage = { items: DeadJob[]; page: Page };
/**
 * GET /v1/admin/crawler/dead-jobs — fetches one page of dead jobs.
 *
 * `opts` may be omitted entirely. `search` is forwarded only when
 * non-empty; `limit` and `offset` whenever they are not null/undefined.
 */
export async function listDeadJobs(opts?: {
	search?: string;
	limit?: number;
	offset?: number;
}): Promise<DeadJobsPage> {
	const query = new URLSearchParams();
	const search = opts?.search;
	const limit = opts?.limit;
	const offset = opts?.offset;
	if (search) query.set('search', search);
	if (limit != null) query.set('limit', String(limit));
	if (offset != null) query.set('offset', String(offset));

	const encoded = query.toString();
	const suffix = encoded ? `?${encoded}` : '';
	return request<DeadJobsPage>(`/v1/admin/crawler/dead-jobs${suffix}`);
}
/** Requeue scope: all dead jobs, one manga's, one chapter's, or a single job.
 * Discriminated on `scope`; every variant except 'all' carries the id of
 * its target. Sent as the JSON body by `requeueDeadJobs`. */
export type RequeueScope =
| { scope: 'all' }
| { scope: 'manga'; manga_id: string }
| { scope: 'chapter'; chapter_id: string }
| { scope: 'job'; job_id: string };
/** POST /v1/admin/crawler/dead-jobs/requeue — sends the scope object as
 * the JSON body and resolves with the `requeued` count from the response. */
export async function requeueDeadJobs(scope: RequeueScope): Promise<{ requeued: number }> {
	const payload = JSON.stringify(scope);
	return request<{ requeued: number }>('/v1/admin/crawler/dead-jobs/requeue', {
		method: 'POST',
		headers: { 'content-type': 'application/json' },
		body: payload
	});
}
/** A queued/running chapter-content job (which chapters are queued).
 * `state` is either 'pending' or 'running'; the chapter/manga references
 * may be null. */
export type ActiveJob = {
id: string;
chapter_id: string | null;
manga_id: string | null;
manga_title: string | null;
chapter_number: number | null;
state: 'pending' | 'running';
attempts: number;
max_attempts: number;
updated_at: string;
};
/** One page of active jobs, as resolved by `listActiveJobs`. */
export type ActiveJobsPage = { items: ActiveJob[]; page: Page };
/**
 * GET /v1/admin/crawler/active-jobs — which chapters of which mangas are
 * queued or running now.
 *
 * `opts` may be omitted entirely. `search` is forwarded only when
 * non-empty; `limit` and `offset` whenever they are not null/undefined.
 */
export async function listActiveJobs(opts?: {
	search?: string;
	limit?: number;
	offset?: number;
}): Promise<ActiveJobsPage> {
	const query = new URLSearchParams();
	const search = opts?.search;
	const limit = opts?.limit;
	const offset = opts?.offset;
	if (search) query.set('search', search);
	if (limit != null) query.set('limit', String(limit));
	if (offset != null) query.set('offset', String(offset));

	const encoded = query.toString();
	const suffix = encoded ? `?${encoded}` : '';
	return request<ActiveJobsPage>(`/v1/admin/crawler/active-jobs${suffix}`);
}
/** A manga queued for a cover fetch (no cover yet + a live source). */
export type MissingCover = { manga_id: string; manga_title: string };
/** One page of missing-cover entries, as resolved by `listMissingCovers`. */
export type MissingCoversPage = { items: MissingCover[]; page: Page };
/**
 * GET /v1/admin/crawler/covers — which manga covers are queued.
 *
 * `opts` may be omitted entirely. `search` is forwarded only when
 * non-empty; `limit` and `offset` whenever they are not null/undefined.
 */
export async function listMissingCovers(opts?: {
	search?: string;
	limit?: number;
	offset?: number;
}): Promise<MissingCoversPage> {
	const query = new URLSearchParams();
	const search = opts?.search;
	const limit = opts?.limit;
	const offset = opts?.offset;
	if (search) query.set('search', search);
	if (limit != null) query.set('limit', String(limit));
	if (offset != null) query.set('offset', String(offset));

	const encoded = query.toString();
	const suffix = encoded ? `?${encoded}` : '';
	return request<MissingCoversPage>(`/v1/admin/crawler/covers${suffix}`);
}

View File

@@ -14,8 +14,7 @@ import {
me,
changePassword,
createToken,
deleteToken,
getAuthConfig
deleteToken
} from './auth';
function ok(body: unknown, status = 200): Response {
@@ -95,11 +94,6 @@ describe('auth api client', () => {
expect(url).toMatch(/\/v1\/auth\/logout$/);
const init = fetchSpy.mock.calls[0][1] as RequestInit;
expect(init.method).toBe('POST');
// Consistent content-type for all mutation requests, matching
// the rest of the module — axum doesn't require it but the
// header keeps the request style uniform.
const headers = new Headers(init.headers);
expect(headers.get('content-type')).toBe('application/json');
});
it('me returns the user on 200', async () => {
@@ -170,17 +164,6 @@ describe('auth api client', () => {
expect(url).toMatch(/\/v1\/auth\/tokens$/);
});
it('getAuthConfig GETs /v1/auth/config and parses the flag', async () => {
	fetchSpy.mockResolvedValueOnce(ok({ self_register_enabled: false }));

	const config = await getAuthConfig();
	expect(config.self_register_enabled).toBe(false);

	const requested = fetchSpy.mock.calls[0][0] as string;
	expect(requested).toMatch(/\/v1\/auth\/config$/);

	// Public endpoint; an absent method override means the default, GET.
	const sentInit = fetchSpy.mock.calls[0][1] as RequestInit;
	const verb = sentInit?.method ?? 'GET';
	expect(verb).toBe('GET');
});
it('deleteToken DELETEs to /v1/auth/tokens/{id} and handles 204', async () => {
fetchSpy.mockResolvedValueOnce(noContent());
await expect(deleteToken('t1')).resolves.toBeUndefined();

View File

@@ -4,7 +4,6 @@ export type User = {
id: string;
username: string;
created_at: string;
is_admin: boolean;
};
export type Credentials = {
@@ -33,14 +32,7 @@ export async function login(creds: Credentials): Promise<User> {
}
export async function logout(): Promise<void> {
await request<void>('/v1/auth/logout', {
method: 'POST',
// Consistent with the other POST/PATCH helpers in this module.
// axum doesn't require it (no body), but keeping the header
// on every mutation request avoids the false-flag in logs and
// matches the project's style.
headers: { 'content-type': 'application/json' }
});
await request<void>('/v1/auth/logout', { method: 'POST' });
}
export type ChangePassword = {
@@ -100,19 +92,3 @@ export async function createToken(name: string): Promise<CreatedToken> {
/** DELETE /v1/auth/tokens/:id — the id is URL-encoded before being placed
 * in the path. Resolves with no value. */
export async function deleteToken(id: string): Promise<void> {
	const path = `/v1/auth/tokens/${encodeURIComponent(id)}`;
	await request<void>(path, { method: 'DELETE' });
}
/** Response shape of `getAuthConfig` (GET /v1/auth/config). */
export type AuthConfig = {
/** Effective value (`allow_self_register && !private_mode`).
 * When false, /v1/auth/register returns 403 and the UI should
 * hide its register affordance. Admins can still mint accounts
 * via POST /v1/admin/users. */
self_register_enabled: boolean;
/** When true, every read endpoint requires auth and anonymous
 * visitors are redirected to `/login` (see `+layout.ts`). */
private_mode: boolean;
};
/** GET /v1/auth/config. Public — no auth, no cookie required. */
export async function getAuthConfig(): Promise<AuthConfig> {
	const path = '/v1/auth/config';
	return request<AuthConfig>(path);
}

View File

@@ -11,8 +11,7 @@ import {
listChapters,
getChapter,
getChapterPages,
createChapter,
chapterLabel
createChapter
} from './chapters';
function ok(body: unknown): Response {
@@ -130,18 +129,6 @@ describe('chapters api client', () => {
}
});
// `chapterLabel` prefers the site-provided title and only synthesizes a
// label from the chapter number when the title is null.
describe('chapterLabel', () => {
	it('returns the site title verbatim when present', () => {
		const titled = { number: 7, title: 'Ch.7 : Official' };
		expect(chapterLabel(titled)).toBe('Ch.7 : Official');
	});

	it('falls back to "Chapter {number}" when title is null', () => {
		const untitled = { number: 3, title: null };
		expect(chapterLabel(untitled)).toBe('Chapter 3');
	});
});
it('getChapterPages unwraps the {pages} envelope into the array', async () => {
fetchSpy.mockResolvedValueOnce(
ok({

Some files were not shown because too many files have changed in this diff Show More