Compare commits
94 Commits
5e92a2c450
...
feat/crawl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e02d125f51 | ||
|
|
fb4182f68d | ||
|
|
da6e320836 | ||
|
|
832042d2b7 | ||
|
|
ec0a8f2b5d | ||
|
|
6f0a8d88c9 | ||
|
|
41bf9455a1 | ||
|
|
cd0a1e13a9 | ||
|
|
3f91bea768 | ||
|
|
7a6815661f | ||
|
|
679abae736 | ||
|
|
b812c6d16c | ||
|
|
e93eec89e5 | ||
|
|
8818c890c5 | ||
|
|
c134bdbbde | ||
|
|
5c22dfdb41 | ||
|
|
e50fc093c3 | ||
|
|
72756cfef2 | ||
|
|
4e20350645 | ||
|
|
713ca139c4 | ||
|
|
e3cff9d874 | ||
|
|
d47e832613 | ||
|
|
c30c7a546f | ||
|
|
a0db7beb81 | ||
|
|
ecbbebafc4 | ||
|
|
8c6378b877 | ||
|
|
8557e432a2 | ||
|
|
d6d84dedcb | ||
|
|
d37b94871e | ||
| 8e39fadd21 | |||
| 3b3d13a0f6 | |||
| 0f90af80cb | |||
| 6b49a47d0a | |||
| e851355f28 | |||
| 2a0cc24c07 | |||
| a615b0aee7 | |||
|
|
a2826d6467 | ||
|
|
1eebb90e25 | ||
|
|
030b27754b | ||
|
|
2f47faa11c | ||
|
|
6dd21451a8 | ||
|
|
f6728dc71a | ||
|
|
aa2159ca06 | ||
|
|
b434c9b68d | ||
|
|
cc4ec76d17 | ||
|
|
bf7c9b5c2a | ||
|
|
0b2018ceca | ||
|
|
ab8b7acc34 | ||
|
|
9925f54695 | ||
|
|
eaa5afda50 | ||
|
|
5c04b0532b | ||
|
|
655ea42731 | ||
|
|
70e8a7895c | ||
|
|
8e0b638e3f | ||
|
|
e2bd1462ba | ||
|
|
9f56f283d4 | ||
|
|
33f7e19077 | ||
|
|
c6bb9160e3 | ||
|
|
50763addcf | ||
|
|
766c6eebac | ||
|
|
c686d6eb51 | ||
|
|
dea9b1aaa8 | ||
|
|
f57ca8e45c | ||
|
|
8d34132883 | ||
|
|
c5c1179e9d | ||
|
|
c320eda7cd | ||
|
|
bd9a6bd257 | ||
|
|
ebc1966103 | ||
|
|
e4333631e1 | ||
|
|
e7662d18d6 | ||
|
|
45ce0d8f12 | ||
|
|
51f42b03e9 | ||
|
|
fa0a7da311 | ||
|
|
9ff49166a5 | ||
|
|
b845d88766 | ||
|
|
9fe0f26d75 | ||
|
|
93c7fd63fc | ||
|
|
89b84252a5 | ||
|
|
728d704a66 | ||
|
|
d24e68c78d | ||
|
|
51346227dd | ||
|
|
c51353ead3 | ||
|
|
b1a3a4e9d3 | ||
|
|
26eccd0abe | ||
|
|
89b8785a40 | ||
|
|
64ccc0ba84 | ||
|
|
215325ad2f | ||
|
|
7aa6e7e6d9 | ||
|
|
c95c1805df | ||
|
|
21f44cea3f | ||
|
|
58e637085d | ||
|
|
19c1276490 | ||
|
|
7560d59616 | ||
|
|
274cc819ca |
89
.env.example
89
.env.example
@@ -1,20 +1,30 @@
|
|||||||
# Copy to .env for `docker compose up --build`. Local-dev runs (cargo run
|
# Copy to .env for `docker compose up --build`. Local-dev runs (cargo run
|
||||||
# / npm run dev) read backend/.env if present, or pick up the variables
|
# / npm run dev) read backend/.env if present, or pick up the variables
|
||||||
# from your shell.
|
# from your shell.
|
||||||
|
#
|
||||||
|
# Production note: COOKIE_SECURE=true (the default below) makes browsers
|
||||||
|
# refuse to send the session cookie over plain HTTP. Run with a TLS-
|
||||||
|
# terminating reverse proxy (Caddy, Traefik, nginx) in front — the
|
||||||
|
# compose file here doesn't ship one. Local/dev runs without HTTPS
|
||||||
|
# should set COOKIE_SECURE=false.
|
||||||
|
|
||||||
# ----- Postgres -----
|
# ----- Postgres -----
|
||||||
# These are read by the Postgres container *and* by DATABASE_URL below;
|
# These are read by the Postgres container *and* by DATABASE_URL below;
|
||||||
# changing them after the first boot won't migrate existing data, so set
|
# changing them after the first boot won't migrate existing data, so set
|
||||||
# them up front for any new deployment.
|
# them up front for any new deployment.
|
||||||
|
#
|
||||||
|
# POSTGRES_PASSWORD is REQUIRED — docker-compose.yml fails fast if it
|
||||||
|
# isn't set in this file, to prevent a deploy without an .env booting
|
||||||
|
# Postgres with a publicly-known credential.
|
||||||
POSTGRES_USER=mangalord
|
POSTGRES_USER=mangalord
|
||||||
POSTGRES_PASSWORD=mangalord
|
POSTGRES_PASSWORD=change-me-to-a-strong-random-string
|
||||||
POSTGRES_DB=mangalord
|
POSTGRES_DB=mangalord
|
||||||
|
|
||||||
# ----- Backend -----
|
# ----- Backend -----
|
||||||
DATABASE_URL=postgres://mangalord:mangalord@postgres:5432/mangalord
|
DATABASE_URL=postgres://mangalord:mangalord@postgres:5432/mangalord
|
||||||
BIND_ADDRESS=0.0.0.0:8080
|
BIND_ADDRESS=0.0.0.0:8080
|
||||||
STORAGE_DIR=/var/lib/mangalord/storage
|
STORAGE_DIR=/var/lib/mangalord/storage
|
||||||
RUST_LOG=info,mangalord=debug
|
RUST_LOG=info,mangalord=debug,chromiumoxide::conn=off,chromiumoxide::handler=off
|
||||||
|
|
||||||
# ----- Auth / cookies -----
|
# ----- Auth / cookies -----
|
||||||
# COOKIE_SECURE controls whether the `Secure` flag is set on the session
|
# COOKIE_SECURE controls whether the `Secure` flag is set on the session
|
||||||
@@ -29,6 +39,13 @@ COOKIE_DOMAIN=
|
|||||||
# get reaped lazily.
|
# get reaped lazily.
|
||||||
SESSION_TTL_DAYS=30
|
SESSION_TTL_DAYS=30
|
||||||
|
|
||||||
|
# ----- Auth brute-force rate limits -----
|
||||||
|
# Token-bucket budget shared across /auth/login, /auth/register, and
|
||||||
|
# /auth/me/password. Set per_sec=0 to disable (e.g. behind a
|
||||||
|
# rate-limiting reverse proxy that already enforces a budget).
|
||||||
|
AUTH_RATE_PER_SEC=5
|
||||||
|
AUTH_RATE_BURST=10
|
||||||
|
|
||||||
# ----- CORS -----
|
# ----- CORS -----
|
||||||
# Comma-separated origins allowed to call the API with credentials.
|
# Comma-separated origins allowed to call the API with credentials.
|
||||||
# Default is empty: same-origin only. Set when frontend and backend live
|
# Default is empty: same-origin only. Set when frontend and backend live
|
||||||
@@ -44,6 +61,69 @@ MAX_REQUEST_BYTES=209715200
|
|||||||
# Default 20 MiB.
|
# Default 20 MiB.
|
||||||
MAX_FILE_BYTES=20971520
|
MAX_FILE_BYTES=20971520
|
||||||
|
|
||||||
|
# ----- Crawler download safety -----
|
||||||
|
# Hosts the crawler is allowed to fetch images/covers from, in addition
|
||||||
|
# to CRAWLER_START_URL's host and CRAWLER_CDN_HOST. Comma-separated.
|
||||||
|
# Defends against SSRF via scraped <img src="http://10.0.0.1/...">.
|
||||||
|
CRAWLER_DOWNLOAD_ALLOWLIST=
|
||||||
|
# Bypass the host allowlist entirely. Intended for sources that shard
|
||||||
|
# images across numbered CDN subdomains (cdn1/cdn2/…) where enumerating
|
||||||
|
# every host upfront is impractical. The private-IP / localhost / non-
|
||||||
|
# http(s) scheme defenses STAY ON — a scraped <img src="http://10.0.0.1/">
|
||||||
|
# is still refused with this flag set.
|
||||||
|
CRAWLER_ALLOW_ANY_HOST=false
|
||||||
|
# Hard cap on a single image body. Default 32 MiB.
|
||||||
|
CRAWLER_MAX_IMAGE_BYTES=33554432
|
||||||
|
# Max manga detail fetches per metadata pass (both the in-process daemon
|
||||||
|
# and the `bin/crawler` CLI). 0 means no cap — let the source walker run
|
||||||
|
# to completion. Useful for capped test runs against a new source.
|
||||||
|
CRAWLER_LIMIT=0
|
||||||
|
# Path to a system Chromium binary. When set, the crawler skips the
|
||||||
|
# bundled-fetcher download. Required on platforms without a usable
|
||||||
|
# upstream Chromium build (notably Linux_arm64 / Raspberry Pi). On
|
||||||
|
# Debian: /usr/bin/chromium-headless-shell or /usr/bin/chromium. On
|
||||||
|
# Ubuntu the package is chromium-browser (different path). Pair with
|
||||||
|
# `docker compose build --build-arg INSTALL_CHROMIUM=true backend` so
|
||||||
|
# the image actually contains the binary.
|
||||||
|
CRAWLER_CHROMIUM_BINARY=
|
||||||
|
|
||||||
|
# ----- Crawler TOR proxy + recircuit -----
|
||||||
|
# The compose stack ships a `tor` service (dockurr/tor) and defaults
|
||||||
|
# CRAWLER_PROXY to it, so by default all crawler traffic exits via the
|
||||||
|
# TOR network. To opt out, set CRAWLER_PROXY= (empty) AND
|
||||||
|
# CRAWLER_TOR_CONTROL_URL= (empty) below — the tor service can stay
|
||||||
|
# running, it just won't be used.
|
||||||
|
#
|
||||||
|
# Going through TOR adds latency to every fetch; image downloads in
|
||||||
|
# particular slow noticeably. The win is on sites that rate-limit or
|
||||||
|
# fingerprint by exit IP — NEWNYM recircuiting makes a fresh exit
|
||||||
|
# cheap to reach for.
|
||||||
|
#
|
||||||
|
# CRAWLER_PROXY: SOCKS5(h) URL. Use `socks5h://` (not `socks5://`) so
|
||||||
|
# DNS resolution also goes through TOR, avoiding leaks via the host's
|
||||||
|
# resolver. Leave unset to talk to the upstream directly.
|
||||||
|
CRAWLER_PROXY=socks5h://tor:9050
|
||||||
|
# Control-port URL for SIGNAL NEWNYM ("get a fresh circuit"). Triggered
|
||||||
|
# automatically on bad pages (broken-page body, missing #logo) and on
|
||||||
|
# the Unauthenticated session probe outcome. Leave unset to disable
|
||||||
|
# the recircuit feature (the SOCKS proxy still works).
|
||||||
|
CRAWLER_TOR_CONTROL_URL=tcp://tor:9051
|
||||||
|
# Max NEWNYM-and-retry cycles per recircuit-eligible failure. Default 3.
|
||||||
|
CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS=3
|
||||||
|
|
||||||
|
# ----- TOR control-port password -----
|
||||||
|
# Shared between the bundled dockurr/tor service (which hashes it into
|
||||||
|
# its HashedControlPassword) and the backend's
|
||||||
|
# CRAWLER_TOR_CONTROL_PASSWORD. REQUIRED — docker-compose.yml fails
|
||||||
|
# fast if absent. Generate a strong random string; rotate by setting
|
||||||
|
# a new value and restarting both `tor` and `backend`.
|
||||||
|
#
|
||||||
|
# Operators running their own non-dockurr tor daemon with cookie-file
|
||||||
|
# auth can ignore this var and instead set
|
||||||
|
# CRAWLER_TOR_CONTROL_COOKIE_PATH on the backend — the TorController
|
||||||
|
# prefers cookie when both are present.
|
||||||
|
TOR_CONTROL_PASSWORD=change-me-to-a-strong-random-string
|
||||||
|
|
||||||
# ----- Frontend -----
|
# ----- Frontend -----
|
||||||
# The frontend container runs SvelteKit's Node adapter on :3000 and
|
# The frontend container runs SvelteKit's Node adapter on :3000 and
|
||||||
# proxies /api/* to BACKEND_URL via src/hooks.server.ts. In compose the
|
# proxies /api/* to BACKEND_URL via src/hooks.server.ts. In compose the
|
||||||
@@ -51,3 +131,8 @@ MAX_FILE_BYTES=20971520
|
|||||||
# internal docker network. Override only if you're running the
|
# internal docker network. Override only if you're running the
|
||||||
# frontend container against a backend somewhere else.
|
# frontend container against a backend somewhere else.
|
||||||
BACKEND_URL=http://backend:8080
|
BACKEND_URL=http://backend:8080
|
||||||
|
# Per-request wall-clock cap for the /api/* reverse proxy (milliseconds).
|
||||||
|
# Default 300000 (5 min) covers a typical 200 MiB chapter upload over
|
||||||
|
# 25 Mbps; raise for users on slower upstream links or lower if a
|
||||||
|
# tighter front proxy already bounds the request lifetime.
|
||||||
|
BACKEND_PROXY_TIMEOUT_MS=300000
|
||||||
|
|||||||
71
.gitea/README.md
Normal file
71
.gitea/README.md
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
# Gitea Actions
|
||||||
|
|
||||||
|
The [`deploy`](workflows/deploy.yml) workflow runs on every push to `main`
|
||||||
|
(and via manual `workflow_dispatch`; pull requests to `main` run only the test jobs). It tests, builds, pushes the images
|
||||||
|
to a private registry, and rolls the stack over by SSH on the target host.
|
||||||
|
|
||||||
|
## Required secrets
|
||||||
|
|
||||||
|
Set under *Repo Settings → Actions → Secrets*:
|
||||||
|
|
||||||
|
| Name | Example | Purpose |
|
||||||
|
| -------------------- | ------------------------ | ---------------------------------------------------------------- |
|
||||||
|
| `REGISTRY_URL` | `registry.example.com` | Registry host. No scheme, no trailing slash. |
|
||||||
|
| `REGISTRY_USERNAME` | `mangalord-ci` | `docker login` user. |
|
||||||
|
| `REGISTRY_PASSWORD` | `<token>` | `docker login` token/password. |
|
||||||
|
| `SSH_HOST` | `mangalord.example.com` | Deploy target hostname/IP. |
|
||||||
|
| `SSH_USER` | `deploy` | SSH user on the target (must be in the `docker` group). |
|
||||||
|
| `SSH_PRIVATE_KEY` | `-----BEGIN OPENSSH...` | Private key authorised in the target user's `authorized_keys`. |
|
||||||
|
| `SSH_PORT` | `22` | Optional. Defaults to `22` if unset. |
|
||||||
|
|
||||||
|
## Required variables
|
||||||
|
|
||||||
|
Set under *Repo Settings → Actions → Variables* (not secrets — they appear
|
||||||
|
in logs):
|
||||||
|
|
||||||
|
| Name | Example | Purpose |
|
||||||
|
| ------------- | ------------------------ | ---------------------------------------------------------------------- |
|
||||||
|
| `DEPLOY_PATH` | `/srv/mangalord` | Directory on target holding `docker-compose.yml`, `.env`, and the prod overlay. |
|
||||||
|
|
||||||
|
## One-time host setup
|
||||||
|
|
||||||
|
The workflow assumes the deploy target already has:
|
||||||
|
|
||||||
|
1. Docker + Docker Compose v2 installed and the `SSH_USER` in the `docker` group.
|
||||||
|
2. `$DEPLOY_PATH/docker-compose.yml` (copy of the repo's [docker-compose.yml](../docker-compose.yml)).
|
||||||
|
3. `$DEPLOY_PATH/docker-compose.prod.yml` (copy of the repo's [docker-compose.prod.yml](../docker-compose.prod.yml)).
|
||||||
|
4. `$DEPLOY_PATH/.env` populated from [.env.example](../.env.example) with production values (real `POSTGRES_PASSWORD`, `COOKIE_SECURE=true`, etc.).
|
||||||
|
|
||||||
|
Bootstrap once:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh deploy@mangalord.example.com
|
||||||
|
sudo mkdir -p /srv/mangalord && sudo chown deploy:deploy /srv/mangalord
|
||||||
|
cd /srv/mangalord
|
||||||
|
# place docker-compose.yml, docker-compose.prod.yml, and .env here
|
||||||
|
```
|
||||||
|
|
||||||
|
The first workflow run will pull the images, bring the stack up, and run
|
||||||
|
the embedded migrations on startup.
|
||||||
|
|
||||||
|
## Image tags
|
||||||
|
|
||||||
|
Every push produces three tags per image:
|
||||||
|
|
||||||
|
- `mangalord-{backend,frontend}:latest`
|
||||||
|
- `mangalord-{backend,frontend}:<git-sha>` — used by the deploy job; lets
|
||||||
|
you pin a deploy to a specific commit
|
||||||
|
- `mangalord-{backend,frontend}:<version>` — the version from
|
||||||
|
[backend/Cargo.toml](../backend/Cargo.toml) (verified in lockstep with
|
||||||
|
[frontend/package.json](../frontend/package.json))
|
||||||
|
|
||||||
|
## Rollback
|
||||||
|
|
||||||
|
SSH to the target, set `IMAGE_TAG` to a previous commit SHA, and re-up:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /srv/mangalord
|
||||||
|
export REGISTRY_URL=registry.example.com
|
||||||
|
export IMAGE_TAG=<previous-sha>
|
||||||
|
docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d
|
||||||
|
```
|
||||||
153
.gitea/workflows/deploy.yml
Normal file
153
.gitea/workflows/deploy.yml
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
name: deploy
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [main]
|
||||||
|
pull_request:
|
||||||
|
branches: [main]
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test-backend:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
services:
|
||||||
|
postgres:
|
||||||
|
image: postgres:16-alpine
|
||||||
|
env:
|
||||||
|
POSTGRES_USER: mangalord
|
||||||
|
POSTGRES_PASSWORD: mangalord
|
||||||
|
POSTGRES_DB: mangalord
|
||||||
|
options: >-
|
||||||
|
--health-cmd "pg_isready -U mangalord"
|
||||||
|
--health-interval 5s
|
||||||
|
--health-timeout 5s
|
||||||
|
--health-retries 10
|
||||||
|
env:
|
||||||
|
DATABASE_URL: postgres://mangalord:mangalord@postgres:5432/mangalord
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
# ubuntu-latest has node (so JS actions like checkout/cache run) but no
|
||||||
|
# Rust. We intentionally avoid `container: rust:1-slim` because act_runner
|
||||||
|
# runs JS actions with node *inside* the job container, and the slim Rust
|
||||||
|
# image ships no node (checkout would fail with exit 127).
|
||||||
|
- name: Install Rust + build deps
|
||||||
|
run: |
|
||||||
|
set -eu
|
||||||
|
SUDO=""; [ "$(id -u)" = "0" ] || SUDO="sudo"
|
||||||
|
$SUDO apt-get update
|
||||||
|
$SUDO apt-get install -y --no-install-recommends pkg-config libssl-dev ca-certificates curl
|
||||||
|
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal --default-toolchain stable
|
||||||
|
echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
|
||||||
|
- name: Cache cargo registry and target
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cargo/registry
|
||||||
|
~/.cargo/git
|
||||||
|
backend/target
|
||||||
|
key: cargo-${{ runner.os }}-${{ hashFiles('backend/Cargo.lock') }}
|
||||||
|
restore-keys: |
|
||||||
|
cargo-${{ runner.os }}-
|
||||||
|
- name: cargo test
|
||||||
|
working-directory: backend
|
||||||
|
run: cargo test --locked
|
||||||
|
|
||||||
|
test-frontend:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: actions/setup-node@v4
|
||||||
|
with:
|
||||||
|
node-version: '22'
|
||||||
|
cache: npm
|
||||||
|
cache-dependency-path: frontend/package-lock.json
|
||||||
|
- name: npm ci
|
||||||
|
working-directory: frontend
|
||||||
|
run: npm ci
|
||||||
|
- name: vitest
|
||||||
|
working-directory: frontend
|
||||||
|
run: npm test
|
||||||
|
|
||||||
|
build-and-push:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: [test-backend, test-frontend]
|
||||||
|
# PRs only run the test jobs; build + deploy are reserved for
|
||||||
|
# post-merge pushes to main.
|
||||||
|
if: github.event_name != 'pull_request'
|
||||||
|
# Build on the host docker daemon directly (docker-outside-of-docker):
|
||||||
|
# the runner shares the deploy host's daemon, so a plain `docker build`
|
||||||
|
# reuses the host's layer cache and avoids buildx's docker-container
|
||||||
|
# driver + the gha cache exporter — neither works against this single-host
|
||||||
|
# act_runner, and there is no in-job daemon socket unless we mount it.
|
||||||
|
container:
|
||||||
|
image: docker.gitea.com/runner-images:ubuntu-latest
|
||||||
|
volumes:
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
|
outputs:
|
||||||
|
image_tag: ${{ steps.meta.outputs.image_tag }}
|
||||||
|
version: ${{ steps.meta.outputs.version }}
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Resolve image tags
|
||||||
|
id: meta
|
||||||
|
run: |
|
||||||
|
version="$(grep -m1 '^version' backend/Cargo.toml | cut -d'"' -f2)"
|
||||||
|
frontend_version="$(grep -m1 '"version"' frontend/package.json | cut -d'"' -f4)"
|
||||||
|
if [ "$version" != "$frontend_version" ]; then
|
||||||
|
echo "Version mismatch: backend=$version frontend=$frontend_version" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "image_tag=${GITHUB_SHA}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "version=${version}" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- name: Build & push backend + frontend
|
||||||
|
env:
|
||||||
|
REGISTRY_URL: ${{ secrets.REGISTRY_URL }}
|
||||||
|
REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
|
||||||
|
REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
|
||||||
|
IMAGE_TAG: ${{ steps.meta.outputs.image_tag }}
|
||||||
|
VERSION: ${{ steps.meta.outputs.version }}
|
||||||
|
run: |
|
||||||
|
set -eu
|
||||||
|
echo "$REGISTRY_PASSWORD" | docker login "$REGISTRY_URL" -u "$REGISTRY_USERNAME" --password-stdin
|
||||||
|
for svc in backend frontend; do
|
||||||
|
img="$REGISTRY_URL/mangalord-$svc"
|
||||||
|
docker build -t "$img:$IMAGE_TAG" -t "$img:latest" -t "$img:$VERSION" "./$svc"
|
||||||
|
for tag in "$IMAGE_TAG" latest "$VERSION"; do docker push "$img:$tag"; done
|
||||||
|
done
|
||||||
|
docker logout "$REGISTRY_URL"
|
||||||
|
|
||||||
|
deploy:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: build-and-push
|
||||||
|
if: github.event_name != 'pull_request'
|
||||||
|
# Single-host deploy: the runner lives on the same box as the stack, so we
|
||||||
|
# drive the host docker daemon directly (the job mounts the host docker
|
||||||
|
# socket) instead of SSHing out. The compose dir is bind-mounted at its
|
||||||
|
# REAL host path so compose's relative bind-mounts (./mangalord/...,
|
||||||
|
# ./Caddyfile) resolve; both paths must be in the runner's
|
||||||
|
# container.valid_volumes. The central compose references the images as
|
||||||
|
# registry.mc02.dev/mangalord-*:${MANGALORD_TAG:-latest}, so we only pull
|
||||||
|
# and recreate the two mangalord services at the freshly built SHA.
|
||||||
|
container:
|
||||||
|
image: docker:cli
|
||||||
|
volumes:
|
||||||
|
- /mnt/ssd/docker-data:/mnt/ssd/docker-data
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
|
steps:
|
||||||
|
- name: Deploy to the local stack
|
||||||
|
working-directory: /mnt/ssd/docker-data
|
||||||
|
env:
|
||||||
|
REGISTRY_URL: ${{ secrets.REGISTRY_URL }}
|
||||||
|
REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
|
||||||
|
REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
|
||||||
|
IMAGE_TAG: ${{ needs.build-and-push.outputs.image_tag }}
|
||||||
|
run: |
|
||||||
|
set -eu
|
||||||
|
echo "$REGISTRY_PASSWORD" | docker login "$REGISTRY_URL" -u "$REGISTRY_USERNAME" --password-stdin
|
||||||
|
export MANGALORD_TAG="$IMAGE_TAG"
|
||||||
|
docker compose pull mangalord-backend mangalord-frontend
|
||||||
|
docker compose up -d mangalord-backend mangalord-frontend
|
||||||
|
docker image prune -f
|
||||||
|
docker logout "$REGISTRY_URL"
|
||||||
1516
backend/Cargo.lock
generated
1516
backend/Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,8 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
version = "0.16.0"
|
version = "0.54.0"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
default-run = "mangalord"
|
||||||
|
|
||||||
[lib]
|
[lib]
|
||||||
path = "src/lib.rs"
|
path = "src/lib.rs"
|
||||||
@@ -10,6 +11,10 @@ path = "src/lib.rs"
|
|||||||
name = "mangalord"
|
name = "mangalord"
|
||||||
path = "src/main.rs"
|
path = "src/main.rs"
|
||||||
|
|
||||||
|
[[bin]]
|
||||||
|
name = "crawler"
|
||||||
|
path = "src/bin/crawler.rs"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
axum = { version = "0.7", features = ["macros", "multipart"] }
|
axum = { version = "0.7", features = ["macros", "multipart"] }
|
||||||
tokio = { version = "1", features = ["full"] }
|
tokio = { version = "1", features = ["full"] }
|
||||||
@@ -18,6 +23,7 @@ serde = { version = "1", features = ["derive"] }
|
|||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
uuid = { version = "1", features = ["v4", "serde"] }
|
uuid = { version = "1", features = ["v4", "serde"] }
|
||||||
chrono = { version = "0.4", features = ["serde"] }
|
chrono = { version = "0.4", features = ["serde"] }
|
||||||
|
chrono-tz = "0.9"
|
||||||
tracing = "0.1"
|
tracing = "0.1"
|
||||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||||
tower = { version = "0.5", features = ["util"] }
|
tower = { version = "0.5", features = ["util"] }
|
||||||
@@ -36,7 +42,13 @@ time = "0.3"
|
|||||||
infer = "0.16"
|
infer = "0.16"
|
||||||
tokio-util = { version = "0.7", features = ["io"] }
|
tokio-util = { version = "0.7", features = ["io"] }
|
||||||
futures-core = "0.3"
|
futures-core = "0.3"
|
||||||
|
futures-util = "0.3"
|
||||||
bytes = "1"
|
bytes = "1"
|
||||||
|
chromiumoxide = { version = "0.7", features = ["tokio-runtime", "_fetcher-rusttls-tokio"], default-features = false }
|
||||||
|
sysinfo = { version = "0.32", default-features = false, features = ["system"] }
|
||||||
|
nix = { version = "0.29", features = ["fs"] }
|
||||||
|
scraper = "0.20"
|
||||||
|
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "socks", "cookies", "stream"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
tempfile = "3"
|
tempfile = "3"
|
||||||
@@ -44,3 +56,14 @@ tower = { version = "0.5", features = ["util"] }
|
|||||||
http-body-util = "0.1"
|
http-body-util = "0.1"
|
||||||
mime = "0.3"
|
mime = "0.3"
|
||||||
futures-util = "0.3"
|
futures-util = "0.3"
|
||||||
|
tokio = { version = "1", features = ["test-util"] }
|
||||||
|
|
||||||
|
# Trim debug builds: keep line numbers in panics / backtraces but drop the
|
||||||
|
# full DWARF info (variable-level inspection in gdb/lldb). With a sqlx +
|
||||||
|
# axum + tokio dep tree the default ("full") leaves backend/target on the
|
||||||
|
# order of tens of GiB; this typically cuts ~50–70% off that.
|
||||||
|
[profile.dev]
|
||||||
|
debug = "line-tables-only"
|
||||||
|
|
||||||
|
[profile.test]
|
||||||
|
debug = "line-tables-only"
|
||||||
|
|||||||
@@ -10,7 +10,8 @@ RUN apt-get update \
|
|||||||
# exact crate versions CI tested. Without Cargo.lock + the flag, cargo
|
# exact crate versions CI tested. Without Cargo.lock + the flag, cargo
|
||||||
# would silently resolve fresh on every image build.
|
# would silently resolve fresh on every image build.
|
||||||
COPY Cargo.toml Cargo.lock ./
|
COPY Cargo.toml Cargo.lock ./
|
||||||
RUN mkdir src && echo "fn main() {}" > src/main.rs && echo "" > src/lib.rs \
|
RUN mkdir -p src/bin && echo "fn main() {}" > src/main.rs && echo "" > src/lib.rs \
|
||||||
|
&& echo "fn main() {}" > src/bin/crawler.rs \
|
||||||
&& cargo build --locked --release \
|
&& cargo build --locked --release \
|
||||||
&& rm -rf src
|
&& rm -rf src
|
||||||
|
|
||||||
@@ -18,13 +19,68 @@ COPY src ./src
|
|||||||
COPY migrations ./migrations
|
COPY migrations ./migrations
|
||||||
RUN touch src/main.rs src/lib.rs && cargo build --locked --release
|
RUN touch src/main.rs src/lib.rs && cargo build --locked --release
|
||||||
|
|
||||||
FROM debian:bookworm-slim
|
FROM debian:trixie-slim
|
||||||
|
# Runtime base must match the builder's Debian release: `rust:1-slim` tracks
|
||||||
|
# trixie (glibc 2.41), so a bookworm runtime (glibc 2.36) can't run the
|
||||||
|
# binary ("GLIBC_2.39 not found"). Keep these two in lockstep on bumps.
|
||||||
|
# `curl` is for the container HEALTHCHECK; `ca-certificates` is for
|
||||||
|
# outbound HTTPS (crawler covers/pages).
|
||||||
|
#
|
||||||
|
# INSTALL_CHROMIUM is an opt-in for deployments that can't use the
|
||||||
|
# chromiumoxide fetcher path (notably Linux_arm64 / Raspberry Pi, where
|
||||||
|
# the upstream snapshot bucket has no usable build). When `true`, adds
|
||||||
|
# Debian's apt-packaged headless chromium plus a baseline font set —
|
||||||
|
# pair with `CRAWLER_CHROMIUM_BINARY=/usr/bin/chromium-headless-shell`
|
||||||
|
# at runtime so the launcher uses it. Default `false` keeps cloud/x86
|
||||||
|
# images slim.
|
||||||
|
#
|
||||||
|
# Build the Pi image with:
|
||||||
|
# docker compose build --build-arg INSTALL_CHROMIUM=true backend
|
||||||
|
ARG INSTALL_CHROMIUM=false
|
||||||
RUN apt-get update \
|
RUN apt-get update \
|
||||||
&& apt-get install -y --no-install-recommends ca-certificates \
|
&& apt-get install -y --no-install-recommends ca-certificates curl \
|
||||||
|
&& if [ "$INSTALL_CHROMIUM" = "true" ]; then \
|
||||||
|
apt-get install -y --no-install-recommends chromium-headless-shell fonts-liberation; \
|
||||||
|
fi \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Non-root runtime user. The API binary doesn't need any root
|
||||||
|
# privilege; the crawler daemon's Chromium launcher uses --no-sandbox
|
||||||
|
# precisely because user-namespace sandboxing is fragile, so dropping
|
||||||
|
# privileges costs nothing operationally and shrinks the blast radius
|
||||||
|
# of any RCE.
|
||||||
|
ARG APP_UID=10001
|
||||||
|
ARG APP_GID=10001
|
||||||
|
RUN groupadd --system --gid ${APP_GID} app \
|
||||||
|
&& useradd --system --uid ${APP_UID} --gid app --home-dir /home/app --create-home --shell /usr/sbin/nologin app
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
COPY --from=builder /app/target/release/mangalord /usr/local/bin/mangalord
|
COPY --from=builder /app/target/release/mangalord /usr/local/bin/mangalord
|
||||||
COPY --from=builder /app/migrations /app/migrations
|
COPY --from=builder /app/migrations /app/migrations
|
||||||
|
|
||||||
ENV STORAGE_DIR=/var/lib/mangalord/storage
|
ENV STORAGE_DIR=/var/lib/mangalord/storage
|
||||||
|
# Pre-create the storage dir so the entrypoint doesn't need to
|
||||||
|
# mkdir-as-root and so the named volume mount inherits the right
|
||||||
|
# ownership.
|
||||||
|
#
|
||||||
|
# UPGRADE NOTE for operators: if you're moving from an older image
|
||||||
|
# that ran as root, the existing `storage-data` volume has files owned
|
||||||
|
# by UID 0 and the new UID-10001 user can't write them. Run once
|
||||||
|
# before the upgrade:
|
||||||
|
# docker compose run --rm --user 0 backend \
|
||||||
|
# chown -R 10001:10001 /var/lib/mangalord/storage
|
||||||
|
# (Postgres is unaffected — that image's `postgres` user UID hasn't
|
||||||
|
# changed.)
|
||||||
|
RUN mkdir -p ${STORAGE_DIR} \
|
||||||
|
&& chown -R app:app ${STORAGE_DIR} /app /home/app
|
||||||
|
|
||||||
|
USER app
|
||||||
EXPOSE 8080
|
EXPOSE 8080
|
||||||
|
|
||||||
|
# `--start-period` is generous because first boot runs sqlx::migrate
|
||||||
|
# against postgres which can take a few seconds; subsequent restarts
|
||||||
|
# are sub-second.
|
||||||
|
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
|
||||||
|
CMD curl -fsS http://localhost:8080/api/v1/health > /dev/null || exit 1
|
||||||
|
|
||||||
CMD ["mangalord"]
|
CMD ["mangalord"]
|
||||||
|
|||||||
31
backend/migrations/0010_collections.sql
Normal file
31
backend/migrations/0010_collections.sql
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
-- User-owned manga collections. Each user can curate any number of
|
||||||
|
-- named lists (e.g., "Favorites", "Reading list"); mangas can belong
|
||||||
|
-- to many collections of many users without restriction.
|
||||||
|
|
||||||
|
CREATE TABLE collections (
|
||||||
|
id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||||
|
user_id uuid NOT NULL REFERENCES users(id) ON DELETE CASCADE,
|
||||||
|
name text NOT NULL,
|
||||||
|
description text,
|
||||||
|
created_at timestamptz NOT NULL DEFAULT now(),
|
||||||
|
updated_at timestamptz NOT NULL DEFAULT now()
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Per-user case-insensitive name uniqueness so "Favorites" and
|
||||||
|
-- "favorites" don't both end up in someone's sidebar.
|
||||||
|
CREATE UNIQUE INDEX collections_user_name_lower_uniq
|
||||||
|
ON collections (user_id, lower(name));
|
||||||
|
|
||||||
|
CREATE INDEX collections_user_idx ON collections (user_id, created_at DESC);
|
||||||
|
|
||||||
|
CREATE TABLE collection_mangas (
|
||||||
|
collection_id uuid NOT NULL REFERENCES collections(id) ON DELETE CASCADE,
|
||||||
|
manga_id uuid NOT NULL REFERENCES mangas(id) ON DELETE CASCADE,
|
||||||
|
added_at timestamptz NOT NULL DEFAULT now(),
|
||||||
|
PRIMARY KEY (collection_id, manga_id)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Reverse lookup: which collections contain this manga? Used by the
|
||||||
|
-- "Add to collection" modal to pre-check the boxes for the user's
|
||||||
|
-- collections this manga is already in.
|
||||||
|
CREATE INDEX collection_mangas_manga_idx ON collection_mangas (manga_id);
|
||||||
39
backend/migrations/0011_history.sql
Normal file
39
backend/migrations/0011_history.sql
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
-- Reading progress per user, plus uploader attribution.
--
-- Progress uses the simplest shape that still supports "jump to the last
-- read chapter": a single row per (user, manga). The reader writes through
-- when a chapter is opened and (debounced) when the page advances; the
-- history view lists rows by most recent touch.
--
-- Attribution adds a nullable `uploaded_by` column to both upload sinks.
-- Rows that pre-date this migration stay NULL because the original
-- handlers never recorded the uploader; new uploads stamp the current user.

CREATE TABLE read_progress (
    user_id    UUID        NOT NULL REFERENCES users(id) ON DELETE CASCADE,
    manga_id   UUID        NOT NULL REFERENCES mangas(id) ON DELETE CASCADE,
    -- Nullable on purpose: deleting a chapter must not wipe the whole
    -- progress row. The history UI shows "(chapter removed)" instead.
    chapter_id UUID        REFERENCES chapters(id) ON DELETE SET NULL,
    page       INTEGER     NOT NULL DEFAULT 1 CHECK (page >= 1),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    PRIMARY KEY (user_id, manga_id)
);

-- Nearly every query here is "this user's rows, most recent first"; the
-- composite index serves both the filter and the ordering.
CREATE INDEX read_progress_user_idx
    ON read_progress (user_id, updated_at DESC);

-- Uploader attribution on mangas. The index is partial so the NULL
-- (historical) rows cost nothing.
ALTER TABLE mangas
    ADD COLUMN uploaded_by UUID REFERENCES users(id) ON DELETE SET NULL;

CREATE INDEX mangas_uploaded_by_idx
    ON mangas (uploaded_by, created_at DESC)
    WHERE uploaded_by IS NOT NULL;

-- Same attribution column and partial index for chapters.
ALTER TABLE chapters
    ADD COLUMN uploaded_by UUID REFERENCES users(id) ON DELETE SET NULL;

CREATE INDEX chapters_uploaded_by_idx
    ON chapters (uploaded_by, created_at DESC)
    WHERE uploaded_by IS NOT NULL;
|
||||||
72
backend/migrations/0012_crawler.sql
Normal file
72
backend/migrations/0012_crawler.sql
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
-- Crawler schema.
--
-- Follows the approach of 0001_init.sql: a new concept gets its own table
-- joined to the existing ones rather than extra columns on
-- `mangas`/`chapters`. A crawled manga IS a manga; the source-link tables
-- only record where it came from and when it was last seen, which keeps
-- the API and the frontend source-agnostic.

-- 1. Source registry: one row per site the crawler knows about. `config`
--    holds per-site knobs (base URL, rate limits, custom selectors), so
--    adding a source means inserting a row and implementing the `Source`
--    trait, with no schema change.
CREATE TABLE sources (
    id         TEXT        PRIMARY KEY,
    name       TEXT        NOT NULL,
    base_url   TEXT        NOT NULL,
    enabled    BOOLEAN     NOT NULL DEFAULT true,
    config     JSONB       NOT NULL DEFAULT '{}'::jsonb,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- 2. Link tables. `(source_id, source_*_key)` is the natural key exposed
--    by the source itself; the FK to `mangas`/`chapters` ties the row back
--    into our domain. `metadata_hash` lets `crawler::diff` detect updates
--    without comparing every field. `last_seen_at` together with
--    `dropped_at` forms the soft-drop pair.
CREATE TABLE manga_sources (
    source_id        TEXT        NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
    source_manga_key TEXT        NOT NULL,
    manga_id         UUID        NOT NULL REFERENCES mangas(id) ON DELETE CASCADE,
    source_url       TEXT        NOT NULL,
    metadata_hash    TEXT,
    first_seen_at    TIMESTAMPTZ NOT NULL DEFAULT now(),
    last_seen_at     TIMESTAMPTZ NOT NULL DEFAULT now(),
    dropped_at       TIMESTAMPTZ,
    PRIMARY KEY (source_id, source_manga_key)
);

CREATE INDEX manga_sources_manga_idx
    ON manga_sources (manga_id);

CREATE INDEX manga_sources_last_seen_idx
    ON manga_sources (source_id, last_seen_at);

CREATE TABLE chapter_sources (
    source_id          TEXT        NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
    source_chapter_key TEXT        NOT NULL,
    chapter_id         UUID        NOT NULL REFERENCES chapters(id) ON DELETE CASCADE,
    source_url         TEXT        NOT NULL,
    first_seen_at      TIMESTAMPTZ NOT NULL DEFAULT now(),
    last_seen_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    dropped_at         TIMESTAMPTZ,
    PRIMARY KEY (source_id, source_chapter_key)
);

CREATE INDEX chapter_sources_chapter_idx
    ON chapter_sources (chapter_id);

-- 3. Persistent job queue. Workers lease rows with
--    `FOR UPDATE SKIP LOCKED`, heartbeat through `leased_until`, and
--    acknowledge by moving `state` forward. The partial index keeps the hot
--    path (pick the next ready job) away from the bulk of done/dead rows.
CREATE TABLE crawler_jobs (
    id           UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    payload      JSONB       NOT NULL,
    state        TEXT        NOT NULL DEFAULT 'pending'
                             CHECK (state IN ('pending','running','done','failed','dead')),
    attempts     INTEGER     NOT NULL DEFAULT 0,
    max_attempts INTEGER     NOT NULL DEFAULT 5,
    scheduled_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    leased_until TIMESTAMPTZ,
    last_error   TEXT,
    created_at   TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at   TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE INDEX crawler_jobs_ready_idx
    ON crawler_jobs (scheduled_at)
    WHERE state IN ('pending', 'failed');
|
||||||
18
backend/migrations/0013_drop_chapters_unique_number.sql
Normal file
18
backend/migrations/0013_drop_chapters_unique_number.sql
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
-- Sources in the wild publish several chapters under one number:
-- different uploaders, translator notices or farewells, paid-vs-free
-- re-uploads. Our own users can also legitimately hold two versions of
-- "Ch.52" from different scanlations. The (manga_id, number) UNIQUE from
-- 0001_init silently folds all of those into one row through ON CONFLICT
-- and loses data. Remove the constraint; from here on the chapter id
-- (UUID) is the only chapter identity.

ALTER TABLE chapters
    DROP CONSTRAINT chapters_manga_id_number_key;

-- That UNIQUE was also the only index on (manga_id, number), because 0007
-- dropped the redundant explicit one. Chapter lists ORDER BY number ASC
-- and the manga page is a hot read path, so recreate the index without
-- uniqueness. `created_at` is the secondary sort key so chapters sharing
-- a number keep a stable order in lists and in prev/next navigation.
CREATE INDEX chapters_manga_id_number_idx
    ON chapters (manga_id, number, created_at);
|
||||||
15
backend/migrations/0014_crawler_jobs_dedup_index.sql
Normal file
15
backend/migrations/0014_crawler_jobs_dedup_index.sql
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
-- De-duplicate in-flight SyncChapterContent jobs.
--
-- Without this index the daemon's bookmark and cron enqueue paths would
-- need a pre-check followed by an insert, which races under concurrency.
-- With a partial unique index both producers can use a plain
-- `INSERT ... ON CONFLICT DO NOTHING`: at most one (pending|running) job
-- exists per chapter_id, and the slot is released as soon as the job moves
-- to done/failed/dead, so the chapter can be enqueued again after the row
-- is reaped or when a force-refetch is wanted.
--
-- Limited to sync_chapter_content payloads, so Discover / SyncManga /
-- SyncChapterList jobs (which carry no chapter_id) are not de-duplicated.
CREATE UNIQUE INDEX crawler_jobs_chapter_content_dedup_idx
    ON crawler_jobs ((payload->>'chapter_id'))
    WHERE state IN ('pending', 'running')
      AND payload->>'kind' = 'sync_chapter_content';
|
||||||
12
backend/migrations/0015_crawler_state.sql
Normal file
12
backend/migrations/0015_crawler_state.sql
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
-- Tiny key-value store for daemon state that must survive a restart.
--
-- The only user so far is the cron scheduler (`last_metadata_tick_at`):
-- it lets the scheduler notice that the latest slot was missed (for
-- example the backend was down at midnight), fire once immediately on
-- startup, and then resume the regular schedule. The value column is
-- JSONB so later keys can store richer payloads without a new migration.
CREATE TABLE crawler_state (
    key        TEXT        PRIMARY KEY,
    value      JSONB       NOT NULL,
    updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
|
||||||
15
backend/migrations/0016_crawler_jobs_drop_failed_state.sql
Normal file
15
backend/migrations/0016_crawler_jobs_drop_failed_state.sql
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
-- The partial index created in 0012 covers
-- `state IN ('pending','failed')`, yet `ack_failed` in
-- src/crawler/jobs.rs only ever writes `dead` or `pending`; `failed` is
-- never set. The `failed` branch therefore matches no row and is dead
-- weight on every write.
--
-- Rebuild the index without that branch. The CHECK constraint on `state`
-- still accepts `'failed'`, so a later migration can adopt that
-- terminal-but-retryable state without another schema change.

DROP INDEX IF EXISTS crawler_jobs_ready_idx;

CREATE INDEX crawler_jobs_ready_idx
    ON crawler_jobs (scheduled_at)
    WHERE state = 'pending';
|
||||||
20
backend/migrations/0017_chapter_sources_per_manga.sql
Normal file
20
backend/migrations/0017_chapter_sources_per_manga.sql
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
-- chapter_sources: replace the global (source_id, source_chapter_key)
-- primary key with (source_id, chapter_id).
--
-- The old key assumed chapter slugs are unique across a whole source.
-- Sources that name chapters per manga (chapter-1, chapter-2, ...) rather
-- than per catalog (br_chapter-379272 from a global counter) would collide
-- on the second manga: the INSERT would conflict on
-- (source_id, "chapter-1") and the lookup would attribute the row to the
-- first manga's chapter_id.
--
-- The new key is the natural identity of a source attachment: "this
-- source has this chapter". A plain (source_id, source_chapter_key) index
-- keeps the lookup path (find the existing source row by the source's own
-- identifier) without enforcing uniqueness; the application combines it
-- with the chapters table's manga_id to scope the lookup per manga.

ALTER TABLE chapter_sources
    DROP CONSTRAINT chapter_sources_pkey;

ALTER TABLE chapter_sources
    ADD PRIMARY KEY (source_id, chapter_id);

CREATE INDEX chapter_sources_source_key_idx
    ON chapter_sources (source_id, source_chapter_key);
|
||||||
5
backend/migrations/0018_admin_role.sql
Normal file
5
backend/migrations/0018_admin_role.sql
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
-- Admin flag on users. The first admin is bootstrapped from the
-- ADMIN_USERNAME / ADMIN_PASSWORD environment variables at startup (see
-- app::build). Demotion is immediate: the RequireAdmin extractor re-reads
-- the user row on every request, so changing this column applies to the
-- next call without purging sessions.
ALTER TABLE users
    ADD COLUMN is_admin BOOLEAN NOT NULL DEFAULT false;
|
||||||
20
backend/migrations/0019_admin_audit.sql
Normal file
20
backend/migrations/0019_admin_audit.sql
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
-- Admin audit log. Rows are written inside the same transaction as the
-- action they describe, so a failed COMMIT discards the audit row too and
-- the log never reports an action that did not happen.
--
-- `actor_user_id` uses ON DELETE SET NULL so audit rows outlive a deleted
-- admin: "who promoted Bob to admin?" can still be answered after Alice's
-- account is gone. `target_id` is deliberately not a foreign key, because
-- future audit kinds may point at non-user rows (manga, source, ...) and
-- one typed FK cannot express that.
CREATE TABLE admin_audit (
    id            UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    actor_user_id UUID        REFERENCES users(id) ON DELETE SET NULL,
    action        TEXT        NOT NULL,
    target_kind   TEXT        NOT NULL,
    target_id     UUID,
    payload       JSONB       NOT NULL DEFAULT '{}'::jsonb,
    at            TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Newest-first listing of the log.
CREATE INDEX admin_audit_at_idx
    ON admin_audit (at DESC);
|
||||||
14
backend/migrations/0020_admin_jobs_payload_index.sql
Normal file
14
backend/migrations/0020_admin_jobs_payload_index.sql
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
-- Deriving per-manga sync state joins crawler_jobs to manga_sources on
-- (payload->>'source_id', payload->>'source_manga_key') for the
-- `sync_manga` job kind, whose payload carries no manga_id. Without an
-- index that join degrades to a sequential scan of crawler_jobs on every
-- admin manga listing, which gets noticeably slower as the daily metadata
-- pass grows the job table.
--
-- Partial on `state IN ('pending','running')`: only in-flight jobs are
-- indexed, since the bulk of the table is done/dead and irrelevant to
-- "is this manga being synced right now".
--
-- NOTE(review): only `source_manga_key` is indexed; the `source_id` half
-- of the join is presumably applied as a filter afterwards. Confirm
-- against the query plan.
CREATE INDEX crawler_jobs_sync_manga_key_idx
    ON crawler_jobs ((payload->>'source_manga_key'))
    WHERE state IN ('pending', 'running')
      AND payload->>'kind' = 'sync_manga';
|
||||||
18
backend/migrations/0021_chapter_source_index.sql
Normal file
18
backend/migrations/0021_chapter_source_index.sql
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
-- Record where each chapter sits in the source site's chapter list so the
-- user-facing list can keep the site's order: variants of one chapter
-- number (e.g. "Ch.14 : PH" beside "Ch.14 : Official") stay adjacent, and
-- non-numeric entries such as "notice. : Officials" land where the site
-- put them instead of clustering at the top under number = 0.
--
-- A lower source_index is closer to the top of the source DOM, which on
-- this site (it renders newest-first) means a newer chapter. The list
-- query undoes that with ORDER BY source_index DESC so the oldest chapter
-- comes first in our UI.
--
-- NULL marks user-uploaded chapters (no source row) and crawled rows
-- older than this migration. The list query keeps the existing
-- (number, created_at) tiebreak via NULLS LAST, so those rows behave as
-- before until the next crawler tick fills the column.
ALTER TABLE chapters
    ADD COLUMN source_index INTEGER;
|
||||||
491
backend/src/api/admin/crawler.rs
Normal file
491
backend/src/api/admin/crawler.rs
Normal file
@@ -0,0 +1,491 @@
|
|||||||
|
//! Admin-only crawler observability + control endpoints.
|
||||||
|
//!
|
||||||
|
//! Mounted under `/api/v1/admin/crawler*`, cookie-only via `RequireAdmin`.
|
||||||
|
//! All control endpoints return 503 when the crawler daemon is disabled
|
||||||
|
//! (`AppState.crawler == None`). Reads compose the live in-process status
|
||||||
|
//! ([`crate::crawler::status`]) with DB-derived queue counts and the
|
||||||
|
//! session/browser flags.
|
||||||
|
|
||||||
|
use std::convert::Infallible;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use axum::extract::{Query, State};
|
||||||
|
use axum::response::sse::{Event, KeepAlive, Sse};
|
||||||
|
use axum::routing::{get, post};
|
||||||
|
use axum::{Json, Router};
|
||||||
|
use futures_util::stream::Stream;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use serde_json::json;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::app::{AppState, CrawlerControl};
|
||||||
|
use crate::auth::extractor::RequireAdmin;
|
||||||
|
use crate::crawler::browser_manager::RestartPhase;
|
||||||
|
use crate::crawler::status::{ActiveChapter, CoverTarget, LastPass, Phase};
|
||||||
|
use crate::error::{AppError, AppResult};
|
||||||
|
use crate::repo;
|
||||||
|
use crate::repo::crawler::{ActiveJob, DeadJob, MissingCoverRow, RequeueScope};
|
||||||
|
|
||||||
|
/// Upper bound on how stale a streamed snapshot may become.
///
/// Phase, worker and session changes are pushed immediately through the
/// status `watch`; this timer only covers the DB-derived queue counts and
/// the browser phase, which can change without a status poke.
const SSE_BACKSTOP: Duration = Duration::from_secs(5);
|
||||||
|
|
||||||
|
pub fn routes() -> Router<AppState> {
|
||||||
|
Router::new()
|
||||||
|
.route("/admin/crawler", get(get_status))
|
||||||
|
.route("/admin/crawler/stream", get(stream_status))
|
||||||
|
.route("/admin/crawler/run", post(run_now))
|
||||||
|
.route("/admin/crawler/browser/restart", post(restart_browser))
|
||||||
|
.route("/admin/crawler/session", post(update_session))
|
||||||
|
.route(
|
||||||
|
"/admin/crawler/session/clear-expired",
|
||||||
|
post(clear_session_expired),
|
||||||
|
)
|
||||||
|
.route("/admin/crawler/dead-jobs", get(list_dead_jobs))
|
||||||
|
.route("/admin/crawler/dead-jobs/requeue", post(requeue_dead_jobs))
|
||||||
|
.route("/admin/crawler/active-jobs", get(list_active_jobs))
|
||||||
|
.route("/admin/crawler/covers", get(list_covers))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// GET /admin/crawler — live status
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
struct QueueCounts {
|
||||||
|
pending: i64,
|
||||||
|
running: i64,
|
||||||
|
dead: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
struct SessionStatus {
|
||||||
|
/// Whether the sticky session-expired flag is set (chapter workers idle).
|
||||||
|
expired: bool,
|
||||||
|
/// Whether a PHPSESSID is currently configured at all.
|
||||||
|
configured: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
struct CrawlerStatusResponse {
|
||||||
|
/// `"running"` | `"disabled"`.
|
||||||
|
daemon: &'static str,
|
||||||
|
phase: Option<Phase>,
|
||||||
|
/// Configured chapter-worker count (for "N busy / M workers").
|
||||||
|
worker_count: usize,
|
||||||
|
/// Chapters being crawled right now, with live page counts.
|
||||||
|
active_chapters: Vec<ActiveChapter>,
|
||||||
|
/// The cover being fetched right now, if any.
|
||||||
|
current_cover: Option<CoverTarget>,
|
||||||
|
/// Mangas still queued for a cover fetch.
|
||||||
|
covers_queued: i64,
|
||||||
|
last_pass: LastPass,
|
||||||
|
session: SessionStatus,
|
||||||
|
/// `"healthy"` | `"draining"` | `"restarting"` | `"down"`.
|
||||||
|
browser: &'static str,
|
||||||
|
queue: QueueCounts,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn browser_phase_str(p: RestartPhase) -> &'static str {
|
||||||
|
match p {
|
||||||
|
RestartPhase::Healthy => "healthy",
|
||||||
|
RestartPhase::Draining => "draining",
|
||||||
|
RestartPhase::Restarting => "restarting",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compose a full status snapshot from the in-memory status, the
|
||||||
|
/// browser/session flags, and a fresh DB queue-count query. Shared by the
|
||||||
|
/// one-shot `get_status` and the SSE `stream_status`.
|
||||||
|
async fn compose_status(state: &AppState) -> AppResult<CrawlerStatusResponse> {
|
||||||
|
let (pending, running, dead) = repo::crawler::job_state_counts(&state.db).await?;
|
||||||
|
let queue = QueueCounts {
|
||||||
|
pending,
|
||||||
|
running,
|
||||||
|
dead,
|
||||||
|
};
|
||||||
|
let covers_queued = repo::crawler::count_missing_covers(&state.db).await?;
|
||||||
|
|
||||||
|
Ok(match state.crawler.as_ref() {
|
||||||
|
None => CrawlerStatusResponse {
|
||||||
|
daemon: "disabled",
|
||||||
|
phase: None,
|
||||||
|
worker_count: 0,
|
||||||
|
active_chapters: Vec::new(),
|
||||||
|
current_cover: None,
|
||||||
|
covers_queued,
|
||||||
|
last_pass: LastPass::default(),
|
||||||
|
session: SessionStatus {
|
||||||
|
expired: false,
|
||||||
|
configured: false,
|
||||||
|
},
|
||||||
|
browser: "down",
|
||||||
|
queue,
|
||||||
|
},
|
||||||
|
Some(c) => {
|
||||||
|
let snap = c.status.snapshot().await;
|
||||||
|
CrawlerStatusResponse {
|
||||||
|
daemon: "running",
|
||||||
|
phase: Some(snap.phase),
|
||||||
|
worker_count: snap.worker_count,
|
||||||
|
active_chapters: snap.active_chapters,
|
||||||
|
current_cover: snap.current_cover,
|
||||||
|
covers_queued,
|
||||||
|
last_pass: snap.last_pass,
|
||||||
|
session: SessionStatus {
|
||||||
|
expired: c.session.is_expired(),
|
||||||
|
configured: c.session.current().await.is_some(),
|
||||||
|
},
|
||||||
|
browser: browser_phase_str(c.browser_manager.phase()),
|
||||||
|
queue,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_status(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
_admin: RequireAdmin,
|
||||||
|
) -> AppResult<Json<CrawlerStatusResponse>> {
|
||||||
|
Ok(Json(compose_status(&state).await?))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// GET /admin/crawler/stream — Server-Sent Events live status
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Push live status to the dashboard instead of polling. Emits a snapshot
|
||||||
|
/// immediately on connect, then on every status change (instant, via the
|
||||||
|
/// `watch` notifier) and on a [`SSE_BACKSTOP`] tick (to refresh DB queue
|
||||||
|
/// counts / browser phase that change without a status poke). The browser
|
||||||
|
/// opens this only while the crawler page is mounted and closes it on
|
||||||
|
/// navigate-away, so the subscription is scoped to the active page.
|
||||||
|
async fn stream_status(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
_admin: RequireAdmin,
|
||||||
|
) -> Sse<impl Stream<Item = Result<Event, Infallible>>> {
|
||||||
|
// Subscribe before the first emit so no change between the initial
|
||||||
|
// snapshot and the first await is lost.
|
||||||
|
let rx = state.crawler.as_ref().map(|c| c.status.subscribe());
|
||||||
|
|
||||||
|
let stream = futures_util::stream::unfold(
|
||||||
|
(state, rx, true),
|
||||||
|
|(state, mut rx, first)| async move {
|
||||||
|
// After the first immediate emit, wait for a change or the
|
||||||
|
// backstop tick before recomposing.
|
||||||
|
if !first {
|
||||||
|
match rx.as_mut() {
|
||||||
|
Some(rx) => {
|
||||||
|
tokio::select! {
|
||||||
|
_ = rx.changed() => {}
|
||||||
|
_ = tokio::time::sleep(SSE_BACKSTOP) => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => tokio::time::sleep(SSE_BACKSTOP).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Compose; on a transient DB error, emit a keep-alive comment
|
||||||
|
// rather than tearing down the stream.
|
||||||
|
let event = match compose_status(&state).await {
|
||||||
|
Ok(resp) => Event::default()
|
||||||
|
.event("status")
|
||||||
|
.json_data(&resp)
|
||||||
|
.unwrap_or_else(|_| Event::default().comment("serialize error")),
|
||||||
|
Err(_) => Event::default().comment("status unavailable"),
|
||||||
|
};
|
||||||
|
Some((Ok(event), (state, rx, false)))
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
Sse::new(stream).keep_alive(KeepAlive::default())
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// POST /admin/crawler/run — trigger an out-of-cycle metadata pass
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
struct RunResponse {
|
||||||
|
started: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn run_now(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
admin: RequireAdmin,
|
||||||
|
) -> AppResult<Json<RunResponse>> {
|
||||||
|
let c = require_crawler(&state)?;
|
||||||
|
let mp = c.metadata_pass.as_ref().ok_or_else(|| {
|
||||||
|
AppError::ServiceUnavailable("no source configured (CRAWLER_START_URL unset)".into())
|
||||||
|
})?;
|
||||||
|
let mp = std::sync::Arc::clone(mp);
|
||||||
|
// Fire-and-forget: the pass can run for minutes; the dashboard polls
|
||||||
|
// status for progress. Overlap with the daily cron is rare (daily) and
|
||||||
|
// both serialise on the single browser lease.
|
||||||
|
tokio::spawn(async move {
|
||||||
|
if let Err(e) = mp.run().await {
|
||||||
|
tracing::warn!(error = ?e, "manual metadata pass failed");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
repo::admin_audit::insert(&state.db, admin.0.id, "crawler_run", "crawler", None, json!({}))
|
||||||
|
.await?;
|
||||||
|
Ok(Json(RunResponse { started: true }))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// POST /admin/crawler/browser/restart — coordinated restart
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
struct RestartResponse {
|
||||||
|
ok: bool,
|
||||||
|
error: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn restart_browser(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
admin: RequireAdmin,
|
||||||
|
) -> AppResult<Json<RestartResponse>> {
|
||||||
|
let c = require_crawler(&state)?;
|
||||||
|
let result = c.browser_manager.coordinated_restart(c.drain_deadline).await;
|
||||||
|
// A successful coordinated_restart re-runs on_launch, which re-injects
|
||||||
|
// PHPSESSID and re-probes — i.e. the session is live. Drop the sticky
|
||||||
|
// `session_expired` flag so chapter workers stop idling without
|
||||||
|
// requiring a second click on "Clear expired".
|
||||||
|
if result.is_ok() {
|
||||||
|
c.session.clear_expired();
|
||||||
|
}
|
||||||
|
// Push the post-restart browser phase to live subscribers immediately.
|
||||||
|
c.status.poke();
|
||||||
|
repo::admin_audit::insert(
|
||||||
|
&state.db,
|
||||||
|
admin.0.id,
|
||||||
|
"crawler_browser_restart",
|
||||||
|
"crawler",
|
||||||
|
None,
|
||||||
|
json!({ "ok": result.is_ok() }),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
Ok(Json(match result {
|
||||||
|
Ok(()) => RestartResponse {
|
||||||
|
ok: true,
|
||||||
|
error: None,
|
||||||
|
},
|
||||||
|
Err(e) => RestartResponse {
|
||||||
|
ok: false,
|
||||||
|
error: Some(format!("{e:#}")),
|
||||||
|
},
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// POST /admin/crawler/session — refresh PHPSESSID
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
struct UpdateSessionRequest {
|
||||||
|
phpsessid: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
struct UpdateSessionResponse {
|
||||||
|
/// Whether the post-update browser relaunch + session probe succeeded.
|
||||||
|
valid: bool,
|
||||||
|
error: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn update_session(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
admin: RequireAdmin,
|
||||||
|
Json(body): Json<UpdateSessionRequest>,
|
||||||
|
) -> AppResult<Json<UpdateSessionResponse>> {
|
||||||
|
let c = require_crawler(&state)?;
|
||||||
|
c.session
|
||||||
|
.update(&body.phpsessid)
|
||||||
|
.await
|
||||||
|
.map_err(|e| AppError::InvalidInput(format!("{e:#}")))?;
|
||||||
|
// Relaunch the browser so on_launch re-injects the new cookie and
|
||||||
|
// re-probes — the restart's success IS the session-validity signal.
|
||||||
|
let probe = c.browser_manager.coordinated_restart(c.drain_deadline).await;
|
||||||
|
// Session + browser state changed — push to live subscribers.
|
||||||
|
c.status.poke();
|
||||||
|
repo::admin_audit::insert(
|
||||||
|
&state.db,
|
||||||
|
admin.0.id,
|
||||||
|
"crawler_session_update",
|
||||||
|
"crawler",
|
||||||
|
None,
|
||||||
|
json!({ "valid": probe.is_ok() }),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
Ok(Json(match probe {
|
||||||
|
Ok(()) => UpdateSessionResponse {
|
||||||
|
valid: true,
|
||||||
|
error: None,
|
||||||
|
},
|
||||||
|
Err(e) => UpdateSessionResponse {
|
||||||
|
valid: false,
|
||||||
|
error: Some(format!("{e:#}")),
|
||||||
|
},
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
struct ClearExpiredResponse {
|
||||||
|
cleared: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn clear_session_expired(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
admin: RequireAdmin,
|
||||||
|
) -> AppResult<Json<ClearExpiredResponse>> {
|
||||||
|
let c = require_crawler(&state)?;
|
||||||
|
c.session.clear_expired();
|
||||||
|
// session.expired flipped — push to live subscribers.
|
||||||
|
c.status.poke();
|
||||||
|
repo::admin_audit::insert(
|
||||||
|
&state.db,
|
||||||
|
admin.0.id,
|
||||||
|
"crawler_session_clear_expired",
|
||||||
|
"crawler",
|
||||||
|
None,
|
||||||
|
json!({}),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
Ok(Json(ClearExpiredResponse { cleared: true }))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Dead jobs
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize, Default)]
|
||||||
|
struct DeadJobsParams {
|
||||||
|
#[serde(default)]
|
||||||
|
search: Option<String>,
|
||||||
|
#[serde(default = "default_limit")]
|
||||||
|
limit: i64,
|
||||||
|
#[serde(default)]
|
||||||
|
offset: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Page size applied by serde when a request omits `limit`.
fn default_limit() -> i64 {
    const DEFAULT_PAGE_SIZE: i64 = 50;
    DEFAULT_PAGE_SIZE
}
|
||||||
|
|
||||||
|
async fn list_dead_jobs(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
_admin: RequireAdmin,
|
||||||
|
Query(params): Query<DeadJobsParams>,
|
||||||
|
) -> AppResult<Json<crate::api::pagination::PagedResponse<DeadJob>>> {
|
||||||
|
let limit = params.limit.clamp(1, 200);
|
||||||
|
let offset = params.offset.max(0);
|
||||||
|
let search = params.search.filter(|s| !s.trim().is_empty());
|
||||||
|
let (items, total) =
|
||||||
|
repo::crawler::list_dead_jobs(&state.db, search.as_deref(), limit, offset).await?;
|
||||||
|
Ok(Json(crate::api::pagination::PagedResponse::with_total(
|
||||||
|
items, limit, offset, total,
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
#[serde(tag = "scope", rename_all = "snake_case")]
|
||||||
|
enum RequeueRequest {
|
||||||
|
All,
|
||||||
|
Manga { manga_id: Uuid },
|
||||||
|
Chapter { chapter_id: Uuid },
|
||||||
|
Job { job_id: Uuid },
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
struct RequeueResponse {
|
||||||
|
requeued: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn requeue_dead_jobs(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
admin: RequireAdmin,
|
||||||
|
Json(body): Json<RequeueRequest>,
|
||||||
|
) -> AppResult<Json<RequeueResponse>> {
|
||||||
|
let scope = match &body {
|
||||||
|
RequeueRequest::All => RequeueScope::All,
|
||||||
|
RequeueRequest::Manga { manga_id } => RequeueScope::Manga(*manga_id),
|
||||||
|
RequeueRequest::Chapter { chapter_id } => RequeueScope::Chapter(*chapter_id),
|
||||||
|
RequeueRequest::Job { job_id } => RequeueScope::Job(*job_id),
|
||||||
|
};
|
||||||
|
let requeued = repo::crawler::requeue_dead_jobs(&state.db, scope).await?;
|
||||||
|
repo::admin_audit::insert(
|
||||||
|
&state.db,
|
||||||
|
admin.0.id,
|
||||||
|
"crawler_dead_jobs_requeue",
|
||||||
|
"crawler",
|
||||||
|
None,
|
||||||
|
json!({ "requeued": requeued, "scope": scope_label(&body) }),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
Ok(Json(RequeueResponse { requeued }))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn scope_label(r: &RequeueRequest) -> &'static str {
|
||||||
|
match r {
|
||||||
|
RequeueRequest::All => "all",
|
||||||
|
RequeueRequest::Manga { .. } => "manga",
|
||||||
|
RequeueRequest::Chapter { .. } => "chapter",
|
||||||
|
RequeueRequest::Job { .. } => "job",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Queued-chapters + queued-covers backlogs (paginated, fetched on demand)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Pagination + title-search params shared by the backlog list endpoints.
|
||||||
|
#[derive(Debug, Deserialize, Default)]
|
||||||
|
struct ListParams {
|
||||||
|
#[serde(default)]
|
||||||
|
search: Option<String>,
|
||||||
|
#[serde(default = "default_limit")]
|
||||||
|
limit: i64,
|
||||||
|
#[serde(default)]
|
||||||
|
offset: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn list_active_jobs(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
_admin: RequireAdmin,
|
||||||
|
Query(params): Query<ListParams>,
|
||||||
|
) -> AppResult<Json<crate::api::pagination::PagedResponse<ActiveJob>>> {
|
||||||
|
let limit = params.limit.clamp(1, 200);
|
||||||
|
let offset = params.offset.max(0);
|
||||||
|
let search = params.search.filter(|s| !s.trim().is_empty());
|
||||||
|
let (items, total) =
|
||||||
|
repo::crawler::list_active_jobs(&state.db, search.as_deref(), limit, offset).await?;
|
||||||
|
Ok(Json(crate::api::pagination::PagedResponse::with_total(
|
||||||
|
items, limit, offset, total,
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn list_covers(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
_admin: RequireAdmin,
|
||||||
|
Query(params): Query<ListParams>,
|
||||||
|
) -> AppResult<Json<crate::api::pagination::PagedResponse<MissingCoverRow>>> {
|
||||||
|
let limit = params.limit.clamp(1, 200);
|
||||||
|
let offset = params.offset.max(0);
|
||||||
|
let search = params.search.filter(|s| !s.trim().is_empty());
|
||||||
|
let (items, total) =
|
||||||
|
repo::crawler::list_missing_cover_mangas(&state.db, search.as_deref(), limit, offset)
|
||||||
|
.await?;
|
||||||
|
Ok(Json(crate::api::pagination::PagedResponse::with_total(
|
||||||
|
items, limit, offset, total,
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
fn require_crawler(state: &AppState) -> Result<&std::sync::Arc<CrawlerControl>, AppError> {
|
||||||
|
state.crawler.as_ref().ok_or_else(|| {
|
||||||
|
AppError::ServiceUnavailable("crawler daemon is disabled".into())
|
||||||
|
})
|
||||||
|
}
|
||||||
110
backend/src/api/admin/mangas.rs
Normal file
110
backend/src/api/admin/mangas.rs
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
//! Admin manga/chapter overview with derived sync state.
|
||||||
|
//!
|
||||||
|
//! Sync state comes from `repo::admin_view`, which joins the manga /
|
||||||
|
//! chapter tables with the crawler signals at query time — there is no
|
||||||
|
//! persisted sync_state column. See [`repo::admin_view`] for the
|
||||||
|
//! derivation priority order.
|
||||||
|
|
||||||
|
use axum::extract::{Path, Query, State};
|
||||||
|
use axum::routing::get;
|
||||||
|
use axum::{Json, Router};
|
||||||
|
use serde::Deserialize;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::api::pagination::PagedResponse;
|
||||||
|
use crate::app::AppState;
|
||||||
|
use crate::auth::extractor::RequireAdmin;
|
||||||
|
use crate::domain::MangaSyncState;
|
||||||
|
use crate::error::{AppError, AppResult};
|
||||||
|
use crate::repo;
|
||||||
|
use crate::repo::admin_view::{AdminChapterRow, AdminMangaRow};
|
||||||
|
|
||||||
|
pub fn routes() -> Router<AppState> {
|
||||||
|
Router::new()
|
||||||
|
.route("/admin/mangas", get(list_mangas))
|
||||||
|
.route("/admin/mangas/:id/chapters", get(list_chapters))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize, Default)]
|
||||||
|
pub struct ListChaptersParams {
|
||||||
|
#[serde(default = "default_chapter_limit")]
|
||||||
|
pub limit: i64,
|
||||||
|
#[serde(default)]
|
||||||
|
pub offset: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Serde fallback for the chapter listing's `limit` parameter: 200 rows.
fn default_chapter_limit() -> i64 {
    const DEFAULT_CHAPTER_PAGE_SIZE: i64 = 200;
    DEFAULT_CHAPTER_PAGE_SIZE
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize, Default)]
|
||||||
|
pub struct ListMangasParams {
|
||||||
|
#[serde(default)]
|
||||||
|
pub search: Option<String>,
|
||||||
|
/// `in_progress` | `dropped` | `synced`. Unrecognised values are a 400.
|
||||||
|
#[serde(default)]
|
||||||
|
pub sync_state: Option<String>,
|
||||||
|
#[serde(default = "default_limit")]
|
||||||
|
pub limit: i64,
|
||||||
|
#[serde(default)]
|
||||||
|
pub offset: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Serde fallback for the manga listing's `limit` parameter: 50 rows.
fn default_limit() -> i64 {
    const DEFAULT_PAGE_SIZE: i64 = 50;
    DEFAULT_PAGE_SIZE
}
|
||||||
|
|
||||||
|
async fn list_mangas(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
_admin: RequireAdmin,
|
||||||
|
Query(params): Query<ListMangasParams>,
|
||||||
|
) -> AppResult<Json<PagedResponse<AdminMangaRow>>> {
|
||||||
|
let limit = params.limit.clamp(1, 200);
|
||||||
|
let offset = params.offset.max(0);
|
||||||
|
|
||||||
|
let sync_state = match params.sync_state.as_deref() {
|
||||||
|
None | Some("") => None,
|
||||||
|
Some("in_progress") => Some(MangaSyncState::InProgress),
|
||||||
|
Some("dropped") => Some(MangaSyncState::Dropped),
|
||||||
|
Some("synced") => Some(MangaSyncState::Synced),
|
||||||
|
Some(other) => {
|
||||||
|
return Err(AppError::InvalidInput(format!(
|
||||||
|
"sync_state must be one of in_progress|dropped|synced (got {other:?})"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let q = repo::admin_view::ListAdminMangasQuery {
|
||||||
|
search: params.search.filter(|s| !s.trim().is_empty()),
|
||||||
|
sync_state,
|
||||||
|
limit,
|
||||||
|
offset,
|
||||||
|
};
|
||||||
|
let (items, total) = repo::admin_view::list_mangas_with_sync_state(&state.db, &q).await?;
|
||||||
|
Ok(Json(PagedResponse::with_total(items, limit, offset, total)))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn list_chapters(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
_admin: RequireAdmin,
|
||||||
|
Path(manga_id): Path<Uuid>,
|
||||||
|
Query(params): Query<ListChaptersParams>,
|
||||||
|
) -> AppResult<Json<PagedResponse<AdminChapterRow>>> {
|
||||||
|
// Explicit existence check so a typo / deleted manga returns 404
|
||||||
|
// rather than a misleading "no chapters" 200.
|
||||||
|
if !repo::manga::exists(&state.db, manga_id).await? {
|
||||||
|
return Err(AppError::NotFound);
|
||||||
|
}
|
||||||
|
// Cap at 500 to bound the per-row scalar-subquery cost on
|
||||||
|
// long-runners with thousands of chapters; default 200 covers
|
||||||
|
// typical browsing without paging round-trips.
|
||||||
|
let limit = params.limit.clamp(1, 500);
|
||||||
|
let offset = params.offset.max(0);
|
||||||
|
let q = repo::admin_view::ListAdminChaptersQuery {
|
||||||
|
manga_id,
|
||||||
|
limit,
|
||||||
|
offset,
|
||||||
|
};
|
||||||
|
let (items, total) = repo::admin_view::list_chapters_with_sync_state(&state.db, &q).await?;
|
||||||
|
Ok(Json(PagedResponse::with_total(items, limit, offset, total)))
|
||||||
|
}
|
||||||
24
backend/src/api/admin/mod.rs
Normal file
24
backend/src/api/admin/mod.rs
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
//! Admin-only endpoints. Mounted under `/api/v1/admin/*` by
|
||||||
|
//! `crate::api::routes`. Every handler in this subtree is guarded by
|
||||||
|
//! `RequireAdmin`, which only accepts session-cookie authentication —
|
||||||
|
//! bot/API tokens cannot reach admin routes (see
|
||||||
|
//! `crate::auth::extractor::RequireAdmin`).
|
||||||
|
|
||||||
|
pub mod crawler;
|
||||||
|
pub mod mangas;
|
||||||
|
pub mod resync;
|
||||||
|
pub mod system;
|
||||||
|
pub mod users;
|
||||||
|
|
||||||
|
use axum::Router;
|
||||||
|
|
||||||
|
use crate::app::AppState;
|
||||||
|
|
||||||
|
pub fn routes() -> Router<AppState> {
|
||||||
|
Router::new()
|
||||||
|
.merge(users::routes())
|
||||||
|
.merge(mangas::routes())
|
||||||
|
.merge(resync::routes())
|
||||||
|
.merge(system::routes())
|
||||||
|
.merge(crawler::routes())
|
||||||
|
}
|
||||||
176
backend/src/api/admin/resync.rs
Normal file
176
backend/src/api/admin/resync.rs
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
//! Admin-triggered force resync of a single manga's metadata + cover,
|
||||||
|
//! or a single chapter's content.
|
||||||
|
//!
|
||||||
|
//! Both endpoints are admin-only (`RequireAdmin`, cookie-only) and run
|
||||||
|
//! synchronously with the request — the response carries the refreshed
|
||||||
|
//! resource so the UI can swap it in without a follow-up GET. The work
|
||||||
|
//! itself is delegated to [`ResyncService`] (set on AppState by
|
||||||
|
//! `app::build` when the crawler daemon is enabled); when the daemon
|
||||||
|
//! is disabled, both handlers return 503.
|
||||||
|
|
||||||
|
use axum::extract::{Path, State};
|
||||||
|
use axum::routing::post;
|
||||||
|
use axum::{Json, Router};
|
||||||
|
use serde::Serialize;
|
||||||
|
use serde_json::json;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::app::AppState;
|
||||||
|
use crate::auth::extractor::RequireAdmin;
|
||||||
|
use crate::crawler::resync::{ChapterResyncOutcome, ResyncError};
|
||||||
|
use crate::domain::manga::MangaDetail;
|
||||||
|
use crate::domain::Chapter;
|
||||||
|
use crate::error::{AppError, AppResult};
|
||||||
|
use crate::repo;
|
||||||
|
use crate::repo::crawler::UpsertStatus;
|
||||||
|
|
||||||
|
pub fn routes() -> Router<AppState> {
|
||||||
|
Router::new()
|
||||||
|
.route("/admin/mangas/:id/resync", post(resync_manga))
|
||||||
|
.route("/admin/chapters/:id/resync", post(resync_chapter))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
pub struct MangaResyncResponse {
|
||||||
|
pub manga: MangaDetail,
|
||||||
|
/// `"new" | "updated" | "unchanged"` — mirrors [`UpsertStatus`].
|
||||||
|
pub metadata_status: &'static str,
|
||||||
|
pub cover_fetched: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
pub struct ChapterResyncResponse {
|
||||||
|
pub chapter: Chapter,
|
||||||
|
/// `"fetched" | "skipped"` — whether new pages landed or the
|
||||||
|
/// service short-circuited (e.g. chapter already had pages and the
|
||||||
|
/// session was lost so force was downgraded).
|
||||||
|
pub outcome: &'static str,
|
||||||
|
/// Page count when `outcome == "fetched"`. `None` for `skipped`.
|
||||||
|
pub pages: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn resync_manga(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
admin: RequireAdmin,
|
||||||
|
Path(manga_id): Path<Uuid>,
|
||||||
|
) -> AppResult<Json<MangaResyncResponse>> {
|
||||||
|
if !repo::manga::exists(&state.db, manga_id).await? {
|
||||||
|
return Err(AppError::NotFound);
|
||||||
|
}
|
||||||
|
let resync = state
|
||||||
|
.resync
|
||||||
|
.as_ref()
|
||||||
|
.ok_or_else(|| AppError::ServiceUnavailable(
|
||||||
|
"crawler daemon is disabled; force resync unavailable".into(),
|
||||||
|
))?;
|
||||||
|
|
||||||
|
let outcome = resync.resync_manga(manga_id).await.map_err(map_resync_err)?;
|
||||||
|
|
||||||
|
// Audit the action with the actor + the resync outcome so an
|
||||||
|
// operator-of-operators can answer "who refetched this manga, and
|
||||||
|
// did the cover land?" from the log alone.
|
||||||
|
repo::admin_audit::insert(
|
||||||
|
&state.db,
|
||||||
|
admin.0.id,
|
||||||
|
"manga_resync",
|
||||||
|
"manga",
|
||||||
|
Some(manga_id),
|
||||||
|
json!({
|
||||||
|
"metadata_status": status_str(outcome.metadata_status),
|
||||||
|
"cover_fetched": outcome.cover_fetched,
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let manga = repo::manga::get_detail(&state.db, manga_id).await?;
|
||||||
|
Ok(Json(MangaResyncResponse {
|
||||||
|
manga,
|
||||||
|
metadata_status: status_str(outcome.metadata_status),
|
||||||
|
cover_fetched: outcome.cover_fetched,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn resync_chapter(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
admin: RequireAdmin,
|
||||||
|
Path(chapter_id): Path<Uuid>,
|
||||||
|
) -> AppResult<Json<ChapterResyncResponse>> {
|
||||||
|
let resync = state
|
||||||
|
.resync
|
||||||
|
.as_ref()
|
||||||
|
.ok_or_else(|| AppError::ServiceUnavailable(
|
||||||
|
"crawler daemon is disabled; force resync unavailable".into(),
|
||||||
|
))?;
|
||||||
|
|
||||||
|
// Look up the manga the chapter belongs to so we can return the
|
||||||
|
// refreshed chapter row in the response and 404 for unknown ids.
|
||||||
|
let manga_id: Option<Uuid> =
|
||||||
|
sqlx::query_scalar("SELECT manga_id FROM chapters WHERE id = $1")
|
||||||
|
.bind(chapter_id)
|
||||||
|
.fetch_optional(&state.db)
|
||||||
|
.await?;
|
||||||
|
let Some(manga_id) = manga_id else {
|
||||||
|
return Err(AppError::NotFound);
|
||||||
|
};
|
||||||
|
|
||||||
|
let outcome = resync
|
||||||
|
.resync_chapter(chapter_id)
|
||||||
|
.await
|
||||||
|
.map_err(map_resync_err)?;
|
||||||
|
|
||||||
|
let (outcome_str, pages) = match &outcome {
|
||||||
|
ChapterResyncOutcome::Fetched { pages, .. } => ("fetched", Some(*pages)),
|
||||||
|
ChapterResyncOutcome::Skipped { .. } => ("skipped", None),
|
||||||
|
};
|
||||||
|
|
||||||
|
repo::admin_audit::insert(
|
||||||
|
&state.db,
|
||||||
|
admin.0.id,
|
||||||
|
"chapter_resync",
|
||||||
|
"chapter",
|
||||||
|
Some(chapter_id),
|
||||||
|
json!({
|
||||||
|
"outcome": outcome_str,
|
||||||
|
"pages": pages,
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let chapter = repo::chapter::find_by_id_in_manga(&state.db, manga_id, chapter_id)
|
||||||
|
.await?
|
||||||
|
.ok_or(AppError::NotFound)?;
|
||||||
|
Ok(Json(ChapterResyncResponse {
|
||||||
|
chapter,
|
||||||
|
outcome: outcome_str,
|
||||||
|
pages,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn status_str(s: UpsertStatus) -> &'static str {
|
||||||
|
match s {
|
||||||
|
UpsertStatus::New => "new",
|
||||||
|
UpsertStatus::Updated => "updated",
|
||||||
|
UpsertStatus::Unchanged => "unchanged",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Map [`ResyncError`] (and the anyhow envelopes wrapping it) onto the
|
||||||
|
/// right [`AppError`]. Anything else surfaces as a generic 500 via the
|
||||||
|
/// `Other` arm — the operator sees the underlying anyhow chain in
|
||||||
|
/// server logs, the client sees a clean envelope.
|
||||||
|
fn map_resync_err(err: anyhow::Error) -> AppError {
|
||||||
|
if let Some(rerr) = err.downcast_ref::<ResyncError>() {
|
||||||
|
match rerr {
|
||||||
|
ResyncError::NoMangaSource => AppError::ValidationFailed {
|
||||||
|
message: "manga has no live crawler source — cannot resync".into(),
|
||||||
|
details: json!({ "manga": "no_source" }),
|
||||||
|
},
|
||||||
|
ResyncError::NoChapterSource => AppError::ValidationFailed {
|
||||||
|
message: "chapter has no live crawler source — cannot resync".into(),
|
||||||
|
details: json!({ "chapter": "no_source" }),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
AppError::Other(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
163
backend/src/api/admin/system.rs
Normal file
163
backend/src/api/admin/system.rs
Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
//! System metrics for the admin dashboard.
|
||||||
|
//!
|
||||||
|
//! Disk is `statvfs(storage_dir)` so the number reflects the volume the
|
||||||
|
//! app actually writes to (not the root filesystem of the host). When the
|
||||||
|
//! storage backend doesn't expose a local path (e.g. a future S3 impl)
|
||||||
|
//! the disk fields are `null` rather than fabricated.
|
||||||
|
//!
|
||||||
|
//! Memory and CPU come from `sysinfo`. CPU requires two refreshes with
|
||||||
|
//! at least 200ms between them to compute a meaningful delta; the
|
||||||
|
//! handler eats the 250ms wall-clock cost on each request. Admin
|
||||||
|
//! traffic is low-volume so a background cache isn't worth the moving
|
||||||
|
//! parts yet — revisit if polling becomes frequent.
|
||||||
|
|
||||||
|
use std::path::Path;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use axum::extract::State;
|
||||||
|
use axum::routing::get;
|
||||||
|
use axum::{Json, Router};
|
||||||
|
use serde::Serialize;
|
||||||
|
use sysinfo::{CpuRefreshKind, MemoryRefreshKind, RefreshKind, System};
|
||||||
|
|
||||||
|
use crate::app::AppState;
|
||||||
|
use crate::auth::extractor::RequireAdmin;
|
||||||
|
use crate::error::AppResult;
|
||||||
|
|
||||||
|
/// Usage percentage at or above which the `system` handler adds a warning
/// alert for disk or memory.
const ALERT_THRESHOLD_PERCENT: f64 = 90.0;
|
||||||
|
|
||||||
|
pub fn routes() -> Router<AppState> {
|
||||||
|
Router::new().route("/admin/system", get(system))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
pub struct SystemStats {
|
||||||
|
pub disk: Option<DiskStats>,
|
||||||
|
pub memory: MemoryStats,
|
||||||
|
pub cpu: CpuStats,
|
||||||
|
pub alerts: Vec<Alert>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
pub struct DiskStats {
|
||||||
|
pub total_bytes: u64,
|
||||||
|
pub used_bytes: u64,
|
||||||
|
pub free_bytes: u64,
|
||||||
|
pub percent_used: f64,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
pub struct MemoryStats {
|
||||||
|
pub total_bytes: u64,
|
||||||
|
pub used_bytes: u64,
|
||||||
|
pub percent_used: f64,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
pub struct CpuStats {
|
||||||
|
pub percent_used: f64,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
pub struct Alert {
|
||||||
|
pub level: AlertLevel,
|
||||||
|
pub message: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Clone, Copy)]
|
||||||
|
#[serde(rename_all = "snake_case")]
|
||||||
|
pub enum AlertLevel {
|
||||||
|
Warning,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn system(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
_admin: RequireAdmin,
|
||||||
|
) -> AppResult<Json<SystemStats>> {
|
||||||
|
let disk = state.storage.local_root().and_then(disk_stats_for);
|
||||||
|
let (memory, cpu) = memory_and_cpu().await;
|
||||||
|
let mut alerts = Vec::new();
|
||||||
|
if let Some(d) = &disk {
|
||||||
|
if d.percent_used >= ALERT_THRESHOLD_PERCENT {
|
||||||
|
alerts.push(Alert {
|
||||||
|
level: AlertLevel::Warning,
|
||||||
|
message: format!(
|
||||||
|
"disk near full ({:.0}% used)",
|
||||||
|
d.percent_used
|
||||||
|
),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if memory.percent_used >= ALERT_THRESHOLD_PERCENT {
|
||||||
|
alerts.push(Alert {
|
||||||
|
level: AlertLevel::Warning,
|
||||||
|
message: format!(
|
||||||
|
"memory near full ({:.0}% used)",
|
||||||
|
memory.percent_used
|
||||||
|
),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Ok(Json(SystemStats {
|
||||||
|
disk,
|
||||||
|
memory,
|
||||||
|
cpu,
|
||||||
|
alerts,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn disk_stats_for(root: &Path) -> Option<DiskStats> {
|
||||||
|
let s = nix::sys::statvfs::statvfs(root).ok()?;
|
||||||
|
// statvfs reports `f_frsize * f_blocks` for total bytes. `f_bavail`
|
||||||
|
// is "free to non-root callers" which is what an operator actually
|
||||||
|
// cares about — `f_bfree` includes blocks reserved for root.
|
||||||
|
let block = s.fragment_size();
|
||||||
|
let total = block * s.blocks();
|
||||||
|
let avail = block * s.blocks_available();
|
||||||
|
let used = total.saturating_sub(avail);
|
||||||
|
let percent_used = if total > 0 {
|
||||||
|
(used as f64) * 100.0 / (total as f64)
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
Some(DiskStats {
|
||||||
|
total_bytes: total,
|
||||||
|
used_bytes: used,
|
||||||
|
free_bytes: avail,
|
||||||
|
percent_used,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn memory_and_cpu() -> (MemoryStats, CpuStats) {
|
||||||
|
// sysinfo's CPU sampling needs two refreshes with a delay between
|
||||||
|
// them — the first seeds the delta counters, the second measures.
|
||||||
|
// We do this once per request; admin traffic is low enough that the
|
||||||
|
// 250ms cost is invisible.
|
||||||
|
let mut sys = System::new_with_specifics(
|
||||||
|
RefreshKind::new()
|
||||||
|
.with_cpu(CpuRefreshKind::everything())
|
||||||
|
.with_memory(MemoryRefreshKind::everything()),
|
||||||
|
);
|
||||||
|
sys.refresh_cpu_all();
|
||||||
|
// Yield the runtime instead of blocking it for the gap.
|
||||||
|
tokio::time::sleep(Duration::from_millis(250)).await;
|
||||||
|
sys.refresh_cpu_all();
|
||||||
|
sys.refresh_memory();
|
||||||
|
|
||||||
|
let total = sys.total_memory();
|
||||||
|
let used = sys.used_memory();
|
||||||
|
let mem_pct = if total > 0 {
|
||||||
|
(used as f64) * 100.0 / (total as f64)
|
||||||
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
|
let memory = MemoryStats {
|
||||||
|
total_bytes: total,
|
||||||
|
used_bytes: used,
|
||||||
|
percent_used: mem_pct,
|
||||||
|
};
|
||||||
|
|
||||||
|
let cpu = CpuStats {
|
||||||
|
percent_used: sys.global_cpu_usage() as f64,
|
||||||
|
};
|
||||||
|
(memory, cpu)
|
||||||
|
}
|
||||||
128
backend/src/api/admin/users.rs
Normal file
128
backend/src/api/admin/users.rs
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
//! Admin user management: list, delete, promote/demote.
|
||||||
|
//!
|
||||||
|
//! All handlers are gated by `RequireAdmin` and rely on
|
||||||
|
//! `repo::user::admin_safe_*` for self-protection and the last-admin
|
||||||
|
//! invariant. Audit rows are written inside the same DB transaction as
|
||||||
|
//! the action they record.
|
||||||
|
|
||||||
|
use axum::extract::{Path, Query, State};
|
||||||
|
use axum::http::StatusCode;
|
||||||
|
use axum::routing::{delete, get};
|
||||||
|
use axum::{Json, Router};
|
||||||
|
use serde::Deserialize;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::api::auth::{validate_password, validate_username};
|
||||||
|
use crate::api::pagination::PagedResponse;
|
||||||
|
use crate::app::AppState;
|
||||||
|
use crate::auth::extractor::RequireAdmin;
|
||||||
|
use crate::auth::password::hash_password;
|
||||||
|
use crate::domain::User;
|
||||||
|
use crate::error::{AppError, AppResult};
|
||||||
|
use crate::repo;
|
||||||
|
|
||||||
|
pub fn routes() -> Router<AppState> {
|
||||||
|
Router::new()
|
||||||
|
.route("/admin/users", get(list_users).post(create_user))
|
||||||
|
.route(
|
||||||
|
"/admin/users/:id",
|
||||||
|
delete(delete_user).patch(update_user),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize, Default)]
|
||||||
|
pub struct ListUsersParams {
|
||||||
|
#[serde(default)]
|
||||||
|
pub search: Option<String>,
|
||||||
|
#[serde(default = "default_limit")]
|
||||||
|
pub limit: i64,
|
||||||
|
#[serde(default)]
|
||||||
|
pub offset: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Serde fallback for the user listing's `limit` parameter: 50 rows.
fn default_limit() -> i64 {
    const DEFAULT_PAGE_SIZE: i64 = 50;
    DEFAULT_PAGE_SIZE
}
|
||||||
|
|
||||||
|
async fn list_users(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
_admin: RequireAdmin,
|
||||||
|
Query(params): Query<ListUsersParams>,
|
||||||
|
) -> AppResult<Json<PagedResponse<User>>> {
|
||||||
|
let limit = params.limit.clamp(1, 200);
|
||||||
|
let offset = params.offset.max(0);
|
||||||
|
let (items, total) = repo::user::list_with_total(
|
||||||
|
&state.db,
|
||||||
|
&repo::user::ListUsersQuery {
|
||||||
|
search: params.search.filter(|s| !s.trim().is_empty()),
|
||||||
|
limit,
|
||||||
|
offset,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
Ok(Json(PagedResponse::with_total(items, limit, offset, total)))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct UpdateUserInput {
|
||||||
|
pub is_admin: Option<bool>,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn update_user(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
RequireAdmin(actor): RequireAdmin,
|
||||||
|
Path(id): Path<Uuid>,
|
||||||
|
Json(input): Json<UpdateUserInput>,
|
||||||
|
) -> AppResult<Json<User>> {
|
||||||
|
let Some(is_admin) = input.is_admin else {
|
||||||
|
return Err(AppError::InvalidInput(
|
||||||
|
"no updatable fields supplied".into(),
|
||||||
|
));
|
||||||
|
};
|
||||||
|
let updated =
|
||||||
|
repo::user::admin_safe_set_is_admin(&state.db, actor.id, id, is_admin).await?;
|
||||||
|
Ok(Json(updated))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn delete_user(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
RequireAdmin(actor): RequireAdmin,
|
||||||
|
Path(id): Path<Uuid>,
|
||||||
|
) -> AppResult<StatusCode> {
|
||||||
|
repo::user::admin_safe_delete(&state.db, actor.id, id).await?;
|
||||||
|
Ok(StatusCode::NO_CONTENT)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct CreateUserInput {
|
||||||
|
pub username: String,
|
||||||
|
pub password: String,
|
||||||
|
/// Defaults to false; admins may mint other admins in a single
|
||||||
|
/// call. Doing it as one POST avoids a second audit row for the
|
||||||
|
/// common "invite a co-admin" flow.
|
||||||
|
#[serde(default)]
|
||||||
|
pub is_admin: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn create_user(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
RequireAdmin(actor): RequireAdmin,
|
||||||
|
Json(input): Json<CreateUserInput>,
|
||||||
|
) -> AppResult<(StatusCode, Json<User>)> {
|
||||||
|
let username = input.username.trim();
|
||||||
|
// Reuse the canonical self-register validators so the admin-create
|
||||||
|
// path can never produce a username that self-register would
|
||||||
|
// reject (and vice versa).
|
||||||
|
validate_username(username)?;
|
||||||
|
validate_password(&input.password)?;
|
||||||
|
let pwhash = hash_password(&input.password)?;
|
||||||
|
let user = repo::user::admin_create_user(
|
||||||
|
&state.db,
|
||||||
|
actor.id,
|
||||||
|
username,
|
||||||
|
&pwhash,
|
||||||
|
input.is_admin,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
Ok((StatusCode::CREATED, Json(user)))
|
||||||
|
}
|
||||||
@@ -4,6 +4,8 @@
|
|||||||
//! expire naturally rather than being explicitly invalidated, so other
|
//! expire naturally rather than being explicitly invalidated, so other
|
||||||
//! devices keep their existing logins).
|
//! devices keep their existing logins).
|
||||||
|
|
||||||
|
use std::sync::OnceLock;
|
||||||
|
|
||||||
use axum::extract::{Path, State};
|
use axum::extract::{Path, State};
|
||||||
use axum::http::StatusCode;
|
use axum::http::StatusCode;
|
||||||
use axum::response::IntoResponse;
|
use axum::response::IntoResponse;
|
||||||
@@ -26,6 +28,7 @@ use crate::repo;
|
|||||||
|
|
||||||
pub fn routes() -> Router<AppState> {
|
pub fn routes() -> Router<AppState> {
|
||||||
Router::new()
|
Router::new()
|
||||||
|
.route("/auth/config", get(auth_config))
|
||||||
.route("/auth/register", post(register))
|
.route("/auth/register", post(register))
|
||||||
.route("/auth/login", post(login))
|
.route("/auth/login", post(login))
|
||||||
.route("/auth/logout", post(logout))
|
.route("/auth/logout", post(logout))
|
||||||
@@ -39,6 +42,25 @@ pub fn routes() -> Router<AppState> {
|
|||||||
.route("/auth/tokens/:id", delete(delete_token))
|
.route("/auth/tokens/:id", delete(delete_token))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Public, unauthenticated. Exposes anonymous-relevant auth policy so
|
||||||
|
/// the frontend can render its login / register affordances correctly
|
||||||
|
/// without a probe request that would conflate "disabled" with
|
||||||
|
/// "rate-limited". `self_register_enabled` is the *effective* value
|
||||||
|
/// (`allow_self_register && !private_mode`), so a private-mode
|
||||||
|
/// instance reports `false` even if the raw flag is on.
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
pub struct AuthConfigResponse {
|
||||||
|
pub self_register_enabled: bool,
|
||||||
|
pub private_mode: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn auth_config(State(state): State<AppState>) -> Json<AuthConfigResponse> {
|
||||||
|
Json(AuthConfigResponse {
|
||||||
|
self_register_enabled: state.auth.allow_self_register && !state.auth.private_mode,
|
||||||
|
private_mode: state.auth.private_mode,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
pub struct Credentials {
|
pub struct Credentials {
|
||||||
pub username: String,
|
pub username: String,
|
||||||
@@ -80,6 +102,17 @@ async fn register(
|
|||||||
jar: CookieJar,
|
jar: CookieJar,
|
||||||
Json(input): Json<Credentials>,
|
Json(input): Json<Credentials>,
|
||||||
) -> AppResult<impl IntoResponse> {
|
) -> AppResult<impl IntoResponse> {
|
||||||
|
// Rate limit before the disabled check so an operator who flips
|
||||||
|
// the toggle can't be probed for the toggle state via timing —
|
||||||
|
// disabled and enabled paths both consume a token, and disabled
|
||||||
|
// returns 403 instead of running argon2.
|
||||||
|
check_auth_rate_limit(&state, "register")?;
|
||||||
|
// Private mode force-blocks self-registration regardless of
|
||||||
|
// ALLOW_SELF_REGISTER — operators of locked-down instances mint
|
||||||
|
// accounts via `POST /admin/users` instead.
|
||||||
|
if !state.auth.allow_self_register || state.auth.private_mode {
|
||||||
|
return Err(AppError::Forbidden);
|
||||||
|
}
|
||||||
let username = input.username.trim();
|
let username = input.username.trim();
|
||||||
validate_username(username)?;
|
validate_username(username)?;
|
||||||
validate_password(&input.password)?;
|
validate_password(&input.password)?;
|
||||||
@@ -95,6 +128,7 @@ async fn login(
|
|||||||
jar: CookieJar,
|
jar: CookieJar,
|
||||||
Json(input): Json<Credentials>,
|
Json(input): Json<Credentials>,
|
||||||
) -> AppResult<impl IntoResponse> {
|
) -> AppResult<impl IntoResponse> {
|
||||||
|
check_auth_rate_limit(&state, "login")?;
|
||||||
let username = input.username.trim();
|
let username = input.username.trim();
|
||||||
if username.is_empty() || input.password.is_empty() {
|
if username.is_empty() || input.password.is_empty() {
|
||||||
return Err(AppError::InvalidInput(
|
return Err(AppError::InvalidInput(
|
||||||
@@ -102,9 +136,15 @@ async fn login(
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
let user = repo::user::find_by_username(&state.db, username)
|
let user = repo::user::find_by_username(&state.db, username).await?;
|
||||||
.await?
|
let Some(user) = user else {
|
||||||
.ok_or(AppError::Unauthenticated)?;
|
// No such user. Run argon2 against a stable dummy hash so the
|
||||||
|
// response time matches the wrong-password branch — otherwise
|
||||||
|
// an attacker can enumerate usernames by timing the no-user
|
||||||
|
// 401 against the wrong-password 401.
|
||||||
|
let _ = verify_password(&input.password, dummy_password_hash());
|
||||||
|
return Err(AppError::Unauthenticated);
|
||||||
|
};
|
||||||
if !verify_password(&input.password, &user.password_hash) {
|
if !verify_password(&input.password, &user.password_hash) {
|
||||||
return Err(AppError::Unauthenticated);
|
return Err(AppError::Unauthenticated);
|
||||||
}
|
}
|
||||||
@@ -113,6 +153,21 @@ async fn login(
|
|||||||
Ok((StatusCode::OK, jar, Json(AuthResponse { user })))
|
Ok((StatusCode::OK, jar, Json(AuthResponse { user })))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Lazily-computed argon2 hash used to equalise login response time
|
||||||
|
/// across the "no such user" and "wrong password" branches. Computing
|
||||||
|
/// it once (on the first login of the process) is enough — the hash is
|
||||||
|
/// never compared against a real password, only used to force argon2
|
||||||
|
/// to do the same amount of work it would for a real verify.
|
||||||
|
fn dummy_password_hash() -> &'static str {
|
||||||
|
static DUMMY: OnceLock<String> = OnceLock::new();
|
||||||
|
DUMMY
|
||||||
|
.get_or_init(|| {
|
||||||
|
crate::auth::password::hash_password("login-timing-equaliser")
|
||||||
|
.expect("hash_password on a fixed input cannot fail")
|
||||||
|
})
|
||||||
|
.as_str()
|
||||||
|
}
|
||||||
|
|
||||||
async fn logout(
|
async fn logout(
|
||||||
State(state): State<AppState>,
|
State(state): State<AppState>,
|
||||||
jar: CookieJar,
|
jar: CookieJar,
|
||||||
@@ -149,6 +204,7 @@ async fn change_password(
|
|||||||
jar: CookieJar,
|
jar: CookieJar,
|
||||||
Json(input): Json<ChangePassword>,
|
Json(input): Json<ChangePassword>,
|
||||||
) -> AppResult<impl IntoResponse> {
|
) -> AppResult<impl IntoResponse> {
|
||||||
|
check_auth_rate_limit(&state, "change_password")?;
|
||||||
if !verify_password(&input.current_password, &user.password_hash) {
|
if !verify_password(&input.current_password, &user.password_hash) {
|
||||||
return Err(AppError::Unauthenticated);
|
return Err(AppError::Unauthenticated);
|
||||||
}
|
}
|
||||||
@@ -230,8 +286,24 @@ async fn create_token(
|
|||||||
Json(input): Json<CreateTokenInput>,
|
Json(input): Json<CreateTokenInput>,
|
||||||
) -> AppResult<impl IntoResponse> {
|
) -> AppResult<impl IntoResponse> {
|
||||||
let name = input.name.trim();
|
let name = input.name.trim();
|
||||||
|
// Both arms use `ValidationFailed` (422 with field details) to
|
||||||
|
// match the structured-error shape `attach_tag` returns for the
|
||||||
|
// same kind of free-form-identifier validation. The other
|
||||||
|
// /auth/* handlers in this file use `InvalidInput` (400); the
|
||||||
|
// divergence is pre-existing and would warrant a project-wide
|
||||||
|
// pass to flip them all if the client side wants uniform per-
|
||||||
|
// field error rendering.
|
||||||
if name.is_empty() {
|
if name.is_empty() {
|
||||||
return Err(AppError::InvalidInput("token name is required".into()));
|
return Err(AppError::ValidationFailed {
|
||||||
|
message: "token name is required".into(),
|
||||||
|
details: serde_json::json!({ "name": "required" }),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if name.chars().count() > 64 {
|
||||||
|
return Err(AppError::ValidationFailed {
|
||||||
|
message: "token name too long".into(),
|
||||||
|
details: serde_json::json!({ "name": "max 64 characters" }),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
let (raw, hash) = generate_token();
|
let (raw, hash) = generate_token();
|
||||||
let token = repo::api_token::create(&state.db, user.id, name, &hash).await?;
|
let token = repo::api_token::create(&state.db, user.id, name, &hash).await?;
|
||||||
@@ -267,6 +339,18 @@ async fn start_session(
|
|||||||
Ok(jar.add(build_session_cookie(raw, &state.auth)))
|
Ok(jar.add(build_session_cookie(raw, &state.auth)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CSRF posture: `SameSite=Lax` is the project's primary CSRF defense.
|
||||||
|
// Browsers refuse to attach this cookie to cross-site POST / PATCH /
|
||||||
|
// DELETE requests, which covers every state-changing endpoint (auth
|
||||||
|
// mutations, uploads, bookmarks, collections, admin user management,
|
||||||
|
// etc. — all JSON over POST/PATCH/DELETE). Lax DOES still attach the
|
||||||
|
// cookie on top-level cross-site GETs, so this defense breaks the
|
||||||
|
// instant anyone adds a state-changing GET. If you reach for one,
|
||||||
|
// switch to `SameSite=Strict` here AND add an explicit CSRF-token
|
||||||
|
// check on the new endpoint. The Bearer-token branch in the
|
||||||
|
// extractor is unaffected (bots authenticate with the token header,
|
||||||
|
// not the cookie) and admin routes reject Bearer entirely — see
|
||||||
|
// `auth::extractor::RequireAdmin`.
|
||||||
fn build_session_cookie(raw: String, cfg: &AuthConfig) -> Cookie<'static> {
|
fn build_session_cookie(raw: String, cfg: &AuthConfig) -> Cookie<'static> {
|
||||||
let mut builder = Cookie::build((SESSION_COOKIE_NAME, raw))
|
let mut builder = Cookie::build((SESSION_COOKIE_NAME, raw))
|
||||||
.http_only(true)
|
.http_only(true)
|
||||||
@@ -293,7 +377,38 @@ fn build_expired_cookie(cfg: &AuthConfig) -> Cookie<'static> {
|
|||||||
builder.build()
|
builder.build()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn validate_username(u: &str) -> AppResult<()> {
|
/// Consume one token from the shared auth rate limiter. Called at the
|
||||||
|
/// start of `register`, `login`, and `change_password` so credential
|
||||||
|
/// stuffing / spraying / username-probe loops are throttled by the
|
||||||
|
/// configured budget (default 5/sec with a 10-request burst).
|
||||||
|
///
|
||||||
|
/// All three endpoints share one bucket — they all expose the same
|
||||||
|
/// argon2-verify-or-create work and the same enumeration channels, so
|
||||||
|
/// any one of them in a tight loop should trip the limit. `endpoint`
|
||||||
|
/// is included in the rate-limit-hit log line so operators can tell
|
||||||
|
/// which endpoint is being probed.
|
||||||
|
fn check_auth_rate_limit(state: &AppState, endpoint: &'static str) -> AppResult<()> {
|
||||||
|
use crate::auth::rate_limit::AcquireResult;
|
||||||
|
match state.auth_limiter.try_acquire() {
|
||||||
|
AcquireResult::Allowed => Ok(()),
|
||||||
|
AcquireResult::Denied { retry_after_secs } => {
|
||||||
|
tracing::warn!(
|
||||||
|
endpoint,
|
||||||
|
retry_after_secs,
|
||||||
|
"auth rate limit hit; returning 429"
|
||||||
|
);
|
||||||
|
Err(AppError::TooManyRequests {
|
||||||
|
retry_after_secs: Some(retry_after_secs),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Exposed pub(crate) so the admin user-create handler can apply the
|
||||||
|
// same rules as self-registration. Keeping the lone canonical
|
||||||
|
// implementation here avoids the two paths drifting on min length /
|
||||||
|
// allowed character set.
|
||||||
|
pub(crate) fn validate_username(u: &str) -> AppResult<()> {
|
||||||
if u.is_empty() {
|
if u.is_empty() {
|
||||||
return Err(AppError::InvalidInput("username is required".into()));
|
return Err(AppError::InvalidInput("username is required".into()));
|
||||||
}
|
}
|
||||||
@@ -310,7 +425,7 @@ fn validate_username(u: &str) -> AppResult<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn validate_password(p: &str) -> AppResult<()> {
|
pub(crate) fn validate_password(p: &str) -> AppResult<()> {
|
||||||
if p.len() < 8 {
|
if p.len() < 8 {
|
||||||
return Err(AppError::InvalidInput(
|
return Err(AppError::InvalidInput(
|
||||||
"password must be at least 8 characters".into(),
|
"password must be at least 8 characters".into(),
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ use uuid::Uuid;
|
|||||||
use crate::api::pagination::PagedResponse;
|
use crate::api::pagination::PagedResponse;
|
||||||
use crate::app::AppState;
|
use crate::app::AppState;
|
||||||
use crate::auth::extractor::CurrentUser;
|
use crate::auth::extractor::CurrentUser;
|
||||||
|
use crate::crawler::pipeline;
|
||||||
use crate::domain::{Bookmark, BookmarkSummary};
|
use crate::domain::{Bookmark, BookmarkSummary};
|
||||||
use crate::error::{AppError, AppResult};
|
use crate::error::{AppError, AppResult};
|
||||||
use crate::repo;
|
use crate::repo;
|
||||||
@@ -66,14 +67,7 @@ async fn create(
|
|||||||
// the foreign-key violation collapse into a generic 500.
|
// the foreign-key violation collapse into a generic 500.
|
||||||
repo::manga::get(&state.db, input.manga_id).await?;
|
repo::manga::get(&state.db, input.manga_id).await?;
|
||||||
if let Some(chapter_id) = input.chapter_id {
|
if let Some(chapter_id) = input.chapter_id {
|
||||||
let exists: Option<(Uuid,)> = sqlx::query_as(
|
if !repo::chapter::belongs_to_manga(&state.db, chapter_id, input.manga_id).await? {
|
||||||
"SELECT id FROM chapters WHERE id = $1 AND manga_id = $2",
|
|
||||||
)
|
|
||||||
.bind(chapter_id)
|
|
||||||
.bind(input.manga_id)
|
|
||||||
.fetch_optional(&state.db)
|
|
||||||
.await?;
|
|
||||||
if exists.is_none() {
|
|
||||||
return Err(AppError::NotFound);
|
return Err(AppError::NotFound);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -86,6 +80,29 @@ async fn create(
|
|||||||
input.page,
|
input.page,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
// Fire-and-forget: kick off content syncs for any pending chapters of
|
||||||
|
// the newly-bookmarked manga. The dedup index makes this idempotent
|
||||||
|
// across repeated bookmarks of the same manga; failure here must not
|
||||||
|
// surface to the user (the daily cron sweeps anything missed).
|
||||||
|
let pool = state.db.clone();
|
||||||
|
let manga_id = input.manga_id;
|
||||||
|
tokio::spawn(async move {
|
||||||
|
match pipeline::enqueue_pending_for_manga(&pool, manga_id).await {
|
||||||
|
Ok(summary) => tracing::info!(
|
||||||
|
%manga_id,
|
||||||
|
inserted = summary.inserted,
|
||||||
|
skipped = summary.skipped,
|
||||||
|
failed = summary.failed,
|
||||||
|
"bookmark hook: enqueued pending chapters"
|
||||||
|
),
|
||||||
|
Err(e) => tracing::warn!(
|
||||||
|
%manga_id, error = ?e,
|
||||||
|
"bookmark hook: enqueue_pending_for_manga failed"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
Ok((StatusCode::CREATED, Json(bookmark)))
|
Ok((StatusCode::CREATED, Json(bookmark)))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -111,6 +128,7 @@ async fn list_me(
|
|||||||
) -> AppResult<Json<PagedResponse<BookmarkSummary>>> {
|
) -> AppResult<Json<PagedResponse<BookmarkSummary>>> {
|
||||||
let limit = params.limit.clamp(1, 200);
|
let limit = params.limit.clamp(1, 200);
|
||||||
let offset = params.offset.max(0);
|
let offset = params.offset.max(0);
|
||||||
let items = repo::bookmark::list_for_user(&state.db, user.id, limit, offset).await?;
|
let (items, total) =
|
||||||
Ok(Json(PagedResponse::new(items, limit, offset)))
|
repo::bookmark::list_for_user(&state.db, user.id, limit, offset).await?;
|
||||||
|
Ok(Json(PagedResponse::with_total(items, limit, offset, total)))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,9 +26,9 @@ use crate::upload::{parse_image, UploadedImage};
|
|||||||
pub fn routes() -> Router<AppState> {
|
pub fn routes() -> Router<AppState> {
|
||||||
Router::new()
|
Router::new()
|
||||||
.route("/mangas/:manga_id/chapters", get(list).post(create))
|
.route("/mangas/:manga_id/chapters", get(list).post(create))
|
||||||
.route("/mangas/:manga_id/chapters/:number", get(get_one))
|
.route("/mangas/:manga_id/chapters/:chapter_id", get(get_one))
|
||||||
.route(
|
.route(
|
||||||
"/mangas/:manga_id/chapters/:number/pages",
|
"/mangas/:manga_id/chapters/:chapter_id/pages",
|
||||||
get(list_pages),
|
get(list_pages),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -60,10 +60,10 @@ async fn list(
|
|||||||
|
|
||||||
async fn get_one(
|
async fn get_one(
|
||||||
State(state): State<AppState>,
|
State(state): State<AppState>,
|
||||||
Path((manga_id, number)): Path<(Uuid, i32)>,
|
Path((manga_id, chapter_id)): Path<(Uuid, Uuid)>,
|
||||||
) -> AppResult<Json<Chapter>> {
|
) -> AppResult<Json<Chapter>> {
|
||||||
repo::manga::get(&state.db, manga_id).await?;
|
repo::manga::get(&state.db, manga_id).await?;
|
||||||
let chapter = repo::chapter::find_by_manga_and_number(&state.db, manga_id, number)
|
let chapter = repo::chapter::find_by_id_in_manga(&state.db, manga_id, chapter_id)
|
||||||
.await?
|
.await?
|
||||||
.ok_or(AppError::NotFound)?;
|
.ok_or(AppError::NotFound)?;
|
||||||
Ok(Json(chapter))
|
Ok(Json(chapter))
|
||||||
@@ -71,7 +71,7 @@ async fn get_one(
|
|||||||
|
|
||||||
async fn create(
|
async fn create(
|
||||||
State(state): State<AppState>,
|
State(state): State<AppState>,
|
||||||
CurrentUser(_user): CurrentUser,
|
CurrentUser(user): CurrentUser,
|
||||||
Path(manga_id): Path<Uuid>,
|
Path(manga_id): Path<Uuid>,
|
||||||
mut multipart: Multipart,
|
mut multipart: Multipart,
|
||||||
) -> AppResult<(StatusCode, Json<Chapter>)> {
|
) -> AppResult<(StatusCode, Json<Chapter>)> {
|
||||||
@@ -133,6 +133,7 @@ async fn create(
|
|||||||
manga_id,
|
manga_id,
|
||||||
metadata.number,
|
metadata.number,
|
||||||
metadata.title.as_deref(),
|
metadata.title.as_deref(),
|
||||||
|
Some(user.id),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
@@ -163,10 +164,10 @@ struct PagesResponse {
|
|||||||
|
|
||||||
async fn list_pages(
|
async fn list_pages(
|
||||||
State(state): State<AppState>,
|
State(state): State<AppState>,
|
||||||
Path((manga_id, number)): Path<(Uuid, i32)>,
|
Path((manga_id, chapter_id)): Path<(Uuid, Uuid)>,
|
||||||
) -> AppResult<Json<PagesResponse>> {
|
) -> AppResult<Json<PagesResponse>> {
|
||||||
repo::manga::get(&state.db, manga_id).await?;
|
repo::manga::get(&state.db, manga_id).await?;
|
||||||
let chapter = repo::chapter::find_by_manga_and_number(&state.db, manga_id, number)
|
let chapter = repo::chapter::find_by_id_in_manga(&state.db, manga_id, chapter_id)
|
||||||
.await?
|
.await?
|
||||||
.ok_or(AppError::NotFound)?;
|
.ok_or(AppError::NotFound)?;
|
||||||
let pages = repo::page::list_for_chapter(&state.db, chapter.id).await?;
|
let pages = repo::page::list_for_chapter(&state.db, chapter.id).await?;
|
||||||
|
|||||||
247
backend/src/api/collections.rs
Normal file
247
backend/src/api/collections.rs
Normal file
@@ -0,0 +1,247 @@
|
|||||||
|
use axum::extract::{Path, Query, State};
|
||||||
|
use axum::http::StatusCode;
|
||||||
|
use axum::routing::{delete, get, post};
|
||||||
|
use axum::{Json, Router};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use serde_json::json;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::api::pagination::PagedResponse;
|
||||||
|
use crate::app::AppState;
|
||||||
|
use crate::auth::extractor::CurrentUser;
|
||||||
|
use crate::domain::collection::{
|
||||||
|
Collection, CollectionPatch, CollectionSummary, NewCollection,
|
||||||
|
};
|
||||||
|
use crate::domain::manga::Manga;
|
||||||
|
use crate::domain::patch::Patch;
|
||||||
|
use crate::error::{AppError, AppResult};
|
||||||
|
use crate::repo;
|
||||||
|
|
||||||
|
pub fn routes() -> Router<AppState> {
|
||||||
|
Router::new()
|
||||||
|
.route("/collections", post(create))
|
||||||
|
.route("/me/collections", get(list_mine))
|
||||||
|
.route("/collections/:id", get(get_one).patch(update).delete(delete_one))
|
||||||
|
.route("/collections/:id/mangas", get(list_mangas).post(add_manga))
|
||||||
|
.route(
|
||||||
|
"/collections/:id/mangas/:manga_id",
|
||||||
|
delete(remove_manga),
|
||||||
|
)
|
||||||
|
.route(
|
||||||
|
"/mangas/:id/my-collections",
|
||||||
|
get(list_my_collections_containing),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Upper bound, in Unicode scalar values, on a trimmed collection name.
const MAX_NAME_LEN: usize = 64;

/// Upper bound, in Unicode scalar values, on a collection description.
const MAX_DESCRIPTION_LEN: usize = 1024;

/// Page size applied when a list request carries no `limit`.
const DEFAULT_LIMIT: i64 = 50;
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct ListParams {
|
||||||
|
#[serde(default = "default_limit")]
|
||||||
|
pub limit: i64,
|
||||||
|
#[serde(default)]
|
||||||
|
pub offset: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_limit() -> i64 {
|
||||||
|
DEFAULT_LIMIT
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct AddMangaBody {
|
||||||
|
pub manga_id: Uuid,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
pub struct MangaCollectionIds {
|
||||||
|
pub collection_ids: Vec<Uuid>,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn validate_name(name: &str) -> AppResult<()> {
|
||||||
|
let trimmed = name.trim();
|
||||||
|
if trimmed.is_empty() {
|
||||||
|
return Err(AppError::ValidationFailed {
|
||||||
|
message: "name is required".into(),
|
||||||
|
details: json!({ "name": "required" }),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if trimmed.chars().count() > MAX_NAME_LEN {
|
||||||
|
return Err(AppError::ValidationFailed {
|
||||||
|
message: "name too long".into(),
|
||||||
|
details: json!({ "name": format!("max {MAX_NAME_LEN} characters") }),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn validate_description(desc: Option<&str>) -> AppResult<()> {
|
||||||
|
if let Some(d) = desc {
|
||||||
|
if d.chars().count() > MAX_DESCRIPTION_LEN {
|
||||||
|
return Err(AppError::ValidationFailed {
|
||||||
|
message: "description too long".into(),
|
||||||
|
details: json!({ "description": format!("max {MAX_DESCRIPTION_LEN} characters") }),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn create(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Json(input): Json<NewCollection>,
|
||||||
|
) -> AppResult<(StatusCode, Json<Collection>)> {
|
||||||
|
validate_name(&input.name)?;
|
||||||
|
validate_description(input.description.as_deref())?;
|
||||||
|
let row = repo::collection::create(
|
||||||
|
&state.db,
|
||||||
|
user.id,
|
||||||
|
&input.name,
|
||||||
|
input.description.as_deref(),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
Ok((StatusCode::CREATED, Json(row)))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn list_mine(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Query(params): Query<ListParams>,
|
||||||
|
) -> AppResult<Json<PagedResponse<CollectionSummary>>> {
|
||||||
|
let limit = params.limit.clamp(1, 200);
|
||||||
|
let offset = params.offset.max(0);
|
||||||
|
let (items, total) =
|
||||||
|
repo::collection::list_for_user(&state.db, user.id, limit, offset).await?;
|
||||||
|
Ok(Json(PagedResponse::with_total(items, limit, offset, total)))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_one(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Path(id): Path<Uuid>,
|
||||||
|
) -> AppResult<Json<Collection>> {
|
||||||
|
let row = require_owner(&state, user.id, id).await?;
|
||||||
|
Ok(Json(row))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn update(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Path(id): Path<Uuid>,
|
||||||
|
Json(patch): Json<CollectionPatch>,
|
||||||
|
) -> AppResult<Json<Collection>> {
|
||||||
|
require_owner_id(&state, user.id, id).await?;
|
||||||
|
if let Some(ref n) = patch.name {
|
||||||
|
validate_name(n)?;
|
||||||
|
}
|
||||||
|
if let Patch::Set(ref d) = patch.description {
|
||||||
|
validate_description(Some(d.as_str()))?;
|
||||||
|
}
|
||||||
|
// Three-state semantics via `Patch<T>`: omitted → Unchanged
|
||||||
|
// (column untouched), explicit `null` → Clear (NULL), value → Set.
|
||||||
|
let description_provided = patch.description.is_provided();
|
||||||
|
let description_value: Option<&str> = match &patch.description {
|
||||||
|
Patch::Set(s) => Some(s.as_str()),
|
||||||
|
Patch::Clear | Patch::Unchanged => None,
|
||||||
|
};
|
||||||
|
let updated = repo::collection::update(
|
||||||
|
&state.db,
|
||||||
|
id,
|
||||||
|
patch.name.as_deref(),
|
||||||
|
description_provided,
|
||||||
|
description_value,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
Ok(Json(updated))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn delete_one(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Path(id): Path<Uuid>,
|
||||||
|
) -> AppResult<StatusCode> {
|
||||||
|
require_owner_id(&state, user.id, id).await?;
|
||||||
|
repo::collection::delete(&state.db, id).await?;
|
||||||
|
Ok(StatusCode::NO_CONTENT)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn list_mangas(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Path(id): Path<Uuid>,
|
||||||
|
Query(params): Query<ListParams>,
|
||||||
|
) -> AppResult<Json<PagedResponse<Manga>>> {
|
||||||
|
require_owner_id(&state, user.id, id).await?;
|
||||||
|
let limit = params.limit.clamp(1, 200);
|
||||||
|
let offset = params.offset.max(0);
|
||||||
|
let (items, total) =
|
||||||
|
repo::collection::list_mangas(&state.db, id, limit, offset).await?;
|
||||||
|
Ok(Json(PagedResponse::with_total(items, limit, offset, total)))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn add_manga(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Path(id): Path<Uuid>,
|
||||||
|
Json(body): Json<AddMangaBody>,
|
||||||
|
) -> AppResult<StatusCode> {
|
||||||
|
require_owner_id(&state, user.id, id).await?;
|
||||||
|
if !repo::manga::exists(&state.db, body.manga_id).await? {
|
||||||
|
return Err(AppError::NotFound);
|
||||||
|
}
|
||||||
|
let created = repo::collection::add_manga(&state.db, id, body.manga_id).await?;
|
||||||
|
Ok(if created { StatusCode::CREATED } else { StatusCode::OK })
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn remove_manga(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Path((collection_id, manga_id)): Path<(Uuid, Uuid)>,
|
||||||
|
) -> AppResult<StatusCode> {
|
||||||
|
require_owner_id(&state, user.id, collection_id).await?;
|
||||||
|
repo::collection::remove_manga(&state.db, collection_id, manga_id).await?;
|
||||||
|
Ok(StatusCode::NO_CONTENT)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn list_my_collections_containing(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Path(manga_id): Path<Uuid>,
|
||||||
|
) -> AppResult<Json<MangaCollectionIds>> {
|
||||||
|
// No 404 if the manga doesn't exist — the empty list is the
|
||||||
|
// correct answer ("you have it in zero of your collections") and
|
||||||
|
// keeps the request side-effect-free.
|
||||||
|
let ids =
|
||||||
|
repo::collection::list_collections_containing(&state.db, user.id, manga_id).await?;
|
||||||
|
Ok(Json(MangaCollectionIds { collection_ids: ids }))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the row iff the caller owns it. Both "doesn't exist" and
|
||||||
|
/// "exists but belongs to someone else" surface as `NotFound` so the
|
||||||
|
/// API doesn't disclose collection existence to non-owners — the
|
||||||
|
/// frontend already does this funnelling for URLs, and consistency at
|
||||||
|
/// the API matters because the same identifiers travel through bots
|
||||||
|
/// and shared links.
|
||||||
|
async fn require_owner(
|
||||||
|
state: &AppState,
|
||||||
|
user_id: Uuid,
|
||||||
|
id: Uuid,
|
||||||
|
) -> AppResult<Collection> {
|
||||||
|
match repo::collection::get(&state.db, id).await {
|
||||||
|
Ok(row) if row.user_id == user_id => Ok(row),
|
||||||
|
// Either the row doesn't exist (NotFound from `get`) or it
|
||||||
|
// belongs to someone else — both collapse to NotFound.
|
||||||
|
Ok(_) | Err(AppError::NotFound) => Err(AppError::NotFound),
|
||||||
|
Err(other) => Err(other),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn require_owner_id(state: &AppState, user_id: Uuid, id: Uuid) -> AppResult<()> {
|
||||||
|
match repo::collection::find_owner(&state.db, id).await? {
|
||||||
|
Some(owner) if owner == user_id => Ok(()),
|
||||||
|
// Same non-leakage rationale as `require_owner` above.
|
||||||
|
_ => Err(AppError::NotFound),
|
||||||
|
}
|
||||||
|
}
|
||||||
145
backend/src/api/history.rs
Normal file
145
backend/src/api/history.rs
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
//! Reading-progress and upload-history endpoints (Phase 5).
|
||||||
|
//!
|
||||||
|
//! All routes live under `/me/...` and require `CurrentUser`. They
|
||||||
|
//! never expose another user's data — the user id is taken from the
|
||||||
|
//! auth extractor, not from the path or body.
|
||||||
|
|
||||||
|
use axum::extract::{Path, Query, State};
|
||||||
|
use axum::http::StatusCode;
|
||||||
|
use axum::routing::{get, put};
|
||||||
|
use axum::{Json, Router};
|
||||||
|
use serde::Deserialize;
|
||||||
|
use serde_json::json;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::api::pagination::PagedResponse;
|
||||||
|
use crate::app::AppState;
|
||||||
|
use crate::auth::extractor::CurrentUser;
|
||||||
|
use crate::domain::read_progress::{
|
||||||
|
ReadProgress, ReadProgressForManga, ReadProgressSummary, UpsertReadProgress,
|
||||||
|
};
|
||||||
|
use crate::domain::upload_entry::UploadEntry;
|
||||||
|
use crate::error::{AppError, AppResult};
|
||||||
|
use crate::repo;
|
||||||
|
|
||||||
|
pub fn routes() -> Router<AppState> {
|
||||||
|
Router::new()
|
||||||
|
.route("/me/read-progress", put(upsert).get(list))
|
||||||
|
.route(
|
||||||
|
"/me/read-progress/:manga_id",
|
||||||
|
get(get_one).delete(delete_one),
|
||||||
|
)
|
||||||
|
.route("/me/uploads", get(uploads))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct ListParams {
|
||||||
|
#[serde(default = "default_limit")]
|
||||||
|
pub limit: i64,
|
||||||
|
#[serde(default)]
|
||||||
|
pub offset: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Serde default for `ListParams::limit`.
fn default_limit() -> i64 {
    const FALLBACK_PAGE_SIZE: i64 = 50;
    FALLBACK_PAGE_SIZE
}
|
||||||
|
|
||||||
|
async fn upsert(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Json(input): Json<UpsertReadProgress>,
|
||||||
|
) -> AppResult<Json<ReadProgress>> {
|
||||||
|
let page = input.page.unwrap_or(1);
|
||||||
|
if page < 1 {
|
||||||
|
return Err(AppError::ValidationFailed {
|
||||||
|
message: "page must be 1 or greater".into(),
|
||||||
|
details: json!({ "page": "must be >= 1" }),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
// Cross-link guard: the FKs on read_progress accept any valid
|
||||||
|
// (manga_id, chapter_id), even when they refer to unrelated mangas.
|
||||||
|
// Reject mismatched pairs so history can't end up rendering a
|
||||||
|
// chapter number from the wrong manga.
|
||||||
|
if let Some(chapter_id) = input.chapter_id {
|
||||||
|
let belongs = repo::read_progress::chapter_belongs_to_manga(
|
||||||
|
&state.db,
|
||||||
|
input.manga_id,
|
||||||
|
chapter_id,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
if !belongs {
|
||||||
|
return Err(AppError::ValidationFailed {
|
||||||
|
message: "chapter does not belong to this manga".into(),
|
||||||
|
details: json!({ "chapter_id": "must reference a chapter of the supplied manga" }),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let row = repo::read_progress::upsert(
|
||||||
|
&state.db,
|
||||||
|
user.id,
|
||||||
|
input.manga_id,
|
||||||
|
input.chapter_id,
|
||||||
|
page,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
Ok(Json(row))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn list(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Query(params): Query<ListParams>,
|
||||||
|
) -> AppResult<Json<PagedResponse<ReadProgressSummary>>> {
|
||||||
|
let limit = params.limit.clamp(1, 200);
|
||||||
|
let offset = params.offset.max(0);
|
||||||
|
let (items, total) =
|
||||||
|
repo::read_progress::list_for_user(&state.db, user.id, limit, offset).await?;
|
||||||
|
Ok(Json(PagedResponse::with_total(items, limit, offset, total)))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_one(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Path(manga_id): Path<Uuid>,
|
||||||
|
) -> AppResult<Json<ReadProgressForManga>> {
|
||||||
|
// Enriched with `chapter_number` so the manga page's Continue
|
||||||
|
// CTA doesn't need to resolve the chapter id against the paged
|
||||||
|
// chapters list.
|
||||||
|
Ok(Json(
|
||||||
|
repo::read_progress::get_for_manga(&state.db, user.id, manga_id).await?,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn delete_one(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Path(manga_id): Path<Uuid>,
|
||||||
|
) -> AppResult<StatusCode> {
|
||||||
|
repo::read_progress::delete(&state.db, user.id, manga_id).await?;
|
||||||
|
Ok(StatusCode::NO_CONTENT)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct UploadListParams {
|
||||||
|
#[serde(default = "default_uploads_limit")]
|
||||||
|
pub limit: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Serde default for `UploadListParams::limit`.
fn default_uploads_limit() -> i64 {
    const FALLBACK_UPLOADS_PAGE_SIZE: i64 = 50;
    FALLBACK_UPLOADS_PAGE_SIZE
}
|
||||||
|
|
||||||
|
async fn uploads(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Query(params): Query<UploadListParams>,
|
||||||
|
) -> AppResult<Json<PagedResponse<UploadEntry>>> {
|
||||||
|
// Limit-only pagination for now — keyset across two unrelated
|
||||||
|
// tables is a future enhancement. Total comes from a fast count
|
||||||
|
// query so the UI can show "N total" without dragging the rows
|
||||||
|
// across the wire.
|
||||||
|
let limit = params.limit.clamp(1, 200);
|
||||||
|
let (items, total) =
|
||||||
|
repo::upload_history::list_for_user(&state.db, user.id, limit).await?;
|
||||||
|
Ok(Json(PagedResponse::with_total(items, limit, 0, total)))
|
||||||
|
}
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
use axum::extract::{Multipart, Path, Query, State};
|
use axum::extract::{Multipart, Path, Query, State};
|
||||||
use axum::http::StatusCode;
|
use axum::http::StatusCode;
|
||||||
use axum::routing::{delete, get, post};
|
use axum::routing::{delete, get, post, put};
|
||||||
use axum::{Json, Router};
|
use axum::{Json, Router};
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
@@ -9,16 +9,19 @@ use uuid::Uuid;
|
|||||||
use crate::api::pagination::PagedResponse;
|
use crate::api::pagination::PagedResponse;
|
||||||
use crate::app::AppState;
|
use crate::app::AppState;
|
||||||
use crate::auth::extractor::CurrentUser;
|
use crate::auth::extractor::CurrentUser;
|
||||||
use crate::domain::manga::{MangaCard, MangaDetail, MangaPatch, NewManga, Patch};
|
use crate::domain::manga::{MangaCard, MangaDetail, MangaPatch, NewManga};
|
||||||
|
use crate::domain::patch::Patch;
|
||||||
use crate::domain::tag::TagRef;
|
use crate::domain::tag::TagRef;
|
||||||
use crate::error::{AppError, AppResult};
|
use crate::error::{AppError, AppResult};
|
||||||
use crate::repo;
|
use crate::repo;
|
||||||
|
use crate::storage::StorageError;
|
||||||
use crate::upload::{parse_image, UploadedImage};
|
use crate::upload::{parse_image, UploadedImage};
|
||||||
|
|
||||||
pub fn routes() -> Router<AppState> {
|
pub fn routes() -> Router<AppState> {
|
||||||
Router::new()
|
Router::new()
|
||||||
.route("/mangas", get(list).post(create))
|
.route("/mangas", get(list).post(create))
|
||||||
.route("/mangas/:id", get(get_one).patch(update))
|
.route("/mangas/:id", get(get_one).patch(update))
|
||||||
|
.route("/mangas/:id/cover", put(put_cover).delete(delete_cover))
|
||||||
.route("/mangas/:id/tags", post(attach_tag))
|
.route("/mangas/:id/tags", post(attach_tag))
|
||||||
.route("/mangas/:id/tags/:tag_id", delete(detach_tag))
|
.route("/mangas/:id/tags/:tag_id", delete(detach_tag))
|
||||||
}
|
}
|
||||||
@@ -168,6 +171,7 @@ async fn create(
|
|||||||
&status,
|
&status,
|
||||||
metadata.description.as_deref(),
|
metadata.description.as_deref(),
|
||||||
&alt_titles,
|
&alt_titles,
|
||||||
|
Some(_user.id),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
@@ -192,16 +196,14 @@ async fn create(
|
|||||||
|
|
||||||
async fn update(
|
async fn update(
|
||||||
State(state): State<AppState>,
|
State(state): State<AppState>,
|
||||||
CurrentUser(_user): CurrentUser,
|
CurrentUser(user): CurrentUser,
|
||||||
Path(id): Path<Uuid>,
|
Path(id): Path<Uuid>,
|
||||||
Json(patch): Json<MangaPatch>,
|
Json(patch): Json<MangaPatch>,
|
||||||
) -> AppResult<Json<MangaDetail>> {
|
) -> AppResult<Json<MangaDetail>> {
|
||||||
// TODO(auth): until uploaders are tracked (Phase 5), any signed-in
|
|
||||||
// user can edit any manga. Restrict to uploader + admin once that
|
|
||||||
// column lands.
|
|
||||||
if !repo::manga::exists(&state.db, id).await? {
|
if !repo::manga::exists(&state.db, id).await? {
|
||||||
return Err(AppError::NotFound);
|
return Err(AppError::NotFound);
|
||||||
}
|
}
|
||||||
|
require_can_edit(&state, id, user.id).await?;
|
||||||
|
|
||||||
if let Some(ref status) = patch.status {
|
if let Some(ref status) = patch.status {
|
||||||
let trimmed = status.trim();
|
let trimmed = status.trim();
|
||||||
@@ -257,6 +259,80 @@ async fn update(
|
|||||||
Ok(Json(repo::manga::get_detail(&state.db, id).await?))
|
Ok(Json(repo::manga::get_detail(&state.db, id).await?))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// `PUT /api/v1/mangas/:id/cover` is multipart/form-data with a single
|
||||||
|
/// required `cover` part containing image bytes. MIME is sniffed by
|
||||||
|
/// magic bytes (jpeg/png/webp/gif/avif); filename and Content-Type from
|
||||||
|
/// the client are ignored. Replaces any existing cover, deleting the
|
||||||
|
/// previous blob if its extension differs. Returns the refreshed
|
||||||
|
/// `MangaDetail`.
|
||||||
|
async fn put_cover(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Path(id): Path<Uuid>,
|
||||||
|
mut multipart: Multipart,
|
||||||
|
) -> AppResult<Json<MangaDetail>> {
|
||||||
|
if !repo::manga::exists(&state.db, id).await? {
|
||||||
|
return Err(AppError::NotFound);
|
||||||
|
}
|
||||||
|
require_can_edit(&state, id, user.id).await?;
|
||||||
|
|
||||||
|
let mut cover: Option<UploadedImage> = None;
|
||||||
|
while let Some(field) = next_field(&mut multipart).await? {
|
||||||
|
if field.name() == Some("cover") {
|
||||||
|
let bytes = read_field_bytes(field).await?.to_vec();
|
||||||
|
cover = Some(parse_image(bytes, state.upload.max_file_bytes, "cover")?);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let img = cover.ok_or_else(|| AppError::ValidationFailed {
|
||||||
|
message: "cover part is required".into(),
|
||||||
|
details: json!({ "cover": "required" }),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// Read the old key BEFORE writing so we can clean up an orphan if
|
||||||
|
// the extension changed (e.g., .png → .jpg). Same-extension is a
|
||||||
|
// `put` overwrite — no delete needed.
|
||||||
|
let old_key = repo::manga::get(&state.db, id).await?.cover_image_path;
|
||||||
|
let new_key = format!("mangas/{}/cover.{}", id, img.ext);
|
||||||
|
state.storage.put(&new_key, &img.bytes).await?;
|
||||||
|
|
||||||
|
if let Some(prev) = old_key.as_deref() {
|
||||||
|
if prev != new_key {
|
||||||
|
// Swallow NotFound — AppError maps it to a client 404,
|
||||||
|
// which would be wrong here. The DB row can outlive a
|
||||||
|
// manually-deleted blob.
|
||||||
|
match state.storage.delete(prev).await {
|
||||||
|
Ok(()) | Err(StorageError::NotFound) => {}
|
||||||
|
Err(e) => return Err(e.into()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
repo::manga::set_cover_image_path(&state.db, id, &new_key).await?;
|
||||||
|
Ok(Json(repo::manga::get_detail(&state.db, id).await?))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `DELETE /api/v1/mangas/:id/cover` clears `cover_image_path` and
|
||||||
|
/// removes the blob. Idempotent: removing a non-existent cover succeeds
|
||||||
|
/// with the unchanged detail.
|
||||||
|
async fn delete_cover(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
CurrentUser(user): CurrentUser,
|
||||||
|
Path(id): Path<Uuid>,
|
||||||
|
) -> AppResult<Json<MangaDetail>> {
|
||||||
|
if !repo::manga::exists(&state.db, id).await? {
|
||||||
|
return Err(AppError::NotFound);
|
||||||
|
}
|
||||||
|
require_can_edit(&state, id, user.id).await?;
|
||||||
|
if let Some(key) = repo::manga::get(&state.db, id).await?.cover_image_path {
|
||||||
|
match state.storage.delete(&key).await {
|
||||||
|
Ok(()) | Err(StorageError::NotFound) => {}
|
||||||
|
Err(e) => return Err(e.into()),
|
||||||
|
}
|
||||||
|
repo::manga::clear_cover_image_path(&state.db, id).await?;
|
||||||
|
}
|
||||||
|
Ok(Json(repo::manga::get_detail(&state.db, id).await?))
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
pub struct AttachTagBody {
|
pub struct AttachTagBody {
|
||||||
pub name: String,
|
pub name: String,
|
||||||
@@ -268,6 +344,7 @@ async fn attach_tag(
|
|||||||
Path(id): Path<Uuid>,
|
Path(id): Path<Uuid>,
|
||||||
Json(body): Json<AttachTagBody>,
|
Json(body): Json<AttachTagBody>,
|
||||||
) -> AppResult<(StatusCode, Json<TagRef>)> {
|
) -> AppResult<(StatusCode, Json<TagRef>)> {
|
||||||
|
validate_tag_name(&body.name)?;
|
||||||
if !repo::manga::exists(&state.db, id).await? {
|
if !repo::manga::exists(&state.db, id).await? {
|
||||||
return Err(AppError::NotFound);
|
return Err(AppError::NotFound);
|
||||||
}
|
}
|
||||||
@@ -314,6 +391,27 @@ async fn detach_tag(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Request-side validation for `POST /mangas/:id/tags` body. Mirrors
|
||||||
|
/// the repo-level cap in `repo::tag::upsert_by_name` (max 64 chars
|
||||||
|
/// after trim) but surfaces the failure at the handler boundary with
|
||||||
|
/// the same envelope shape other validations use.
|
||||||
|
fn validate_tag_name(name: &str) -> AppResult<()> {
|
||||||
|
let trimmed = name.trim();
|
||||||
|
if trimmed.is_empty() {
|
||||||
|
return Err(AppError::ValidationFailed {
|
||||||
|
message: "tag name cannot be empty".into(),
|
||||||
|
details: json!({ "name": "required" }),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if trimmed.chars().count() > 64 {
|
||||||
|
return Err(AppError::ValidationFailed {
|
||||||
|
message: "tag name too long".into(),
|
||||||
|
details: json!({ "name": "max 64 characters" }),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn validate_new_manga(input: &NewManga) -> AppResult<()> {
|
fn validate_new_manga(input: &NewManga) -> AppResult<()> {
|
||||||
if input.title.trim().is_empty() {
|
if input.title.trim().is_empty() {
|
||||||
return Err(AppError::ValidationFailed {
|
return Err(AppError::ValidationFailed {
|
||||||
@@ -333,6 +431,30 @@ fn validate_new_manga(input: &NewManga) -> AppResult<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Authorisation gate for manga mutations. The manga is assumed to
|
||||||
|
/// exist (the caller runs [`repo::manga::exists`] first so a missing id
|
||||||
|
/// surfaces as `NotFound`, not `Forbidden`).
|
||||||
|
///
|
||||||
|
/// Rule: a non-NULL `uploaded_by` must match the current user. Legacy
|
||||||
|
/// rows with `uploaded_by IS NULL` (pre-migration-0011) are still
|
||||||
|
/// editable by any signed-in user — there's nobody to gate on yet, and
|
||||||
|
/// the historical-data note in 0011 acknowledges the gap. Once an
|
||||||
|
/// admin role lands the NULL case can flip to admin-only.
|
||||||
|
///
|
||||||
|
/// Returns `Forbidden` (not `NotFound`) on owner mismatch — mangas
|
||||||
|
/// are listable via `GET /mangas`, so existence isn't a secret and
|
||||||
|
/// the more accurate 403 is fine. This deliberately differs from
|
||||||
|
/// `repo::collection::require_owner`, which collapses both states to
|
||||||
|
/// `NotFound` because collections are private to a user and existence
|
||||||
|
/// itself is information worth hiding from non-owners.
|
||||||
|
async fn require_can_edit(state: &AppState, manga_id: Uuid, user_id: Uuid) -> AppResult<()> {
|
||||||
|
match repo::manga::uploaded_by(&state.db, manga_id).await? {
|
||||||
|
Some(owner) if owner != user_id => Err(AppError::Forbidden),
|
||||||
|
// Some(owner) == user_id (good) or None (legacy row, no owner).
|
||||||
|
_ => Ok(()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn validate_genre_ids(state: &AppState, ids: &[Uuid]) -> AppResult<()> {
|
async fn validate_genre_ids(state: &AppState, ids: &[Uuid]) -> AppResult<()> {
|
||||||
if ids.is_empty() {
|
if ids.is_empty() {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
|
|||||||
@@ -1,10 +1,13 @@
|
|||||||
|
pub mod admin;
|
||||||
pub mod auth;
|
pub mod auth;
|
||||||
pub mod authors;
|
pub mod authors;
|
||||||
pub mod bookmarks;
|
pub mod bookmarks;
|
||||||
pub mod chapters;
|
pub mod chapters;
|
||||||
|
pub mod collections;
|
||||||
pub mod files;
|
pub mod files;
|
||||||
pub mod genres;
|
pub mod genres;
|
||||||
pub mod health;
|
pub mod health;
|
||||||
|
pub mod history;
|
||||||
pub mod mangas;
|
pub mod mangas;
|
||||||
pub mod pagination;
|
pub mod pagination;
|
||||||
pub mod tags;
|
pub mod tags;
|
||||||
@@ -24,4 +27,7 @@ pub fn routes() -> Router<AppState> {
|
|||||||
.merge(genres::routes())
|
.merge(genres::routes())
|
||||||
.merge(tags::routes())
|
.merge(tags::routes())
|
||||||
.merge(authors::routes())
|
.merge(authors::routes())
|
||||||
|
.merge(collections::routes())
|
||||||
|
.merge(history::routes())
|
||||||
|
.merge(admin::routes())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,14 +1,33 @@
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
|
||||||
|
|
||||||
use axum::extract::DefaultBodyLimit;
|
use anyhow::Context;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use axum::extract::{DefaultBodyLimit, FromRequestParts, Request, State};
|
||||||
use axum::http::{HeaderName, HeaderValue, Method};
|
use axum::http::{HeaderName, HeaderValue, Method};
|
||||||
|
use axum::middleware::{self, Next};
|
||||||
|
use axum::response::Response;
|
||||||
use axum::Router;
|
use axum::Router;
|
||||||
use sqlx::postgres::PgPoolOptions;
|
use sqlx::postgres::PgPoolOptions;
|
||||||
use sqlx::PgPool;
|
use sqlx::PgPool;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
use tower_http::cors::{AllowOrigin, CorsLayer};
|
use tower_http::cors::{AllowOrigin, CorsLayer};
|
||||||
use tower_http::trace::TraceLayer;
|
use tower_http::trace::TraceLayer;
|
||||||
|
|
||||||
use crate::config::{AuthConfig, Config, UploadConfig};
|
use crate::auth::extractor::CurrentUser;
|
||||||
|
use crate::auth::rate_limit::AuthRateLimiter;
|
||||||
|
use crate::error::AppError;
|
||||||
|
use crate::config::{AuthConfig, Config, CrawlerConfig, UploadConfig};
|
||||||
|
use crate::crawler::browser_manager::{self, BrowserManager};
|
||||||
|
use crate::crawler::content::{self, SyncOutcome};
|
||||||
|
use crate::crawler::daemon::{self, ChapterDispatcher, DaemonConfig, MetadataPass};
|
||||||
|
use crate::crawler::jobs::JobPayload;
|
||||||
|
use crate::crawler::pipeline::{self, MetadataStats};
|
||||||
|
use crate::crawler::rate_limit::HostRateLimiters;
|
||||||
|
use crate::crawler::resync::{RealResyncService, ResyncService};
|
||||||
|
use crate::crawler::safety::DownloadAllowlist;
|
||||||
|
use crate::crawler::session;
|
||||||
|
use crate::repo;
|
||||||
use crate::storage::{LocalStorage, Storage};
|
use crate::storage::{LocalStorage, Storage};
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
@@ -17,24 +36,496 @@ pub struct AppState {
|
|||||||
pub storage: Arc<dyn Storage>,
|
pub storage: Arc<dyn Storage>,
|
||||||
pub auth: AuthConfig,
|
pub auth: AuthConfig,
|
||||||
pub upload: UploadConfig,
|
pub upload: UploadConfig,
|
||||||
|
/// Shared rate limiter guarding the `/auth/*` mutation endpoints.
|
||||||
|
/// One instance per AppState so tests stay isolated across the
|
||||||
|
/// same process.
|
||||||
|
pub auth_limiter: Arc<AuthRateLimiter>,
|
||||||
|
/// Admin-triggered force resync. `None` when the crawler daemon
|
||||||
|
/// is disabled (`CRAWLER_DAEMON=false`); admin handlers gate on
|
||||||
|
/// `.is_some()` and return 503 otherwise. Set by [`build`] from the
|
||||||
|
/// same wiring that builds the daemon's chapter dispatcher, so a
|
||||||
|
/// force resync uses the daemon's BrowserManager + rate limiters.
|
||||||
|
pub resync: Option<Arc<dyn ResyncService>>,
|
||||||
|
/// Crawler observability + control handle (live status, coordinated
|
||||||
|
/// browser restart, runtime session, manual run). `None` when the
|
||||||
|
/// daemon is disabled; admin handlers gate on `.is_some()` → 503.
|
||||||
|
pub crawler: Option<Arc<CrawlerControl>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn build(config: Config) -> anyhow::Result<Router> {
|
/// Shared handle the admin crawler endpoints use to observe and control
|
||||||
|
/// the running daemon. Bundled so the handlers take one optional field on
|
||||||
|
/// `AppState` rather than many.
|
||||||
|
pub struct CrawlerControl {
|
||||||
|
pub browser_manager: Arc<BrowserManager>,
|
||||||
|
pub session: Arc<crate::crawler::session_control::SessionController>,
|
||||||
|
pub status: crate::crawler::status::StatusHandle,
|
||||||
|
/// Used by the "run metadata pass now" endpoint; `None` when no
|
||||||
|
/// `CRAWLER_START_URL` is configured (cron disabled).
|
||||||
|
pub metadata_pass: Option<Arc<dyn MetadataPass>>,
|
||||||
|
/// Drain budget for a manually-triggered coordinated browser restart.
|
||||||
|
pub drain_deadline: std::time::Duration,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Bundle returned by [`build`]. The router is what `axum::serve` consumes;
|
||||||
|
/// the daemon (when enabled) outlives the HTTP server and is awaited via
|
||||||
|
/// [`AppHandle::shutdown`] after the listener has finished gracefully.
|
||||||
|
pub struct AppHandle {
|
||||||
|
pub router: Router,
|
||||||
|
pub daemon: Option<daemon::DaemonHandle>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AppHandle {
|
||||||
|
pub async fn shutdown(self) {
|
||||||
|
if let Some(d) = self.daemon {
|
||||||
|
d.shutdown().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn build(config: Config) -> anyhow::Result<AppHandle> {
|
||||||
let db = PgPoolOptions::new()
|
let db = PgPoolOptions::new()
|
||||||
.max_connections(10)
|
.max_connections(10)
|
||||||
.connect(&config.database_url)
|
.connect(&config.database_url)
|
||||||
.await?;
|
.await?;
|
||||||
sqlx::migrate!("./migrations").run(&db).await?;
|
sqlx::migrate!("./migrations").run(&db).await?;
|
||||||
|
|
||||||
|
if let Some((username, password)) = config.admin_bootstrap.as_ref() {
|
||||||
|
repo::user::bootstrap_admin(&db, username, password)
|
||||||
|
.await
|
||||||
|
.context("bootstrap_admin from ADMIN_USERNAME/ADMIN_PASSWORD env")?;
|
||||||
|
tracing::info!(admin_username = %username, "admin bootstrap ensured");
|
||||||
|
}
|
||||||
|
|
||||||
let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(config.storage_dir.clone()));
|
let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(config.storage_dir.clone()));
|
||||||
|
|
||||||
|
let (daemon, resync, crawler) = if config.crawler.daemon_enabled {
|
||||||
|
let spawned = spawn_crawler_daemon(db.clone(), Arc::clone(&storage), &config.crawler).await?;
|
||||||
|
(Some(spawned.handle), Some(spawned.resync), Some(spawned.crawler))
|
||||||
|
} else {
|
||||||
|
tracing::info!("crawler daemon disabled (CRAWLER_DAEMON=false)");
|
||||||
|
(None, None, None)
|
||||||
|
};
|
||||||
|
|
||||||
|
let auth_limiter = Arc::new(AuthRateLimiter::new(config.auth.rate_limit));
|
||||||
let state = AppState {
|
let state = AppState {
|
||||||
db,
|
db,
|
||||||
storage,
|
storage,
|
||||||
auth: config.auth.clone(),
|
auth: config.auth.clone(),
|
||||||
upload: config.upload.clone(),
|
upload: config.upload.clone(),
|
||||||
|
auth_limiter,
|
||||||
|
resync,
|
||||||
|
crawler,
|
||||||
};
|
};
|
||||||
Ok(router(state).layer(cors_layer(&config.cors_allowed_origins)))
|
let router = router(state).layer(cors_layer(&config.cors_allowed_origins));
|
||||||
|
Ok(AppHandle { router, daemon })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Bundle returned by [`spawn_crawler_daemon`]. The handle owns the
|
||||||
|
/// daemon's tasks; `resync` is the operator-trigger service shared with
|
||||||
|
/// `AppState` so admin endpoints can call into the same browser /
|
||||||
|
/// rate-limit machinery.
|
||||||
|
struct SpawnedDaemon {
|
||||||
|
handle: daemon::DaemonHandle,
|
||||||
|
resync: Arc<dyn ResyncService>,
|
||||||
|
crawler: Arc<CrawlerControl>,
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn spawn_crawler_daemon(
|
||||||
|
db: PgPool,
|
||||||
|
storage: Arc<dyn Storage>,
|
||||||
|
cfg: &CrawlerConfig,
|
||||||
|
) -> anyhow::Result<SpawnedDaemon> {
|
||||||
|
// Reqwest client with a shared cookie jar so CDN image fetches include
|
||||||
|
// PHPSESSID. The same `Arc<Jar>` is held by the SessionController, so a
|
||||||
|
// runtime session refresh rewrites it in place. Initial value: a
|
||||||
|
// persisted runtime session (survives restart) takes precedence over
|
||||||
|
// CRAWLER_PHPSESSID env.
|
||||||
|
let cookie_jar = Arc::new(reqwest::cookie::Jar::default());
|
||||||
|
let initial_sid = crate::crawler::session_control::SessionController::load_persisted(&db)
|
||||||
|
.await
|
||||||
|
.or_else(|| cfg.phpsessid.clone());
|
||||||
|
if let (Some(sid), Some(domain), Some(start_url)) =
|
||||||
|
(&initial_sid, &cfg.cookie_domain, &cfg.start_url)
|
||||||
|
{
|
||||||
|
let cookie_str = format!("PHPSESSID={sid}; Domain={domain}; Path=/");
|
||||||
|
let seed_url = reqwest::Url::parse(start_url)
|
||||||
|
.context("parse CRAWLER_START_URL for cookie seed")?;
|
||||||
|
cookie_jar.add_cookie_str(&cookie_str, &seed_url);
|
||||||
|
}
|
||||||
|
let mut http_builder = reqwest::Client::builder()
|
||||||
|
.timeout(std::time::Duration::from_secs(30))
|
||||||
|
.no_proxy()
|
||||||
|
.cookie_provider(Arc::clone(&cookie_jar));
|
||||||
|
if let Some(ua) = &cfg.user_agent {
|
||||||
|
http_builder = http_builder.user_agent(ua);
|
||||||
|
}
|
||||||
|
if let Some(proxy) = &cfg.proxy {
|
||||||
|
http_builder = http_builder
|
||||||
|
.proxy(reqwest::Proxy::all(proxy).with_context(|| format!("parse proxy: {proxy}"))?);
|
||||||
|
}
|
||||||
|
let http = http_builder.build().context("build crawler reqwest")?;
|
||||||
|
|
||||||
|
let mut rate = HostRateLimiters::new(std::time::Duration::from_millis(cfg.rate_ms));
|
||||||
|
if let Some(host) = &cfg.cdn_host {
|
||||||
|
rate = rate.with_override(host, std::time::Duration::from_millis(cfg.cdn_rate_ms));
|
||||||
|
}
|
||||||
|
let rate = Arc::new(rate);
|
||||||
|
|
||||||
|
let tor = crate::crawler::tor::TorController::from_parts(
|
||||||
|
cfg.tor_control_url.as_deref(),
|
||||||
|
cfg.tor_control_password.as_deref(),
|
||||||
|
cfg.tor_control_cookie_path.as_deref(),
|
||||||
|
)
|
||||||
|
.context("build TorController from CRAWLER_TOR_CONTROL_* env")?
|
||||||
|
.map(Arc::new);
|
||||||
|
if let Some(t) = &tor {
|
||||||
|
tracing::info!(?t, "TOR control configured; transient pages will trigger NEWNYM");
|
||||||
|
}
|
||||||
|
let tor_recircuit_max = cfg.tor_recircuit_max_attempts;
|
||||||
|
|
||||||
|
// Session controller + sticky session-expired flag. Created before the
|
||||||
|
// browser so the on_launch hook can read the *current* session value
|
||||||
|
// (rather than a value captured at startup), and so a runtime refresh
|
||||||
|
// updates the cookie everywhere.
|
||||||
|
let session_expired = Arc::new(AtomicBool::new(false));
|
||||||
|
let session_controller = crate::crawler::session_control::SessionController::new(
|
||||||
|
initial_sid,
|
||||||
|
Arc::clone(&cookie_jar),
|
||||||
|
cfg.cookie_domain.clone(),
|
||||||
|
cfg.start_url.clone(),
|
||||||
|
db.clone(),
|
||||||
|
Arc::clone(&session_expired),
|
||||||
|
);
|
||||||
|
|
||||||
|
// Live status surface, sized to the worker count.
|
||||||
|
let status = crate::crawler::status::StatusHandle::new(cfg.chapter_workers);
|
||||||
|
|
||||||
|
// Browser manager. on_launch re-injects PHPSESSID on every fresh
|
||||||
|
// chromium spawn so an idle teardown followed by re-launch stays
|
||||||
|
// authenticated without operator action.
|
||||||
|
let mut launch_opts = cfg.browser.clone();
|
||||||
|
if let Some(proxy) = &cfg.proxy {
|
||||||
|
let chromium_proxy = crate::crawler::url_utils::chromium_proxy_arg(proxy);
|
||||||
|
launch_opts.extra_args.push(format!("--proxy-server={chromium_proxy}"));
|
||||||
|
}
|
||||||
|
let on_launch = match (&cfg.cookie_domain, &cfg.start_url) {
|
||||||
|
(Some(domain), Some(start_url)) => {
|
||||||
|
let domain = domain.clone();
|
||||||
|
let start_url = start_url.clone();
|
||||||
|
let tor_for_launch = tor.as_ref().map(Arc::clone);
|
||||||
|
let sc = Arc::clone(&session_controller);
|
||||||
|
let on_launch: browser_manager::OnLaunch = Arc::new(move |browser| {
|
||||||
|
let domain = domain.clone();
|
||||||
|
let start_url = start_url.clone();
|
||||||
|
let tor_for_launch = tor_for_launch.as_ref().map(Arc::clone);
|
||||||
|
let sc = Arc::clone(&sc);
|
||||||
|
Box::pin(async move {
|
||||||
|
// Read the *current* session each launch so a runtime
|
||||||
|
// refresh is picked up on the next (re)launch. No session
|
||||||
|
// configured → run unauthenticated (metadata needs no auth).
|
||||||
|
let Some(sid) = sc.current().await else {
|
||||||
|
tracing::info!("on_launch: no session set — skipping inject + probe");
|
||||||
|
return Ok(());
|
||||||
|
};
|
||||||
|
session::inject_phpsessid(&browser, &sid, &domain)
|
||||||
|
.await
|
||||||
|
.context("on_launch: inject_phpsessid")?;
|
||||||
|
session::verify_session_with_recircuit(
|
||||||
|
&browser,
|
||||||
|
&start_url,
|
||||||
|
tor_for_launch.as_deref(),
|
||||||
|
tor_recircuit_max,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.context("on_launch: verify_session")?;
|
||||||
|
Ok(())
|
||||||
|
})
|
||||||
|
});
|
||||||
|
on_launch
|
||||||
|
}
|
||||||
|
_ => browser_manager::noop_on_launch(),
|
||||||
|
};
|
||||||
|
let browser_manager = BrowserManager::new(launch_opts, cfg.idle_timeout, on_launch);
|
||||||
|
|
||||||
|
let metadata_pass: Option<Arc<dyn MetadataPass>> = cfg.start_url.as_ref().map(|url| {
|
||||||
|
let m: Arc<dyn MetadataPass> = Arc::new(RealMetadataPass {
|
||||||
|
browser_manager: Arc::clone(&browser_manager),
|
||||||
|
db: db.clone(),
|
||||||
|
storage: Arc::clone(&storage),
|
||||||
|
http: http.clone(),
|
||||||
|
rate: Arc::clone(&rate),
|
||||||
|
start_url: url.clone(),
|
||||||
|
manga_limit: cfg.manga_limit,
|
||||||
|
download_allowlist: cfg.download_allowlist.clone(),
|
||||||
|
max_image_bytes: cfg.max_image_bytes,
|
||||||
|
metadata_max_consecutive_failures: cfg.metadata_max_consecutive_failures,
|
||||||
|
status: status.clone(),
|
||||||
|
tor: tor.as_ref().map(Arc::clone),
|
||||||
|
});
|
||||||
|
m
|
||||||
|
});
|
||||||
|
|
||||||
|
let dispatcher: Arc<dyn ChapterDispatcher> = Arc::new(RealChapterDispatcher {
|
||||||
|
browser_manager: Arc::clone(&browser_manager),
|
||||||
|
db: db.clone(),
|
||||||
|
storage: Arc::clone(&storage),
|
||||||
|
http: http.clone(),
|
||||||
|
rate: Arc::clone(&rate),
|
||||||
|
download_allowlist: cfg.download_allowlist.clone(),
|
||||||
|
max_image_bytes: cfg.max_image_bytes,
|
||||||
|
transient_failures: Arc::new(AtomicU32::new(0)),
|
||||||
|
restart_threshold: cfg.browser_restart_threshold,
|
||||||
|
drain_deadline: cfg.job_timeout,
|
||||||
|
status: status.clone(),
|
||||||
|
tor: tor.as_ref().map(Arc::clone),
|
||||||
|
});
|
||||||
|
|
||||||
|
let resync: Arc<dyn ResyncService> = Arc::new(RealResyncService {
|
||||||
|
browser_manager: Arc::clone(&browser_manager),
|
||||||
|
db: db.clone(),
|
||||||
|
storage: Arc::clone(&storage),
|
||||||
|
http,
|
||||||
|
rate: Arc::clone(&rate),
|
||||||
|
download_allowlist: cfg.download_allowlist.clone(),
|
||||||
|
max_image_bytes: cfg.max_image_bytes,
|
||||||
|
tor: tor.as_ref().map(Arc::clone),
|
||||||
|
});
|
||||||
|
|
||||||
|
// Shared cancellation: daemon shutdown cancels the BrowserManager's
|
||||||
|
// idle reaper too. Reaper itself is added to the daemon's extra_tasks
|
||||||
|
// so DaemonHandle::shutdown awaits its completion.
|
||||||
|
let cancel = CancellationToken::new();
|
||||||
|
let reaper_task = browser_manager::spawn_idle_reaper(
|
||||||
|
Arc::clone(&browser_manager),
|
||||||
|
cancel.clone(),
|
||||||
|
);
|
||||||
|
// Also close the browser explicitly on shutdown so we don't rely on
|
||||||
|
// kill-on-drop when other Arc<Browser> holders may still exist.
|
||||||
|
let shutdown_task = {
|
||||||
|
let cancel = cancel.clone();
|
||||||
|
let mgr = Arc::clone(&browser_manager);
|
||||||
|
tokio::spawn(async move {
|
||||||
|
cancel.cancelled().await;
|
||||||
|
mgr.shutdown().await;
|
||||||
|
})
|
||||||
|
};
|
||||||
|
|
||||||
|
let daemon_handle = daemon::spawn(
|
||||||
|
db,
|
||||||
|
cancel,
|
||||||
|
DaemonConfig {
|
||||||
|
metadata_pass: metadata_pass.clone(),
|
||||||
|
dispatcher,
|
||||||
|
chapter_workers: cfg.chapter_workers,
|
||||||
|
daily_at: cfg.daily_at,
|
||||||
|
tz: cfg.tz,
|
||||||
|
retention_days: cfg.retention_days,
|
||||||
|
session_expired,
|
||||||
|
status: status.clone(),
|
||||||
|
job_timeout: cfg.job_timeout,
|
||||||
|
extra_tasks: vec![reaper_task, shutdown_task],
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
let crawler = Arc::new(CrawlerControl {
|
||||||
|
browser_manager: Arc::clone(&browser_manager),
|
||||||
|
session: session_controller,
|
||||||
|
status,
|
||||||
|
metadata_pass,
|
||||||
|
drain_deadline: cfg.job_timeout,
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(SpawnedDaemon {
|
||||||
|
handle: daemon_handle,
|
||||||
|
resync,
|
||||||
|
crawler,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Real impls of the daemon traits, owning the browser manager + I/O. Kept
|
||||||
|
// in app.rs because they need the same builder-side env wiring that
|
||||||
|
// AppState gets — the daemon module itself stays free of reqwest / storage
|
||||||
|
// details so its tests don't pull them in.
|
||||||
|
|
||||||
|
struct RealMetadataPass {
|
||||||
|
browser_manager: Arc<BrowserManager>,
|
||||||
|
db: PgPool,
|
||||||
|
storage: Arc<dyn Storage>,
|
||||||
|
http: reqwest::Client,
|
||||||
|
rate: Arc<HostRateLimiters>,
|
||||||
|
start_url: String,
|
||||||
|
manga_limit: usize,
|
||||||
|
download_allowlist: DownloadAllowlist,
|
||||||
|
max_image_bytes: usize,
|
||||||
|
metadata_max_consecutive_failures: u32,
|
||||||
|
status: crate::crawler::status::StatusHandle,
|
||||||
|
tor: Option<Arc<crate::crawler::tor::TorController>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl MetadataPass for RealMetadataPass {
|
||||||
|
async fn run(&self) -> anyhow::Result<MetadataStats> {
|
||||||
|
let result = pipeline::run_metadata_pass(
|
||||||
|
&self.browser_manager,
|
||||||
|
&self.db,
|
||||||
|
self.storage.as_ref(),
|
||||||
|
&self.http,
|
||||||
|
&self.rate,
|
||||||
|
&self.start_url,
|
||||||
|
self.manga_limit,
|
||||||
|
false,
|
||||||
|
&self.download_allowlist,
|
||||||
|
self.max_image_bytes,
|
||||||
|
self.metadata_max_consecutive_failures,
|
||||||
|
Some(&self.status),
|
||||||
|
self.tor.as_deref(),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
if let Err(e) = &result {
|
||||||
|
if crate::crawler::nav::anyhow_looks_browser_dead(e) {
|
||||||
|
self.browser_manager.invalidate().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Cover backfill follows the metadata pass even when the pass
|
||||||
|
// errored — the early-stop walk can complete its work and bail
|
||||||
|
// late, and a transient browser failure shouldn't cancel the
|
||||||
|
// residual cover backlog. The backfill has its own per-call cap
|
||||||
|
// so a runaway error stream can't monopolise the tick. It sets the
|
||||||
|
// CoverBackfill{index,total} phase + current_cover per entry.
|
||||||
|
match pipeline::backfill_missing_covers(
|
||||||
|
&self.browser_manager,
|
||||||
|
&self.db,
|
||||||
|
self.storage.as_ref(),
|
||||||
|
&self.http,
|
||||||
|
&self.rate,
|
||||||
|
pipeline::COVER_BACKFILL_DEFAULT_MAX,
|
||||||
|
&self.download_allowlist,
|
||||||
|
self.max_image_bytes,
|
||||||
|
Some(&self.status),
|
||||||
|
self.tor.as_deref(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(stats) => {
|
||||||
|
if stats.considered > 0 {
|
||||||
|
tracing::info!(?stats, "cover backfill complete");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(error = ?e, "cover backfill failed");
|
||||||
|
if crate::crawler::nav::anyhow_looks_browser_dead(&e) {
|
||||||
|
self.browser_manager.invalidate().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct RealChapterDispatcher {
|
||||||
|
browser_manager: Arc<BrowserManager>,
|
||||||
|
db: PgPool,
|
||||||
|
storage: Arc<dyn Storage>,
|
||||||
|
http: reqwest::Client,
|
||||||
|
rate: Arc<HostRateLimiters>,
|
||||||
|
download_allowlist: DownloadAllowlist,
|
||||||
|
max_image_bytes: usize,
|
||||||
|
/// Consecutive transient chapter failures; resets on any success.
|
||||||
|
/// Drives the automatic coordinated browser restart.
|
||||||
|
transient_failures: Arc<std::sync::atomic::AtomicU32>,
|
||||||
|
/// Consecutive-failure count that triggers an auto restart.
|
||||||
|
restart_threshold: u32,
|
||||||
|
/// How long a coordinated restart waits for in-flight leases to drain.
|
||||||
|
drain_deadline: std::time::Duration,
|
||||||
|
/// Live status surface — the dispatcher registers each chapter it
|
||||||
|
/// crawls (with a realtime page count) here.
|
||||||
|
status: crate::crawler::status::StatusHandle,
|
||||||
|
tor: Option<Arc<crate::crawler::tor::TorController>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl ChapterDispatcher for RealChapterDispatcher {
|
||||||
|
async fn dispatch(&self, payload: JobPayload) -> anyhow::Result<SyncOutcome> {
|
||||||
|
match payload {
|
||||||
|
JobPayload::SyncChapterContent {
|
||||||
|
source_id: _,
|
||||||
|
chapter_id,
|
||||||
|
source_chapter_key: _,
|
||||||
|
} => {
|
||||||
|
let row = repo::chapter::dispatch_target(&self.db, chapter_id)
|
||||||
|
.await
|
||||||
|
.context("look up chapter for dispatch")?;
|
||||||
|
let Some((manga_id, source_url, manga_title, chapter_number)) = row else {
|
||||||
|
// Chapter (or its source row) is gone — ack done.
|
||||||
|
return Ok(SyncOutcome::Skipped);
|
||||||
|
};
|
||||||
|
// Register the chapter as crawling now (live status). The
|
||||||
|
// guard removes it on every exit path — success, panic, or
|
||||||
|
// the worker's outer-timeout drop.
|
||||||
|
let _active = self.status.begin_chapter(crate::crawler::status::ActiveChapter {
|
||||||
|
manga_id,
|
||||||
|
manga_title,
|
||||||
|
chapter_id,
|
||||||
|
chapter_number,
|
||||||
|
pages_done: 0,
|
||||||
|
pages_total: None,
|
||||||
|
});
|
||||||
|
let lease = self.browser_manager.acquire().await?;
|
||||||
|
let result = content::sync_chapter_content(
|
||||||
|
&lease,
|
||||||
|
&self.db,
|
||||||
|
self.storage.as_ref(),
|
||||||
|
&self.http,
|
||||||
|
&self.rate,
|
||||||
|
chapter_id,
|
||||||
|
manga_id,
|
||||||
|
&source_url,
|
||||||
|
false,
|
||||||
|
&self.download_allowlist,
|
||||||
|
self.max_image_bytes,
|
||||||
|
self.tor.as_deref(),
|
||||||
|
Some(&self.status),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
drop(lease);
|
||||||
|
match result {
|
||||||
|
Ok(outcome) => {
|
||||||
|
// Any successful dispatch (including a clean Skipped)
|
||||||
|
// means the browser is healthy — reset the streak.
|
||||||
|
self.transient_failures.store(0, Ordering::Release);
|
||||||
|
Ok(outcome)
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
let streak = self.transient_failures.fetch_add(1, Ordering::AcqRel) + 1;
|
||||||
|
if crate::crawler::nav::anyhow_looks_browser_dead(&e) {
|
||||||
|
// Hard browser-dead: lazy invalidate (next acquire
|
||||||
|
// relaunches). Reset the streak — we're recovering.
|
||||||
|
self.browser_manager.invalidate().await;
|
||||||
|
self.transient_failures.store(0, Ordering::Release);
|
||||||
|
} else if self.restart_threshold > 0 && streak >= self.restart_threshold {
|
||||||
|
// Persistent transients that TOR recircuit couldn't
|
||||||
|
// fix — proactively restart Chromium.
|
||||||
|
tracing::warn!(
|
||||||
|
streak,
|
||||||
|
threshold = self.restart_threshold,
|
||||||
|
"auto browser restart: consecutive transient chapter failures"
|
||||||
|
);
|
||||||
|
let _ = self
|
||||||
|
.browser_manager
|
||||||
|
.coordinated_restart(self.drain_deadline)
|
||||||
|
.await;
|
||||||
|
self.transient_failures.store(0, Ordering::Release);
|
||||||
|
}
|
||||||
|
Err(e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Other payload kinds aren't dispatched by this daemon yet —
|
||||||
|
// SyncManga / SyncChapterList are handled inline by the cron's
|
||||||
|
// metadata pass.
|
||||||
|
_ => Ok(SyncOutcome::Skipped),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build a router from a pre-assembled state. Used by integration tests
|
/// Build a router from a pre-assembled state. Used by integration tests
|
||||||
@@ -43,11 +534,62 @@ pub fn router(state: AppState) -> Router {
|
|||||||
let max_request_bytes = state.upload.max_request_bytes;
|
let max_request_bytes = state.upload.max_request_bytes;
|
||||||
Router::new()
|
Router::new()
|
||||||
.nest("/api/v1", crate::api::routes())
|
.nest("/api/v1", crate::api::routes())
|
||||||
|
.layer(middleware::from_fn_with_state(
|
||||||
|
state.clone(),
|
||||||
|
private_mode_guard,
|
||||||
|
))
|
||||||
.layer(DefaultBodyLimit::max(max_request_bytes))
|
.layer(DefaultBodyLimit::max(max_request_bytes))
|
||||||
.with_state(state)
|
.with_state(state)
|
||||||
.layer(TraceLayer::new_for_http())
|
.layer(TraceLayer::new_for_http())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reports whether `path` stays reachable without authentication when
/// `PRIVATE_MODE=true`.
///
/// The allowlist is an exact string match on the full request path:
/// - `/health` — reserved for load-balancer probes;
/// - `/auth/config` — lets the frontend decide whether to render the login
///   form or its anonymous alternatives;
/// - `/auth/login`, `/auth/logout` — needed for the auth flow itself;
/// - `/auth/register` — exempted so the handler can return its informative
///   `registration_disabled` 403 (the same code public-mode deployments use
///   when `ALLOW_SELF_REGISTER=false`); the handler itself force-blocks the
///   request in private mode, so no account ever gets created here.
///
/// Everything else demands a valid session cookie or bearer token.
fn is_public_in_private_mode(path: &str) -> bool {
    const ANONYMOUS_PATHS: [&str; 5] = [
        "/api/v1/health",
        "/api/v1/auth/config",
        "/api/v1/auth/login",
        "/api/v1/auth/logout",
        "/api/v1/auth/register",
    ];
    ANONYMOUS_PATHS.contains(&path)
}
|
||||||
|
|
||||||
|
/// Site-wide auth gate for `PRIVATE_MODE=true`. With the flag off this
|
||||||
|
/// is a no-op pass-through, so public deployments take no extra DB
|
||||||
|
/// hit. With it on, the guard reuses [`CurrentUser`] — the same
|
||||||
|
/// session-cookie-then-bearer-token logic the per-handler extractor
|
||||||
|
/// uses — so the two paths can never drift.
|
||||||
|
async fn private_mode_guard(
|
||||||
|
State(state): State<AppState>,
|
||||||
|
req: Request,
|
||||||
|
next: Next,
|
||||||
|
) -> Result<Response, AppError> {
|
||||||
|
if !state.auth.private_mode {
|
||||||
|
return Ok(next.run(req).await);
|
||||||
|
}
|
||||||
|
if is_public_in_private_mode(req.uri().path()) {
|
||||||
|
return Ok(next.run(req).await);
|
||||||
|
}
|
||||||
|
let (mut parts, body) = req.into_parts();
|
||||||
|
match CurrentUser::from_request_parts(&mut parts, &state).await {
|
||||||
|
Ok(_) => {
|
||||||
|
let req = Request::from_parts(parts, body);
|
||||||
|
Ok(next.run(req).await)
|
||||||
|
}
|
||||||
|
Err(_) => Err(AppError::Unauthenticated),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn cors_layer(allowed_origins: &[String]) -> CorsLayer {
|
pub(crate) fn cors_layer(allowed_origins: &[String]) -> CorsLayer {
|
||||||
if allowed_origins.is_empty() {
|
if allowed_origins.is_empty() {
|
||||||
// Same-origin only — no CORS headers emitted.
|
// Same-origin only — no CORS headers emitted.
|
||||||
|
|||||||
@@ -1,11 +1,19 @@
|
|||||||
//! `CurrentUser` axum extractor.
|
//! Auth extractors.
|
||||||
//!
|
//!
|
||||||
//! Resolves a request to a logged-in user by trying, in order:
|
//! Three extractors are available, in increasing strictness:
|
||||||
//! 1. a `mangalord_session` cookie (session lookup by `sha256(value)`);
|
|
||||||
//! 2. an `Authorization: Bearer <token>` header (api_token lookup).
|
|
||||||
//!
|
//!
|
||||||
//! Both paths look up by hash, never by raw value. Failure to resolve
|
//! - [`CurrentUser`] — accepts either a session cookie or an
|
||||||
//! either way returns 401 via `AppError::Unauthenticated`.
|
//! `Authorization: Bearer <token>` header. Used by ordinary
|
||||||
|
//! authenticated endpoints where bot tokens are first-class clients.
|
||||||
|
//! - [`CurrentSessionUser`] — accepts only the session cookie. Used as
|
||||||
|
//! the substrate for admin extraction so bot tokens cannot authenticate
|
||||||
|
//! as the admin (see [`RequireAdmin`]).
|
||||||
|
//! - [`RequireAdmin`] — composes over [`CurrentSessionUser`] and
|
||||||
|
//! additionally requires `user.is_admin`. Returns 403 for
|
||||||
|
//! authenticated-but-not-admin, 401 otherwise.
|
||||||
|
//!
|
||||||
|
//! All lookups go by `sha256(raw_token)` — the raw value is never stored
|
||||||
|
//! in the database.
|
||||||
|
|
||||||
use axum::async_trait;
|
use axum::async_trait;
|
||||||
use axum::extract::FromRequestParts;
|
use axum::extract::FromRequestParts;
|
||||||
@@ -61,3 +69,54 @@ impl FromRequestParts<AppState> for CurrentUser {
|
|||||||
Err(AppError::Unauthenticated)
|
Err(AppError::Unauthenticated)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Cookie-only authentication. Bot/API tokens are explicitly NOT accepted
|
||||||
|
/// here — this is the substrate for [`RequireAdmin`] and exists precisely
|
||||||
|
/// to keep admin authority out of bearer-token reach.
|
||||||
|
pub struct CurrentSessionUser(pub User);
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl FromRequestParts<AppState> for CurrentSessionUser {
|
||||||
|
type Rejection = AppError;
|
||||||
|
|
||||||
|
async fn from_request_parts(
|
||||||
|
parts: &mut Parts,
|
||||||
|
state: &AppState,
|
||||||
|
) -> Result<Self, Self::Rejection> {
|
||||||
|
let jar = CookieJar::from_headers(&parts.headers);
|
||||||
|
let cookie = jar
|
||||||
|
.get(SESSION_COOKIE_NAME)
|
||||||
|
.ok_or(AppError::Unauthenticated)?;
|
||||||
|
let hash = hash_token(cookie.value());
|
||||||
|
let session = repo::session::find_active(&state.db, &hash)
|
||||||
|
.await?
|
||||||
|
.ok_or(AppError::Unauthenticated)?;
|
||||||
|
let user = repo::user::find_by_id(&state.db, session.user_id)
|
||||||
|
.await?
|
||||||
|
.ok_or(AppError::Unauthenticated)?;
|
||||||
|
Ok(CurrentSessionUser(user))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Admin-only. Composes over [`CurrentSessionUser`] so bot tokens are
|
||||||
|
/// rejected at the auth step (401) rather than the role step (403).
|
||||||
|
/// The user row is re-read every request, so demotion takes effect on
|
||||||
|
/// the very next call without needing to purge sessions.
|
||||||
|
pub struct RequireAdmin(pub User);
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl FromRequestParts<AppState> for RequireAdmin {
|
||||||
|
type Rejection = AppError;
|
||||||
|
|
||||||
|
async fn from_request_parts(
|
||||||
|
parts: &mut Parts,
|
||||||
|
state: &AppState,
|
||||||
|
) -> Result<Self, Self::Rejection> {
|
||||||
|
let CurrentSessionUser(user) =
|
||||||
|
CurrentSessionUser::from_request_parts(parts, state).await?;
|
||||||
|
if !user.is_admin {
|
||||||
|
return Err(AppError::Forbidden);
|
||||||
|
}
|
||||||
|
Ok(RequireAdmin(user))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -7,4 +7,5 @@
|
|||||||
|
|
||||||
pub mod extractor;
|
pub mod extractor;
|
||||||
pub mod password;
|
pub mod password;
|
||||||
|
pub mod rate_limit;
|
||||||
pub mod token;
|
pub mod token;
|
||||||
|
|||||||
179
backend/src/auth/rate_limit.rs
Normal file
179
backend/src/auth/rate_limit.rs
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
//! Per-process token-bucket rate limiter for the auth endpoints.
|
||||||
|
//!
|
||||||
|
//! Protects `/auth/login`, `/auth/register`, and `/auth/me/password`
|
||||||
|
//! from credential stuffing / password spraying / username probing.
|
||||||
|
//!
|
||||||
|
//! The current deploy puts SvelteKit's hooks.server.ts proxy in front
|
||||||
|
//! of axum without forwarding the original client IP (no
|
||||||
|
//! `X-Forwarded-For`), so per-IP buckets would all collapse to the
|
||||||
|
//! proxy container's address. Until the proxy learns to forward the
|
||||||
|
//! peer address, a single global bucket gives equivalent protection
|
||||||
|
//! against mass-attack patterns and trades a small DoS surface
|
||||||
|
//! (legitimate users sharing the limit) for simplicity.
|
||||||
|
//!
|
||||||
|
//! Each `AppState` carries its own [`AuthRateLimiter`] instance, so
|
||||||
|
//! tests run in isolated buckets and won't bleed across `#[sqlx::test]`
|
||||||
|
//! cases that share a process.
|
||||||
|
|
||||||
|
use std::sync::Mutex;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
/// Knobs for the auth rate limiter.
///
/// Setting `per_sec` to `0` switches the limiter off entirely — used by the
/// test harness and by anyone who wants to opt out via env config.
#[derive(Clone, Copy, Debug)]
pub struct RateLimitConfig {
    /// Tokens added back to the bucket per second; `0` disables limiting.
    pub per_sec: u32,
    /// Upper bound on stored tokens, and the bucket's initial fill.
    pub burst: u32,
}
|
||||||
|
|
||||||
|
impl Default for RateLimitConfig {
|
||||||
|
/// Disabled by default. The production `AuthConfig::from_env`
|
||||||
|
/// overrides to a real limit; the test harness keeps the default
|
||||||
|
/// so existing tests don't flake against shared buckets.
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
per_sec: 0,
|
||||||
|
burst: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Production sustained rate: 5 requests per second.
///
/// Together with [`PRODUCTION_BURST`] this is tight enough to make brute
/// force impractical, yet loose enough that a real user mistyping their
/// password three times in a row doesn't hit it.
pub const PRODUCTION_PER_SEC: u32 = 5;

/// Production burst allowance: up to 10 requests before the sustained rate
/// applies.
pub const PRODUCTION_BURST: u32 = 10;
|
||||||
|
|
||||||
|
/// Mutable state of the token bucket.
struct Bucket {
    /// Tokens currently available; fractional because refill is continuous.
    tokens: f64,
    /// When `tokens` was last brought up to date.
    last_refill: Instant,
}
|
||||||
|
|
||||||
|
/// What [`AuthRateLimiter::try_acquire`] decided for one request.
///
/// On `Denied`, `retry_after_secs` is meant for a `Retry-After: N` header
/// (RFC 6585 §4) so well-behaved clients back off correctly rather than
/// retrying in a tight loop.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AcquireResult {
    /// A token was consumed; the request may proceed.
    Allowed,
    /// No token was available; retry after the given number of seconds.
    Denied { retry_after_secs: u64 },
}
|
||||||
|
|
||||||
|
/// Single-bucket token-bucket limiter. `try_acquire` is cheap (one
|
||||||
|
/// mutex acquire, no allocations) so the auth path doesn't pay a real
|
||||||
|
/// cost for the check.
|
||||||
|
pub struct AuthRateLimiter {
|
||||||
|
cfg: RateLimitConfig,
|
||||||
|
bucket: Mutex<Bucket>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AuthRateLimiter {
|
||||||
|
pub fn new(cfg: RateLimitConfig) -> Self {
|
||||||
|
Self {
|
||||||
|
cfg,
|
||||||
|
bucket: Mutex::new(Bucket {
|
||||||
|
tokens: cfg.burst as f64,
|
||||||
|
last_refill: Instant::now(),
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Consume one token if available. Returns `Denied` with a
|
||||||
|
/// rounded-up seconds-until-refill so the caller can emit a
|
||||||
|
/// `Retry-After` header.
|
||||||
|
pub fn try_acquire(&self) -> AcquireResult {
|
||||||
|
if self.cfg.per_sec == 0 {
|
||||||
|
return AcquireResult::Allowed;
|
||||||
|
}
|
||||||
|
let now = Instant::now();
|
||||||
|
let mut bucket = self.bucket.lock().expect("rate limiter mutex poisoned");
|
||||||
|
let elapsed = now.duration_since(bucket.last_refill).as_secs_f64();
|
||||||
|
bucket.tokens =
|
||||||
|
(bucket.tokens + elapsed * f64::from(self.cfg.per_sec)).min(f64::from(self.cfg.burst));
|
||||||
|
bucket.last_refill = now;
|
||||||
|
if bucket.tokens >= 1.0 {
|
||||||
|
bucket.tokens -= 1.0;
|
||||||
|
AcquireResult::Allowed
|
||||||
|
} else {
|
||||||
|
// ceil((1 - tokens) / per_sec), minimum 1 — a `Retry-After: 0`
|
||||||
|
// would tell clients to retry immediately, which is what we're
|
||||||
|
// actively trying to discourage.
|
||||||
|
let deficit = 1.0 - bucket.tokens;
|
||||||
|
let wait_secs = (deficit / f64::from(self.cfg.per_sec)).ceil() as u64;
|
||||||
|
AcquireResult::Denied {
|
||||||
|
retry_after_secs: wait_secs.max(1),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// `per_sec == 0` means "limiter off": nothing is ever denied.
    #[test]
    fn disabled_limiter_always_allows() {
        let rl = AuthRateLimiter::new(RateLimitConfig {
            per_sec: 0,
            burst: 0,
        });
        for _ in 0..1000 {
            assert_eq!(rl.try_acquire(), AcquireResult::Allowed);
        }
    }

    /// A full bucket lets exactly `burst` requests through back-to-back.
    #[test]
    fn burst_lets_through_initial_window_then_blocks() {
        // 1 token/sec refill, burst 3 → the first three pass; the fourth,
        // issued immediately afterwards, finds less than one token.
        let rl = AuthRateLimiter::new(RateLimitConfig {
            per_sec: 1,
            burst: 3,
        });
        assert_eq!(rl.try_acquire(), AcquireResult::Allowed);
        assert_eq!(rl.try_acquire(), AcquireResult::Allowed);
        assert_eq!(rl.try_acquire(), AcquireResult::Allowed);
        match rl.try_acquire() {
            AcquireResult::Denied { retry_after_secs } => {
                // Bucket is at ~0 tokens, refill rate 1/sec → ~1s wait.
                assert!(
                    retry_after_secs >= 1,
                    "retry_after must be at least 1s, got {retry_after_secs}"
                );
            }
            AcquireResult::Allowed => panic!("fourth request must be denied"),
        }
    }

    /// Tokens come back with elapsed wall-clock time.
    #[test]
    fn tokens_refill_over_time() {
        // 10/sec → one token takes 100ms; sleep 150ms to leave headroom.
        let rl = AuthRateLimiter::new(RateLimitConfig {
            per_sec: 10,
            burst: 1,
        });
        assert_eq!(rl.try_acquire(), AcquireResult::Allowed);
        assert!(matches!(rl.try_acquire(), AcquireResult::Denied { .. }));
        std::thread::sleep(std::time::Duration::from_millis(150));
        assert_eq!(
            rl.try_acquire(),
            AcquireResult::Allowed,
            "token should have refilled"
        );
    }

    /// The advertised wait shrinks with a faster refill, but never below 1s.
    #[test]
    fn retry_after_scales_inversely_with_refill_rate() {
        // 1/sec → wait ~1s after burst exhausted.
        let slow = AuthRateLimiter::new(RateLimitConfig {
            per_sec: 1,
            burst: 1,
        });
        assert_eq!(slow.try_acquire(), AcquireResult::Allowed);
        match slow.try_acquire() {
            AcquireResult::Denied { retry_after_secs } => assert_eq!(retry_after_secs, 1),
            _ => panic!("expected Denied"),
        }

        // 10/sec → wait <1s, but we clamp to a minimum of 1s.
        let fast = AuthRateLimiter::new(RateLimitConfig {
            per_sec: 10,
            burst: 1,
        });
        assert_eq!(fast.try_acquire(), AcquireResult::Allowed);
        match fast.try_acquire() {
            AcquireResult::Denied { retry_after_secs } => assert_eq!(retry_after_secs, 1),
            _ => panic!("expected Denied"),
        }
    }
}
|
||||||
497
backend/src/bin/crawler.rs
Normal file
497
backend/src/bin/crawler.rs
Normal file
@@ -0,0 +1,497 @@
|
|||||||
|
//! Crawler binary.
|
||||||
|
//!
|
||||||
|
//! Now an ops escape hatch sitting alongside the in-process daemon: walks
|
||||||
|
//! the source's manga listing (all pages), fetches each manga's metadata +
|
||||||
|
//! chapter list, downloads covers, reconciles chapters — and then, for any
|
||||||
|
//! chapter belonging to a bookmarked manga whose `page_count` is still 0,
|
||||||
|
//! fetches the chapter pages inline. The daemon does the same work through
|
||||||
|
//! `crawler_jobs`; the CLI is kept around for force-refetches and manual
|
||||||
|
//! backfills.
|
||||||
|
//!
|
||||||
|
//! Configuration mirrors the daemon's `CRAWLER_*` env vars (see
|
||||||
|
//! `crate::config::CrawlerConfig`) plus the CLI-only:
|
||||||
|
//! - **Start URL**: first CLI positional arg, else `$CRAWLER_START_URL`.
|
||||||
|
//! - **Skip chapters / chapter content / force re-fetch / keep browser**:
|
||||||
|
//! `CRAWLER_SKIP_CHAPTERS`, `CRAWLER_SKIP_CHAPTER_CONTENT`,
|
||||||
|
//! `CRAWLER_FORCE_REFETCH_CHAPTERS`, `CRAWLER_KEEP_BROWSER_OPEN`.
|
||||||
|
//! - **Limit**: `CRAWLER_LIMIT` (max manga detail fetches per run).
|
||||||
|
//!
|
||||||
|
//! See `crawler::pipeline::run_metadata_pass` for the shared metadata
|
||||||
|
//! flow.
|
||||||
|
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use anyhow::{anyhow, Context};
|
||||||
|
use futures_util::stream::{self, StreamExt};
|
||||||
|
use mangalord::crawler::browser::{BrowserMode, LaunchOptions};
|
||||||
|
use mangalord::crawler::browser_manager::{self, BrowserManager};
|
||||||
|
use mangalord::crawler::content::{self, SyncOutcome};
|
||||||
|
use mangalord::crawler::pipeline;
|
||||||
|
use mangalord::crawler::rate_limit::HostRateLimiters;
|
||||||
|
use mangalord::crawler::session;
|
||||||
|
use mangalord::storage::{LocalStorage, Storage};
|
||||||
|
use sqlx::postgres::PgPoolOptions;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tracing_subscriber::EnvFilter;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> anyhow::Result<()> {
|
||||||
|
dotenvy::dotenv().ok();
|
||||||
|
tracing_subscriber::fmt()
|
||||||
|
.with_env_filter(
|
||||||
|
EnvFilter::try_from_default_env().unwrap_or_else(|_| {
|
||||||
|
"info,mangalord=debug,chromiumoxide::conn=off,chromiumoxide::handler=off"
|
||||||
|
.into()
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.init();
|
||||||
|
|
||||||
|
let start_url = resolve_start_url()?;
|
||||||
|
let database_url = std::env::var("DATABASE_URL")
|
||||||
|
.map_err(|_| anyhow!("DATABASE_URL must be set"))?;
|
||||||
|
let storage_dir: PathBuf = std::env::var("STORAGE_DIR")
|
||||||
|
.unwrap_or_else(|_| "./data/storage".to_string())
|
||||||
|
.into();
|
||||||
|
let rate_ms = env_u64("CRAWLER_RATE_MS", 1000);
|
||||||
|
let cdn_host = std::env::var("CRAWLER_CDN_HOST")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty());
|
||||||
|
let cdn_rate_ms = env_u64("CRAWLER_CDN_RATE_MS", rate_ms);
|
||||||
|
let limit = env_u64("CRAWLER_LIMIT", 0) as usize;
|
||||||
|
let skip_chapters = env_bool("CRAWLER_SKIP_CHAPTERS", false);
|
||||||
|
let skip_chapter_content = env_bool("CRAWLER_SKIP_CHAPTER_CONTENT", false);
|
||||||
|
let chapter_workers = env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize;
|
||||||
|
let force_refetch_chapters = env_bool("CRAWLER_FORCE_REFETCH_CHAPTERS", false);
|
||||||
|
let phpsessid = std::env::var("CRAWLER_PHPSESSID")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty());
|
||||||
|
let cookie_domain = std::env::var("CRAWLER_COOKIE_DOMAIN")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty())
|
||||||
|
.or_else(|| session::registrable_domain(&start_url));
|
||||||
|
let user_agent = std::env::var("CRAWLER_USER_AGENT")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty());
|
||||||
|
let proxy_url = std::env::var("CRAWLER_PROXY")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty());
|
||||||
|
let tor_control_url = std::env::var("CRAWLER_TOR_CONTROL_URL")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty());
|
||||||
|
let tor_control_password = std::env::var("CRAWLER_TOR_CONTROL_PASSWORD")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty());
|
||||||
|
let tor_control_cookie_path = std::env::var("CRAWLER_TOR_CONTROL_COOKIE_PATH")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty())
|
||||||
|
.map(std::path::PathBuf::from);
|
||||||
|
let tor_recircuit_max_attempts: u32 = std::env::var("CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS")
|
||||||
|
.ok()
|
||||||
|
.and_then(|s| s.parse().ok())
|
||||||
|
.unwrap_or(3)
|
||||||
|
.max(1);
|
||||||
|
let keep_browser_open = env_bool("CRAWLER_KEEP_BROWSER_OPEN", false);
|
||||||
|
|
||||||
|
let db = PgPoolOptions::new()
|
||||||
|
.max_connections(5)
|
||||||
|
.connect(&database_url)
|
||||||
|
.await
|
||||||
|
.context("connect to database")?;
|
||||||
|
sqlx::migrate!("./migrations").run(&db).await?;
|
||||||
|
|
||||||
|
let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(&storage_dir));
|
||||||
|
|
||||||
|
let cookie_jar = Arc::new(reqwest::cookie::Jar::default());
|
||||||
|
if let (Some(sid), Some(domain)) = (&phpsessid, &cookie_domain) {
|
||||||
|
let cookie_str = format!("PHPSESSID={sid}; Domain={domain}; Path=/");
|
||||||
|
let seed_url =
|
||||||
|
reqwest::Url::parse(&start_url).context("parse start URL for cookie seed")?;
|
||||||
|
cookie_jar.add_cookie_str(&cookie_str, &seed_url);
|
||||||
|
tracing::info!(domain, "seeded PHPSESSID into reqwest cookie jar");
|
||||||
|
}
|
||||||
|
let mut http_builder = reqwest::Client::builder()
|
||||||
|
.timeout(Duration::from_secs(30))
|
||||||
|
.no_proxy()
|
||||||
|
.cookie_provider(cookie_jar);
|
||||||
|
if let Some(ua) = &user_agent {
|
||||||
|
http_builder = http_builder.user_agent(ua);
|
||||||
|
}
|
||||||
|
if let Some(proxy) = &proxy_url {
|
||||||
|
http_builder = http_builder
|
||||||
|
.proxy(reqwest::Proxy::all(proxy).with_context(|| format!("parse proxy URL: {proxy}"))?);
|
||||||
|
}
|
||||||
|
let http = http_builder.build().context("build http client")?;
|
||||||
|
|
||||||
|
let mut options = LaunchOptions::from_env();
|
||||||
|
if let Some(proxy) = &proxy_url {
|
||||||
|
let chromium_proxy = mangalord::crawler::url_utils::chromium_proxy_arg(proxy);
|
||||||
|
options.extra_args.push(format!("--proxy-server={chromium_proxy}"));
|
||||||
|
}
|
||||||
|
let keep_open = match (keep_browser_open, options.mode) {
|
||||||
|
(true, BrowserMode::Headed) => true,
|
||||||
|
(true, BrowserMode::Headless) => {
|
||||||
|
tracing::warn!(
|
||||||
|
"CRAWLER_KEEP_BROWSER_OPEN ignored in headless mode (no window to inspect)"
|
||||||
|
);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
_ => false,
|
||||||
|
};
|
||||||
|
tracing::info!(
|
||||||
|
?options,
|
||||||
|
%start_url,
|
||||||
|
rate_ms,
|
||||||
|
cdn_host = ?cdn_host,
|
||||||
|
cdn_rate_ms,
|
||||||
|
limit,
|
||||||
|
skip_chapters,
|
||||||
|
skip_chapter_content,
|
||||||
|
chapter_workers,
|
||||||
|
force_refetch_chapters,
|
||||||
|
phpsessid_set = phpsessid.is_some(),
|
||||||
|
cookie_domain = ?cookie_domain,
|
||||||
|
user_agent = ?user_agent,
|
||||||
|
proxy = ?proxy_url,
|
||||||
|
keep_open,
|
||||||
|
storage_dir = %storage_dir.display(),
|
||||||
|
"starting crawler"
|
||||||
|
);
|
||||||
|
|
||||||
|
let tor = mangalord::crawler::tor::TorController::from_parts(
|
||||||
|
tor_control_url.as_deref(),
|
||||||
|
tor_control_password.as_deref(),
|
||||||
|
tor_control_cookie_path.as_deref(),
|
||||||
|
)
|
||||||
|
.context("build TorController from CRAWLER_TOR_CONTROL_* env")?
|
||||||
|
.map(Arc::new);
|
||||||
|
if let Some(t) = &tor {
|
||||||
|
tracing::info!(?t, "TOR control configured");
|
||||||
|
}
|
||||||
|
|
||||||
|
// BrowserManager with idle_timeout = ZERO so the CLI keeps Chromium
|
||||||
|
// alive for the entire run — same lifecycle as the old direct
|
||||||
|
// `browser::launch()` flow. on_launch re-injects PHPSESSID + runs the
|
||||||
|
// session probe; bad cookies fail fast before any real work happens.
|
||||||
|
let on_launch: browser_manager::OnLaunch = match (&phpsessid, &cookie_domain) {
|
||||||
|
(Some(sid), Some(domain)) => {
|
||||||
|
let sid = sid.clone();
|
||||||
|
let domain = domain.clone();
|
||||||
|
let start_url_clone = start_url.clone();
|
||||||
|
let tor_for_launch = tor.as_ref().map(Arc::clone);
|
||||||
|
Arc::new(move |browser| {
|
||||||
|
let sid = sid.clone();
|
||||||
|
let domain = domain.clone();
|
||||||
|
let start_url = start_url_clone.clone();
|
||||||
|
let tor_for_launch = tor_for_launch.as_ref().map(Arc::clone);
|
||||||
|
Box::pin(async move {
|
||||||
|
session::inject_phpsessid(&browser, &sid, &domain)
|
||||||
|
.await
|
||||||
|
.context("inject_phpsessid")?;
|
||||||
|
session::verify_session_with_recircuit(
|
||||||
|
&browser,
|
||||||
|
&start_url,
|
||||||
|
tor_for_launch.as_deref(),
|
||||||
|
tor_recircuit_max_attempts,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.context("verify_session")?;
|
||||||
|
Ok(())
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
_ => browser_manager::noop_on_launch(),
|
||||||
|
};
|
||||||
|
let session_ready = phpsessid.is_some() && cookie_domain.is_some();
|
||||||
|
let manager = BrowserManager::new(options, Duration::ZERO, on_launch);
|
||||||
|
|
||||||
|
let result = run(
|
||||||
|
Arc::clone(&manager),
|
||||||
|
&db,
|
||||||
|
Arc::clone(&storage),
|
||||||
|
&http,
|
||||||
|
&start_url,
|
||||||
|
rate_ms,
|
||||||
|
cdn_host.as_deref(),
|
||||||
|
cdn_rate_ms,
|
||||||
|
limit,
|
||||||
|
skip_chapters,
|
||||||
|
skip_chapter_content || !session_ready,
|
||||||
|
chapter_workers,
|
||||||
|
force_refetch_chapters,
|
||||||
|
tor.clone(),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
if keep_open {
|
||||||
|
tracing::info!(
|
||||||
|
"crawler finished; browser kept open. Press Ctrl+C to close and exit."
|
||||||
|
);
|
||||||
|
let _ = tokio::signal::ctrl_c().await;
|
||||||
|
tracing::info!("Ctrl+C received; closing browser");
|
||||||
|
}
|
||||||
|
manager.shutdown().await;
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
async fn run(
|
||||||
|
manager: Arc<BrowserManager>,
|
||||||
|
db: &PgPool,
|
||||||
|
storage: Arc<dyn Storage>,
|
||||||
|
http: &reqwest::Client,
|
||||||
|
start_url: &str,
|
||||||
|
rate_ms: u64,
|
||||||
|
cdn_host: Option<&str>,
|
||||||
|
cdn_rate_ms: u64,
|
||||||
|
limit: usize,
|
||||||
|
skip_chapters: bool,
|
||||||
|
skip_chapter_content: bool,
|
||||||
|
chapter_workers: usize,
|
||||||
|
force_refetch_chapters: bool,
|
||||||
|
tor: Option<Arc<mangalord::crawler::tor::TorController>>,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let mut rate = HostRateLimiters::new(Duration::from_millis(rate_ms));
|
||||||
|
if let Some(host) = cdn_host {
|
||||||
|
rate = rate.with_override(host, Duration::from_millis(cdn_rate_ms));
|
||||||
|
}
|
||||||
|
let rate = Arc::new(rate);
|
||||||
|
|
||||||
|
// SSRF defence: only download from the catalog host + CDN host
|
||||||
|
// (plus optional CRAWLER_DOWNLOAD_ALLOWLIST extras), and cap
|
||||||
|
// single-image downloads at CRAWLER_MAX_IMAGE_BYTES bytes.
|
||||||
|
// CRAWLER_ALLOW_ANY_HOST=true short-circuits the host check for
|
||||||
|
// sharded-CDN sources; private-IP and scheme guards still apply.
|
||||||
|
let allowlist = if env_bool("CRAWLER_ALLOW_ANY_HOST", false) {
|
||||||
|
mangalord::crawler::safety::DownloadAllowlist::allow_any()
|
||||||
|
} else {
|
||||||
|
let mut allow = mangalord::crawler::safety::DownloadAllowlist::new();
|
||||||
|
if let Ok(parsed) = reqwest::Url::parse(start_url) {
|
||||||
|
if let Some(h) = parsed.host_str() {
|
||||||
|
allow = allow.allow(h);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(host) = cdn_host {
|
||||||
|
allow = allow.allow(host);
|
||||||
|
}
|
||||||
|
if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") {
|
||||||
|
for piece in extras.split(',') {
|
||||||
|
let trimmed = piece.trim();
|
||||||
|
if !trimmed.is_empty() {
|
||||||
|
allow = allow.allow(trimmed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
allow
|
||||||
|
};
|
||||||
|
let max_image_bytes: usize = std::env::var("CRAWLER_MAX_IMAGE_BYTES")
|
||||||
|
.ok()
|
||||||
|
.and_then(|s| s.parse().ok())
|
||||||
|
.unwrap_or(mangalord::crawler::safety::DEFAULT_MAX_IMAGE_BYTES);
|
||||||
|
let allowlist = Arc::new(allowlist);
|
||||||
|
|
||||||
|
let stats = pipeline::run_metadata_pass(
|
||||||
|
manager.as_ref(),
|
||||||
|
db,
|
||||||
|
storage.as_ref(),
|
||||||
|
http,
|
||||||
|
rate.as_ref(),
|
||||||
|
start_url,
|
||||||
|
limit,
|
||||||
|
skip_chapters,
|
||||||
|
allowlist.as_ref(),
|
||||||
|
max_image_bytes,
|
||||||
|
// Circuit-breaker disabled for the operator-driven CLI: a manual
|
||||||
|
// sweep should push through transient failures, not self-abort.
|
||||||
|
0,
|
||||||
|
// No live status surface for the one-shot CLI.
|
||||||
|
None,
|
||||||
|
tor.as_deref(),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
tracing::info!(?stats, "metadata pass complete");
|
||||||
|
|
||||||
|
if !skip_chapter_content {
|
||||||
|
sync_bookmarked_chapter_content(
|
||||||
|
Arc::clone(&manager),
|
||||||
|
db,
|
||||||
|
Arc::clone(&storage),
|
||||||
|
http,
|
||||||
|
Arc::clone(&rate),
|
||||||
|
"target",
|
||||||
|
chapter_workers,
|
||||||
|
force_refetch_chapters,
|
||||||
|
Arc::clone(&allowlist),
|
||||||
|
max_image_bytes,
|
||||||
|
tor.clone(),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Find every chapter whose manga is bookmarked by at least one user and
|
||||||
|
/// that hasn't been content-synced yet, then fan them out across `workers`
|
||||||
|
/// concurrent tasks. Same as before except the browser comes from a
|
||||||
|
/// BrowserManager lease so it interleaves cleanly with the metadata pass.
|
||||||
|
///
|
||||||
|
/// A `SessionExpired` result aborts the phase.
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
async fn sync_bookmarked_chapter_content(
|
||||||
|
manager: Arc<BrowserManager>,
|
||||||
|
db: &PgPool,
|
||||||
|
storage: Arc<dyn Storage>,
|
||||||
|
http: &reqwest::Client,
|
||||||
|
rate: Arc<HostRateLimiters>,
|
||||||
|
source_id: &str,
|
||||||
|
workers: usize,
|
||||||
|
force_refetch: bool,
|
||||||
|
allowlist: Arc<mangalord::crawler::safety::DownloadAllowlist>,
|
||||||
|
max_image_bytes: usize,
|
||||||
|
tor: Option<Arc<mangalord::crawler::tor::TorController>>,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let pending: Vec<(Uuid, Uuid, String)> = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
SELECT id, manga_id, source_url FROM (
|
||||||
|
SELECT DISTINCT c.id, c.manga_id, c.created_at, cs.source_url
|
||||||
|
FROM chapters c
|
||||||
|
JOIN bookmarks b ON b.manga_id = c.manga_id
|
||||||
|
JOIN chapter_sources cs ON cs.chapter_id = c.id
|
||||||
|
WHERE cs.source_id = $1
|
||||||
|
AND cs.dropped_at IS NULL
|
||||||
|
AND (c.page_count = 0 OR $2)
|
||||||
|
) sub
|
||||||
|
ORDER BY manga_id, created_at ASC
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(source_id)
|
||||||
|
.bind(force_refetch)
|
||||||
|
.fetch_all(db)
|
||||||
|
.await
|
||||||
|
.context("query pending chapter content")?;
|
||||||
|
|
||||||
|
if pending.is_empty() {
|
||||||
|
tracing::info!("chapter content: nothing pending");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
tracing::info!(count = pending.len(), workers, "chapter content phase starting");
|
||||||
|
|
||||||
|
let session_expired = Arc::new(std::sync::atomic::AtomicBool::new(false));
|
||||||
|
let stats = std::sync::Mutex::new(WorkerStats::default());
|
||||||
|
|
||||||
|
stream::iter(pending.into_iter())
|
||||||
|
.for_each_concurrent(workers.max(1), |(chapter_id, manga_id, source_url)| {
|
||||||
|
let session_expired = Arc::clone(&session_expired);
|
||||||
|
let storage = Arc::clone(&storage);
|
||||||
|
let rate = Arc::clone(&rate);
|
||||||
|
let manager = Arc::clone(&manager);
|
||||||
|
let allowlist = Arc::clone(&allowlist);
|
||||||
|
let tor = tor.clone();
|
||||||
|
let stats = &stats;
|
||||||
|
async move {
|
||||||
|
if session_expired.load(std::sync::atomic::Ordering::Relaxed) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let lease = match manager.acquire().await {
|
||||||
|
Ok(l) => l,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::error!(%chapter_id, error = ?e, "browser acquire failed");
|
||||||
|
let mut s = stats.lock().unwrap();
|
||||||
|
s.failed += 1;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let outcome = content::sync_chapter_content(
|
||||||
|
&lease,
|
||||||
|
db,
|
||||||
|
storage.as_ref(),
|
||||||
|
http,
|
||||||
|
rate.as_ref(),
|
||||||
|
chapter_id,
|
||||||
|
manga_id,
|
||||||
|
&source_url,
|
||||||
|
force_refetch,
|
||||||
|
allowlist.as_ref(),
|
||||||
|
max_image_bytes,
|
||||||
|
tor.as_deref(),
|
||||||
|
// CLI one-shot — no live status surface.
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
drop(lease);
|
||||||
|
let mut s = stats.lock().unwrap();
|
||||||
|
match outcome {
|
||||||
|
Ok(SyncOutcome::Fetched { pages }) => {
|
||||||
|
tracing::info!(%chapter_id, pages, "chapter content fetched");
|
||||||
|
s.fetched += 1;
|
||||||
|
}
|
||||||
|
Ok(SyncOutcome::Skipped) => s.skipped += 1,
|
||||||
|
Ok(SyncOutcome::SessionExpired) => {
|
||||||
|
tracing::error!(
|
||||||
|
%chapter_id,
|
||||||
|
"session expired mid-run — refresh CRAWLER_PHPSESSID and re-run"
|
||||||
|
);
|
||||||
|
session_expired
|
||||||
|
.store(true, std::sync::atomic::Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
%chapter_id, error = ?e, "chapter content sync failed"
|
||||||
|
);
|
||||||
|
s.failed += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let total = stats.into_inner().unwrap();
|
||||||
|
tracing::info!(
|
||||||
|
fetched = total.fetched,
|
||||||
|
skipped = total.skipped,
|
||||||
|
failed = total.failed,
|
||||||
|
"chapter content phase done"
|
||||||
|
);
|
||||||
|
|
||||||
|
if session_expired.load(std::sync::atomic::Ordering::Relaxed) {
|
||||||
|
anyhow::bail!("session expired during chapter content phase");
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Per-phase tally of chapter-content outcomes, shared by the worker tasks.
#[derive(Default, Clone, Copy)]
struct WorkerStats {
    /// Chapters whose content was fetched.
    fetched: usize,
    /// Chapters the sync reported as skipped.
    skipped: usize,
    /// Chapters that failed (sync error or browser lease not acquired).
    failed: usize,
}
|
||||||
|
|
||||||
|
fn resolve_start_url() -> anyhow::Result<String> {
|
||||||
|
if let Some(arg) = std::env::args().nth(1) {
|
||||||
|
return Ok(arg);
|
||||||
|
}
|
||||||
|
std::env::var("CRAWLER_START_URL").map_err(|_| {
|
||||||
|
anyhow!(
|
||||||
|
"start URL is required — pass as first CLI arg or set $CRAWLER_START_URL"
|
||||||
|
)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read `name` from the environment as a `u64`.
///
/// Surrounding whitespace is ignored, so a value such as `"42 "` (easy to
/// produce in an env file) is honoured instead of silently discarded.
/// Returns `default` when the variable is unset, not valid Unicode, or does
/// not parse as a `u64`.
fn env_u64(name: &str, default: u64) -> u64 {
    std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse().ok())
        .unwrap_or(default)
}
|
||||||
|
|
||||||
|
/// Read `name` from the environment as a boolean flag.
///
/// `1`, `true` and `yes` mean `true`; `0`, `false` and `no` mean `false`.
/// Matching ignores ASCII case and surrounding whitespace, so `True`,
/// `YES` or `" false "` are honoured rather than silently replaced by
/// `default`. Any other value — or an unset / non-Unicode variable —
/// yields `default`.
fn env_bool(name: &str, default: bool) -> bool {
    match std::env::var(name) {
        Ok(raw) => match raw.trim().to_ascii_lowercase().as_str() {
            "1" | "true" | "yes" => true,
            "0" | "false" | "no" => false,
            _ => default,
        },
        Err(_) => default,
    }
}
|
||||||
|
|
||||||
@@ -1,10 +1,32 @@
|
|||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use chrono::NaiveTime;
|
||||||
|
use chrono_tz::Tz;
|
||||||
|
|
||||||
|
use crate::crawler::browser::LaunchOptions;
|
||||||
|
use crate::crawler::safety::{DownloadAllowlist, DEFAULT_MAX_IMAGE_BYTES};
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct AuthConfig {
|
pub struct AuthConfig {
|
||||||
pub cookie_secure: bool,
|
pub cookie_secure: bool,
|
||||||
pub cookie_domain: Option<String>,
|
pub cookie_domain: Option<String>,
|
||||||
pub session_ttl_days: i64,
|
pub session_ttl_days: i64,
|
||||||
|
pub rate_limit: crate::auth::rate_limit::RateLimitConfig,
|
||||||
|
/// When `false`, `POST /auth/register` returns 403
|
||||||
|
/// `registration_disabled` and the frontend hides its register
|
||||||
|
/// affordance. Admins can still mint accounts via
|
||||||
|
/// `POST /admin/users`. Defaults to `true` (open registration)
|
||||||
|
/// for backward compatibility.
|
||||||
|
pub allow_self_register: bool,
|
||||||
|
/// When `true`, every API path except a small allowlist
|
||||||
|
/// (`/health`, `/auth/config`, `/auth/login`, `/auth/logout`)
|
||||||
|
/// requires a valid session cookie or bearer token — anonymous
|
||||||
|
/// reads are rejected with 401. Self-registration is also
|
||||||
|
/// force-disabled regardless of [`Self::allow_self_register`]
|
||||||
|
/// so a private instance is locked down with a single switch.
|
||||||
|
/// Defaults to `false` (current public behaviour).
|
||||||
|
pub private_mode: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for AuthConfig {
|
impl Default for AuthConfig {
|
||||||
@@ -13,6 +35,13 @@ impl Default for AuthConfig {
|
|||||||
cookie_secure: true,
|
cookie_secure: true,
|
||||||
cookie_domain: None,
|
cookie_domain: None,
|
||||||
session_ttl_days: 30,
|
session_ttl_days: 30,
|
||||||
|
// Disabled by default so the test harness inherits a
|
||||||
|
// non-throttling limiter. Production `from_env` overrides
|
||||||
|
// to the [`PRODUCTION_PER_SEC`]/[`PRODUCTION_BURST`]
|
||||||
|
// defaults.
|
||||||
|
rate_limit: crate::auth::rate_limit::RateLimitConfig::default(),
|
||||||
|
allow_self_register: true,
|
||||||
|
private_mode: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -45,6 +74,109 @@ pub struct Config {
|
|||||||
pub auth: AuthConfig,
|
pub auth: AuthConfig,
|
||||||
pub upload: UploadConfig,
|
pub upload: UploadConfig,
|
||||||
pub cors_allowed_origins: Vec<String>,
|
pub cors_allowed_origins: Vec<String>,
|
||||||
|
pub crawler: CrawlerConfig,
|
||||||
|
/// `(username, password)` for the admin user provisioned at startup
|
||||||
|
/// when both `ADMIN_USERNAME` and `ADMIN_PASSWORD` are set. `None`
|
||||||
|
/// skips the bootstrap entirely. See `repo::user::bootstrap_admin`
|
||||||
|
/// for the create-vs-promote semantics — notably the password here
|
||||||
|
/// is used only when creating a new row, never to overwrite an
|
||||||
|
/// existing one.
|
||||||
|
pub admin_bootstrap: Option<(String, String)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// All crawler-daemon knobs read from env. Mirrors the env vars the
|
||||||
|
/// `bin/crawler` binary already reads, plus the new daemon-only knobs
|
||||||
|
/// (daily_at, tz, idle_timeout, retention_days, daemon_enabled).
|
||||||
|
///
|
||||||
|
/// `daemon_enabled = false` skips the daemon spawn entirely — used by
|
||||||
|
/// integration tests and dev runs that don't want background activity.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct CrawlerConfig {
|
||||||
|
pub daemon_enabled: bool,
|
||||||
|
pub daily_at: NaiveTime,
|
||||||
|
pub tz: Tz,
|
||||||
|
pub idle_timeout: Duration,
|
||||||
|
pub chapter_workers: usize,
|
||||||
|
pub retention_days: u32,
|
||||||
|
pub start_url: Option<String>,
|
||||||
|
pub rate_ms: u64,
|
||||||
|
pub cdn_host: Option<String>,
|
||||||
|
pub cdn_rate_ms: u64,
|
||||||
|
pub phpsessid: Option<String>,
|
||||||
|
pub cookie_domain: Option<String>,
|
||||||
|
pub user_agent: Option<String>,
|
||||||
|
pub proxy: Option<String>,
|
||||||
|
/// `tcp://host:port`, `host:port`, or bare `host` (default port
|
||||||
|
/// 9051). When `None`, TOR-recircuit-on-transient is disabled and
|
||||||
|
/// the crawler behaves identically to pre-TOR releases.
|
||||||
|
pub tor_control_url: Option<String>,
|
||||||
|
/// HashedControlPassword auth. Used only when
|
||||||
|
/// `tor_control_cookie_path` is `None`.
|
||||||
|
pub tor_control_password: Option<String>,
|
||||||
|
/// Cookie-file auth path (e.g.
|
||||||
|
/// `/var/lib/tor/control_auth_cookie`). Takes precedence over
|
||||||
|
/// password when both are set.
|
||||||
|
pub tor_control_cookie_path: Option<PathBuf>,
|
||||||
|
/// Maximum NEWNYM-and-retry cycles per recircuit-eligible failure.
|
||||||
|
/// Defaults to 3.
|
||||||
|
pub tor_recircuit_max_attempts: u32,
|
||||||
|
pub browser: LaunchOptions,
|
||||||
|
/// Hosts the crawler is allowed to download images / covers from.
|
||||||
|
/// Always seeded with the host of `start_url` and (when set) the
|
||||||
|
/// configured `cdn_host`. Additional hosts can be added via
|
||||||
|
/// `CRAWLER_DOWNLOAD_ALLOWLIST` (comma-separated).
|
||||||
|
pub download_allowlist: DownloadAllowlist,
|
||||||
|
/// Hard upper bound on a single image download. Defaults to 32 MiB.
|
||||||
|
pub max_image_bytes: usize,
|
||||||
|
/// Max manga detail fetches per metadata pass. `0` means no cap
|
||||||
|
/// (full sweep up to the source's own bound). Sourced from
|
||||||
|
/// `CRAWLER_LIMIT`, mirroring the CLI binary.
|
||||||
|
pub manga_limit: usize,
|
||||||
|
/// Hard upper bound on a single chapter-content job dispatch. A job
|
||||||
|
/// exceeding this is acked failed (exponential backoff) instead of
|
||||||
|
/// wedging a worker. Defaults to 600s. `CRAWLER_JOB_TIMEOUT_SECS`.
|
||||||
|
pub job_timeout: Duration,
|
||||||
|
/// Consecutive `fetch_manga` failures that abort a metadata pass
|
||||||
|
/// (circuit-breaker for a source outage). The pass does NOT mark a
|
||||||
|
/// clean exit, so the next tick does a recovery sweep. Defaults to
|
||||||
|
/// 10. `CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES`.
|
||||||
|
pub metadata_max_consecutive_failures: u32,
|
||||||
|
/// Consecutive transient chapter failures (after TOR recircuit is
|
||||||
|
/// exhausted) that trigger an automatic coordinated browser restart.
|
||||||
|
/// Defaults to 3. `CRAWLER_BROWSER_RESTART_THRESHOLD`.
|
||||||
|
pub browser_restart_threshold: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for CrawlerConfig {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
daemon_enabled: false,
|
||||||
|
daily_at: NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
|
||||||
|
tz: Tz::UTC,
|
||||||
|
idle_timeout: Duration::from_secs(600),
|
||||||
|
chapter_workers: 1,
|
||||||
|
retention_days: 7,
|
||||||
|
start_url: None,
|
||||||
|
rate_ms: 1000,
|
||||||
|
cdn_host: None,
|
||||||
|
cdn_rate_ms: 1000,
|
||||||
|
phpsessid: None,
|
||||||
|
cookie_domain: None,
|
||||||
|
user_agent: None,
|
||||||
|
proxy: None,
|
||||||
|
tor_control_url: None,
|
||||||
|
tor_control_password: None,
|
||||||
|
tor_control_cookie_path: None,
|
||||||
|
tor_recircuit_max_attempts: 3,
|
||||||
|
browser: LaunchOptions::headless(),
|
||||||
|
download_allowlist: DownloadAllowlist::new(),
|
||||||
|
max_image_bytes: DEFAULT_MAX_IMAGE_BYTES,
|
||||||
|
manga_limit: 0,
|
||||||
|
job_timeout: Duration::from_secs(600),
|
||||||
|
metadata_max_consecutive_failures: 10,
|
||||||
|
browser_restart_threshold: 3,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Config {
|
impl Config {
|
||||||
@@ -63,6 +195,18 @@ impl Config {
|
|||||||
.ok()
|
.ok()
|
||||||
.filter(|s| !s.is_empty()),
|
.filter(|s| !s.is_empty()),
|
||||||
session_ttl_days: env_i64("SESSION_TTL_DAYS", 30),
|
session_ttl_days: env_i64("SESSION_TTL_DAYS", 30),
|
||||||
|
rate_limit: crate::auth::rate_limit::RateLimitConfig {
|
||||||
|
per_sec: env_u64(
|
||||||
|
"AUTH_RATE_PER_SEC",
|
||||||
|
crate::auth::rate_limit::PRODUCTION_PER_SEC.into(),
|
||||||
|
) as u32,
|
||||||
|
burst: env_u64(
|
||||||
|
"AUTH_RATE_BURST",
|
||||||
|
crate::auth::rate_limit::PRODUCTION_BURST.into(),
|
||||||
|
) as u32,
|
||||||
|
},
|
||||||
|
allow_self_register: env_bool("ALLOW_SELF_REGISTER", true),
|
||||||
|
private_mode: env_bool("PRIVATE_MODE", false),
|
||||||
},
|
},
|
||||||
upload: UploadConfig {
|
upload: UploadConfig {
|
||||||
max_request_bytes: env_usize("MAX_REQUEST_BYTES", 200 * 1024 * 1024),
|
max_request_bytes: env_usize("MAX_REQUEST_BYTES", 200 * 1024 * 1024),
|
||||||
@@ -77,10 +221,142 @@ impl Config {
|
|||||||
.collect()
|
.collect()
|
||||||
})
|
})
|
||||||
.unwrap_or_default(),
|
.unwrap_or_default(),
|
||||||
|
crawler: CrawlerConfig::from_env()?,
|
||||||
|
admin_bootstrap: admin_bootstrap_from_env(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Read the admin bootstrap credentials from the environment.
///
/// Yields `Some((username, password))` only if `ADMIN_USERNAME` and
/// `ADMIN_PASSWORD` are BOTH present and non-empty. A half-configured pair
/// means "no bootstrap", not a hard error, so an operator can comment out
/// one of the two variables without crashing the server.
fn admin_bootstrap_from_env() -> Option<(String, String)> {
    let non_empty = |key: &str| match std::env::var(key) {
        Ok(value) if !value.is_empty() => Some(value),
        _ => None,
    };

    let username = non_empty("ADMIN_USERNAME")?;
    let password = non_empty("ADMIN_PASSWORD")?;
    Some((username, password))
}
|
||||||
|
|
||||||
|
impl CrawlerConfig {
|
||||||
|
pub fn from_env() -> anyhow::Result<Self> {
|
||||||
|
// Parse CRAWLER_DAILY_AT (HH:MM, 24h). Invalid → fail fast.
|
||||||
|
let daily_at = match std::env::var("CRAWLER_DAILY_AT").ok().as_deref() {
|
||||||
|
None | Some("") => NaiveTime::from_hms_opt(0, 0, 0).unwrap(),
|
||||||
|
Some(raw) => NaiveTime::parse_from_str(raw, "%H:%M").map_err(|e| {
|
||||||
|
anyhow::anyhow!("CRAWLER_DAILY_AT must be HH:MM (got {raw:?}): {e}")
|
||||||
|
})?,
|
||||||
|
};
|
||||||
|
let tz: Tz = match std::env::var("CRAWLER_TZ").ok().as_deref() {
|
||||||
|
None | Some("") => Tz::UTC,
|
||||||
|
Some(raw) => raw
|
||||||
|
.parse()
|
||||||
|
.map_err(|e| anyhow::anyhow!("CRAWLER_TZ must be a valid IANA TZ (got {raw:?}): {e}"))?,
|
||||||
|
};
|
||||||
|
let start_url = std::env::var("CRAWLER_START_URL")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty());
|
||||||
|
let cdn_host = std::env::var("CRAWLER_CDN_HOST")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty());
|
||||||
|
let download_allowlist =
|
||||||
|
build_download_allowlist(start_url.as_deref(), cdn_host.as_deref());
|
||||||
|
Ok(Self {
|
||||||
|
daemon_enabled: env_bool("CRAWLER_DAEMON", true),
|
||||||
|
daily_at,
|
||||||
|
tz,
|
||||||
|
idle_timeout: Duration::from_secs(env_u64("CRAWLER_IDLE_TIMEOUT_S", 600)),
|
||||||
|
chapter_workers: env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize,
|
||||||
|
retention_days: env_u64("CRAWLER_JOB_RETENTION_DAYS", 7) as u32,
|
||||||
|
start_url,
|
||||||
|
rate_ms: env_u64("CRAWLER_RATE_MS", 1000),
|
||||||
|
cdn_host,
|
||||||
|
cdn_rate_ms: env_u64("CRAWLER_CDN_RATE_MS", env_u64("CRAWLER_RATE_MS", 1000)),
|
||||||
|
phpsessid: std::env::var("CRAWLER_PHPSESSID")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty()),
|
||||||
|
cookie_domain: std::env::var("CRAWLER_COOKIE_DOMAIN")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty()),
|
||||||
|
user_agent: std::env::var("CRAWLER_USER_AGENT")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty()),
|
||||||
|
proxy: std::env::var("CRAWLER_PROXY")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty()),
|
||||||
|
tor_control_url: std::env::var("CRAWLER_TOR_CONTROL_URL")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty()),
|
||||||
|
tor_control_password: std::env::var("CRAWLER_TOR_CONTROL_PASSWORD")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty()),
|
||||||
|
tor_control_cookie_path: std::env::var("CRAWLER_TOR_CONTROL_COOKIE_PATH")
|
||||||
|
.ok()
|
||||||
|
.filter(|s| !s.trim().is_empty())
|
||||||
|
.map(PathBuf::from),
|
||||||
|
tor_recircuit_max_attempts: env_u64("CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS", 3)
|
||||||
|
.max(1) as u32,
|
||||||
|
browser: LaunchOptions::from_env(),
|
||||||
|
download_allowlist,
|
||||||
|
max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES),
|
||||||
|
manga_limit: env_usize("CRAWLER_LIMIT", 0),
|
||||||
|
job_timeout: Duration::from_secs(env_u64("CRAWLER_JOB_TIMEOUT_SECS", 600).max(1)),
|
||||||
|
metadata_max_consecutive_failures: env_u64(
|
||||||
|
"CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES",
|
||||||
|
10,
|
||||||
|
) as u32,
|
||||||
|
browser_restart_threshold: env_u64("CRAWLER_BROWSER_RESTART_THRESHOLD", 3).max(1)
|
||||||
|
as u32,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build the download allowlist from env. Always includes
|
||||||
|
/// `CRAWLER_START_URL`'s host (so the crawler can fetch covers from
|
||||||
|
/// the catalog itself) and `CRAWLER_CDN_HOST` when set. Additional
|
||||||
|
/// hosts can be supplied via `CRAWLER_DOWNLOAD_ALLOWLIST` (comma-
|
||||||
|
/// separated). Empty by default — meaning the crawler refuses to
|
||||||
|
/// download anything when no source is configured, which is the safe
|
||||||
|
/// fail-closed posture.
|
||||||
|
///
|
||||||
|
/// `CRAWLER_ALLOW_ANY_HOST=true` short-circuits the host enumeration
|
||||||
|
/// for operators whose sources shard across numbered CDN subdomains.
|
||||||
|
/// Scheme + private-IP defenses still apply.
|
||||||
|
fn build_download_allowlist(
|
||||||
|
start_url: Option<&str>,
|
||||||
|
cdn_host: Option<&str>,
|
||||||
|
) -> DownloadAllowlist {
|
||||||
|
if env_bool("CRAWLER_ALLOW_ANY_HOST", false) {
|
||||||
|
return DownloadAllowlist::allow_any();
|
||||||
|
}
|
||||||
|
let mut allow = DownloadAllowlist::new();
|
||||||
|
if let Some(url) = start_url {
|
||||||
|
if let Ok(parsed) = reqwest::Url::parse(url) {
|
||||||
|
if let Some(h) = parsed.host_str() {
|
||||||
|
allow = allow.allow(h);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(host) = cdn_host {
|
||||||
|
allow = allow.allow(host);
|
||||||
|
}
|
||||||
|
if let Ok(extras) = std::env::var("CRAWLER_DOWNLOAD_ALLOWLIST") {
|
||||||
|
for piece in extras.split(',') {
|
||||||
|
let trimmed = piece.trim();
|
||||||
|
if !trimmed.is_empty() {
|
||||||
|
allow = allow.allow(trimmed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
allow
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read `name` from the environment as a `u64`.
///
/// Surrounding whitespace is ignored, so a value such as `"600 "` (easy to
/// produce in an env file) is honoured instead of silently discarded.
/// Returns `default` when the variable is unset, not valid Unicode, or does
/// not parse as a `u64`.
fn env_u64(name: &str, default: u64) -> u64 {
    std::env::var(name)
        .ok()
        .and_then(|raw| raw.trim().parse().ok())
        .unwrap_or(default)
}
|
||||||
|
|
||||||
fn env_bool(name: &str, default: bool) -> bool {
|
fn env_bool(name: &str, default: bool) -> bool {
|
||||||
match std::env::var(name).ok().as_deref() {
|
match std::env::var(name).ok().as_deref() {
|
||||||
Some("1") | Some("true") | Some("TRUE") | Some("yes") => true,
|
Some("1") | Some("true") | Some("TRUE") | Some("yes") => true,
|
||||||
@@ -102,3 +378,92 @@ fn env_usize(name: &str, default: usize) -> usize {
|
|||||||
.and_then(|s| s.parse().ok())
|
.and_then(|s| s.parse().ok())
|
||||||
.unwrap_or(default)
|
.unwrap_or(default)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Mutex;

    // The process environment is global, so env-touching tests take this
    // lock to avoid racing when cargo runs them on parallel threads. A
    // poisoned lock is simply re-acquired: each test sets and clears its
    // variables inside its own guard region.
    static ENV_GUARD: Mutex<()> = Mutex::new(());

    /// Take the env lock, recovering it if an earlier test panicked.
    fn lock_env() -> std::sync::MutexGuard<'static, ()> {
        ENV_GUARD.lock().unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Apply `overrides` in order (`Some` sets, `None` removes), run
    /// `load`, then remove every listed variable and hand back the result.
    fn load_with_env<T>(overrides: &[(&str, Option<&str>)], load: impl FnOnce() -> T) -> T {
        for (key, value) in overrides {
            match value {
                Some(v) => std::env::set_var(key, v),
                None => std::env::remove_var(key),
            }
        }
        let loaded = load();
        for (key, _) in overrides {
            std::env::remove_var(key);
        }
        loaded
    }

    fn crawler_from_env() -> CrawlerConfig {
        CrawlerConfig::from_env().expect("from_env")
    }

    fn config_from_env() -> Config {
        Config::from_env().expect("from_env")
    }

    #[test]
    fn crawler_limit_env_populates_manga_limit() {
        let _g = lock_env();
        let cfg = load_with_env(&[("CRAWLER_LIMIT", Some("96"))], crawler_from_env);
        assert_eq!(cfg.manga_limit, 96);
    }

    #[test]
    fn crawler_limit_unset_defaults_to_zero() {
        let _g = lock_env();
        let cfg = load_with_env(&[("CRAWLER_LIMIT", None)], crawler_from_env);
        assert_eq!(cfg.manga_limit, 0);
    }

    #[test]
    fn reliability_knobs_default_when_unset() {
        let _g = lock_env();
        let cfg = load_with_env(
            &[
                ("CRAWLER_JOB_TIMEOUT_SECS", None),
                ("CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES", None),
                ("CRAWLER_BROWSER_RESTART_THRESHOLD", None),
            ],
            crawler_from_env,
        );
        assert_eq!(cfg.job_timeout, Duration::from_secs(600));
        assert_eq!(cfg.metadata_max_consecutive_failures, 10);
        assert_eq!(cfg.browser_restart_threshold, 3);
    }

    #[test]
    fn reliability_knobs_parse_from_env() {
        let _g = lock_env();
        let cfg = load_with_env(
            &[
                ("CRAWLER_JOB_TIMEOUT_SECS", Some("120")),
                ("CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES", Some("5")),
                ("CRAWLER_BROWSER_RESTART_THRESHOLD", Some("7")),
            ],
            crawler_from_env,
        );
        assert_eq!(cfg.job_timeout, Duration::from_secs(120));
        assert_eq!(cfg.metadata_max_consecutive_failures, 5);
        assert_eq!(cfg.browser_restart_threshold, 7);
    }

    #[test]
    fn private_mode_env_parses_true() {
        let _g = lock_env();
        let cfg = load_with_env(
            &[
                ("PRIVATE_MODE", Some("true")),
                ("DATABASE_URL", Some("postgres://test")),
            ],
            config_from_env,
        );
        assert!(cfg.auth.private_mode);
    }

    #[test]
    fn private_mode_env_parses_false() {
        let _g = lock_env();
        let cfg = load_with_env(
            &[
                ("PRIVATE_MODE", Some("false")),
                ("DATABASE_URL", Some("postgres://test")),
            ],
            config_from_env,
        );
        assert!(!cfg.auth.private_mode);
    }

    #[test]
    fn private_mode_defaults_to_false() {
        let _g = lock_env();
        let cfg = load_with_env(
            &[
                ("PRIVATE_MODE", None),
                ("DATABASE_URL", Some("postgres://test")),
            ],
            config_from_env,
        );
        assert!(!cfg.auth.private_mode);
    }
}
|
||||||
|
|
||||||
|
|||||||
397
backend/src/crawler/browser.rs
Normal file
397
backend/src/crawler/browser.rs
Normal file
@@ -0,0 +1,397 @@
|
|||||||
|
//! Chromium launcher and lifecycle.
|
||||||
|
//!
|
||||||
|
//! By default uses `chromiumoxide`'s `fetcher` feature — first call
|
||||||
|
//! downloads a known-good revision into a cache dir and reuses it
|
||||||
|
//! forever after. Set `CRAWLER_CHROMIUM_BINARY` to skip the fetcher
|
||||||
|
//! and use a system-installed Chromium instead; required on platforms
|
||||||
|
//! where the upstream snapshot bucket has no usable build (notably
|
||||||
|
//! `Linux_arm64` / Raspberry Pi). Debian's package is at
|
||||||
|
//! `/usr/bin/chromium` or `/usr/bin/chromium-headless-shell`; Ubuntu
|
||||||
|
//! ships it as `chromium-browser` at a different path — don't paste
|
||||||
|
//! the wrong one.
|
||||||
|
//!
|
||||||
|
//! `BrowserMode` toggles headed vs headless; the headed path needs a
|
||||||
|
//! display (real `$DISPLAY` or `xvfb-run`).
|
||||||
|
//!
|
||||||
|
//! Extra Chromium command-line flags can be supplied through
|
||||||
|
//! [`LaunchOptions::extra_args`] in code, or via the
|
||||||
|
//! `CRAWLER_BROWSER_ARGS` env var (whitespace-separated) when going
|
||||||
|
//! through [`LaunchOptions::from_env`]. The launcher always also
|
||||||
|
//! injects `--no-sandbox` and `--disable-dev-shm-usage` because they're
|
||||||
|
//! near-mandatory for containerized Chromium; everything else is
|
||||||
|
//! caller-provided.
|
||||||
|
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use anyhow::Context;
|
||||||
|
use chromiumoxide::browser::{Browser, BrowserConfig};
|
||||||
|
use chromiumoxide::error::CdpError;
|
||||||
|
use chromiumoxide::fetcher::{BrowserFetcher, BrowserFetcherOptions};
|
||||||
|
use futures_util::StreamExt;
|
||||||
|
use tokio::task::JoinHandle;
|
||||||
|
|
||||||
|
/// Whether Chromium is launched with or without a visible window.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BrowserMode {
    /// Launch with a real window, which requires `$DISPLAY` (or wrapping
    /// the binary in `xvfb-run`). Opt in with
    /// `CRAWLER_BROWSER_MODE=headed` — handy for watching a flow while
    /// debugging, or for sites that fingerprint headless Chrome. Not used
    /// in production.
    Headed,
    /// Launch without a window: faster, lighter on resources, and works
    /// without a display. Both `from_env()` and `Default` pick this.
    Headless,
}
|
||||||
|
|
||||||
|
/// Configuration for a single browser launch.
|
||||||
|
///
|
||||||
|
/// Public fields rather than a builder — there are only two of them
|
||||||
|
/// and callers benefit from struct literal syntax for clarity.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct LaunchOptions {
|
||||||
|
pub mode: BrowserMode,
|
||||||
|
/// Extra Chromium flags, appended after the launcher's own
|
||||||
|
/// defaults. Example: `vec!["--lang=de-DE".into(),
|
||||||
|
/// "--window-size=1280,800".into()]`.
|
||||||
|
pub extra_args: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LaunchOptions {
|
||||||
|
pub fn headed() -> Self {
|
||||||
|
Self {
|
||||||
|
mode: BrowserMode::Headed,
|
||||||
|
extra_args: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn headless() -> Self {
|
||||||
|
Self {
|
||||||
|
mode: BrowserMode::Headless,
|
||||||
|
extra_args: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reads `CRAWLER_BROWSER_MODE` (`headless`|`headed`, default
|
||||||
|
/// `headless`) and `CRAWLER_BROWSER_ARGS` (whitespace-separated
|
||||||
|
/// Chromium flags). Flags containing whitespace aren't supported
|
||||||
|
/// through the env var — use the programmatic API for those.
|
||||||
|
pub fn from_env() -> Self {
|
||||||
|
let mode = match std::env::var("CRAWLER_BROWSER_MODE").as_deref() {
|
||||||
|
Ok("headed") => BrowserMode::Headed,
|
||||||
|
_ => BrowserMode::Headless,
|
||||||
|
};
|
||||||
|
let extra_args = std::env::var("CRAWLER_BROWSER_ARGS")
|
||||||
|
.map(|s| parse_args(&s))
|
||||||
|
.unwrap_or_default();
|
||||||
|
Self { mode, extra_args }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for LaunchOptions {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::headless()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Split a `CRAWLER_BROWSER_ARGS`-style string into individual flags on
/// runs of whitespace. Kept apart from `from_env` so it can be unit-tested
/// without touching the process environment.
pub(crate) fn parse_args(s: &str) -> Vec<String> {
    let mut flags = Vec::new();
    for flag in s.split_whitespace() {
        flags.push(String::from(flag));
    }
    flags
}
|
||||||
|
|
||||||
|
/// Owned browser plus the spawned task that drives its CDP event loop.
|
||||||
|
/// Dropping `Handle` without calling `close` leaks the Chromium process
|
||||||
|
/// — always call `close().await` in production paths.
|
||||||
|
///
|
||||||
|
/// The browser is stored behind an `Arc` so it can be shared across
|
||||||
|
/// worker tasks (via [`Handle::shared`]) without copying. `Browser::new_page`
|
||||||
|
/// only needs `&self`, so multiple workers can drive the same browser
|
||||||
|
/// concurrently as long as the manager keeps the `Arc` alive.
|
||||||
|
pub struct Handle {
|
||||||
|
browser: Arc<Browser>,
|
||||||
|
driver: JoinHandle<()>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Handle {
|
||||||
|
/// Borrow the browser. Equivalent to `&*handle.shared()`.
|
||||||
|
pub fn browser(&self) -> &Browser {
|
||||||
|
&self.browser
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clone the shared handle. Workers hold these to call `new_page`
|
||||||
|
/// concurrently. The browser only exits when the last `Arc<Browser>`
|
||||||
|
/// is dropped (kill-on-drop), or when `close()` is called on the
|
||||||
|
/// originating `Handle` while it is the sole holder.
|
||||||
|
pub fn shared(&self) -> Arc<Browser> {
|
||||||
|
Arc::clone(&self.browser)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Closes the browser and awaits the driver task. If other Arcs to
|
||||||
|
/// the browser are still alive we can't issue a clean CDP `close`,
|
||||||
|
/// so we abort the driver task instead — otherwise `handler.next()`
|
||||||
|
/// keeps polling forever and `Handle::close` hangs (chromiumoxide's
|
||||||
|
/// handler stream doesn't end on its own when the underlying WS
|
||||||
|
/// dies). Chromium itself is reaped by kill-on-drop once the last
|
||||||
|
/// `Arc<Browser>` is dropped.
|
||||||
|
pub async fn close(self) -> anyhow::Result<()> {
|
||||||
|
close_or_abort(self.browser, self.driver, |mut owned| async move {
|
||||||
|
let _ = owned.close().await;
|
||||||
|
let _ = owned.wait().await;
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Shutdown core for [`Handle::close`], extracted so it can be unit-
|
||||||
|
/// tested without launching real Chromium. When `arc` is uniquely owned,
|
||||||
|
/// `on_owned` runs against the owned value and the driver is awaited
|
||||||
|
/// normally. When other Arc holders exist, the driver is aborted before
|
||||||
|
/// awaiting it so shutdown returns promptly.
|
||||||
|
async fn close_or_abort<T, F, Fut>(arc: Arc<T>, driver: JoinHandle<()>, on_owned: F)
|
||||||
|
where
|
||||||
|
T: Send + 'static,
|
||||||
|
F: FnOnce(T) -> Fut + Send,
|
||||||
|
Fut: std::future::Future<Output = ()> + Send,
|
||||||
|
{
|
||||||
|
match Arc::try_unwrap(arc) {
|
||||||
|
Ok(owned) => {
|
||||||
|
on_owned(owned).await;
|
||||||
|
let _ = driver.await;
|
||||||
|
}
|
||||||
|
Err(shared) => {
|
||||||
|
tracing::warn!(
|
||||||
|
strong_count = Arc::strong_count(&shared),
|
||||||
|
"Handle::close while Arc still shared — aborting driver, relying on kill-on-drop"
|
||||||
|
);
|
||||||
|
drop(shared);
|
||||||
|
driver.abort();
|
||||||
|
let _ = driver.await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Launches Chromium. If `CRAWLER_CHROMIUM_BINARY` is set, uses that
|
||||||
|
/// path directly. Otherwise downloads via the `fetcher` feature on
|
||||||
|
/// first run and hits the cache after that. The fetcher cache dir is
|
||||||
|
/// `$CRAWLER_CHROMIUM_DIR` if set, else `$HOME/.cache/mangalord/chromium`,
|
||||||
|
/// else `./.chromium-cache` as a last-resort repo-local fallback.
|
||||||
|
pub async fn launch(options: LaunchOptions) -> anyhow::Result<Handle> {
|
||||||
|
let executable = match system_chromium_path_from_env() {
|
||||||
|
Some(path) => {
|
||||||
|
tracing::info!(path = %path.display(), "using system chromium (CRAWLER_CHROMIUM_BINARY)");
|
||||||
|
path
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
let cache = cache_dir()?;
|
||||||
|
tokio::fs::create_dir_all(&cache)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("create cache dir {}", cache.display()))?;
|
||||||
|
|
||||||
|
let fetcher = BrowserFetcher::new(
|
||||||
|
BrowserFetcherOptions::builder()
|
||||||
|
.with_path(&cache)
|
||||||
|
.build()
|
||||||
|
.map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?,
|
||||||
|
);
|
||||||
|
tracing::info!(path = %cache.display(), "ensuring chromium revision is present");
|
||||||
|
let info = fetcher
|
||||||
|
.fetch()
|
||||||
|
.await
|
||||||
|
.context("download chromium via fetcher")?;
|
||||||
|
tracing::info!(executable = %info.executable_path.display(), "chromium ready");
|
||||||
|
info.executable_path
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut builder = BrowserConfig::builder()
|
||||||
|
.chrome_executable(executable)
|
||||||
|
// Linux containers / CI commonly lack the user namespaces
|
||||||
|
// Chromium's sandbox wants. Disable it; the crawler runs in its
|
||||||
|
// own container anyway.
|
||||||
|
.arg("--no-sandbox")
|
||||||
|
.arg("--disable-dev-shm-usage");
|
||||||
|
for arg in &options.extra_args {
|
||||||
|
builder = builder.arg(arg);
|
||||||
|
}
|
||||||
|
if matches!(options.mode, BrowserMode::Headed) {
|
||||||
|
builder = builder.with_head();
|
||||||
|
}
|
||||||
|
tracing::info!(
|
||||||
|
mode = ?options.mode,
|
||||||
|
extra_args = ?options.extra_args,
|
||||||
|
"building browser config"
|
||||||
|
);
|
||||||
|
let config = builder
|
||||||
|
.build()
|
||||||
|
.map_err(|e| anyhow::anyhow!("browser config: {e}"))?;
|
||||||
|
|
||||||
|
let (browser, mut handler) = Browser::launch(config)
|
||||||
|
.await
|
||||||
|
.context("launch chromium")?;
|
||||||
|
|
||||||
|
let driver = tokio::spawn(async move {
|
||||||
|
while let Some(event) = handler.next().await {
|
||||||
|
match event {
|
||||||
|
Ok(_) => {}
|
||||||
|
// chromiumoxide 0.7 ships fixed CDP type bindings, so any
|
||||||
|
// CDP event Chrome added later fails to deserialize. The
|
||||||
|
// connection is unaffected — these are noise. Suppress
|
||||||
|
// them so real failures stay visible.
|
||||||
|
Err(CdpError::Serde(_)) => {
|
||||||
|
tracing::trace!("chromium emitted an unrecognized CDP event");
|
||||||
|
}
|
||||||
|
Err(err) => tracing::warn!(?err, "chromium handler event error"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(Handle {
|
||||||
|
browser: Arc::new(browser),
|
||||||
|
driver,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn cache_dir() -> anyhow::Result<PathBuf> {
|
||||||
|
if let Ok(dir) = std::env::var("CRAWLER_CHROMIUM_DIR") {
|
||||||
|
return Ok(PathBuf::from(dir));
|
||||||
|
}
|
||||||
|
if let Ok(home) = std::env::var("HOME") {
|
||||||
|
return Ok(PathBuf::from(home).join(".cache/mangalord/chromium"));
|
||||||
|
}
|
||||||
|
Ok(PathBuf::from("./.chromium-cache"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reads `CRAWLER_CHROMIUM_BINARY` and delegates to the pure helper.
|
||||||
|
/// Thin wrapper kept separate so the decision logic can be unit-tested
|
||||||
|
/// without mutating the process environment.
|
||||||
|
fn system_chromium_path_from_env() -> Option<PathBuf> {
|
||||||
|
system_chromium_path_from_value(std::env::var_os("CRAWLER_CHROMIUM_BINARY").as_deref())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Converts the raw variable value into a path, yielding `Some` only for a
/// value that is present and non-empty.
///
/// An exported-but-blank variable (common with compose `${VAR:-}` when the
/// operator left it unfilled) is treated exactly like an unset one, so an
/// empty path is never handed to chromiumoxide.
pub(crate) fn system_chromium_path_from_value(
    raw: Option<&std::ffi::OsStr>,
) -> Option<PathBuf> {
    match raw {
        Some(value) if !value.is_empty() => Some(PathBuf::from(value)),
        _ => None,
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_args_splits_on_whitespace() {
        let parsed = parse_args("--lang=de-DE --window-size=1280,800");
        assert_eq!(parsed, vec!["--lang=de-DE", "--window-size=1280,800"]);
    }

    #[test]
    fn parse_args_tolerates_irregular_whitespace() {
        // Tabs, repeated spaces and leading/trailing whitespace all collapse.
        let parsed = parse_args("  --a\t--b   --c=1\n");
        assert_eq!(parsed, vec!["--a", "--b", "--c=1"]);
    }

    #[test]
    fn parse_args_empty_string_yields_empty_vec() {
        for input in ["", "   \t\n"] {
            assert!(parse_args(input).is_empty());
        }
    }

    #[test]
    fn system_chromium_path_returns_some_when_value_set() {
        let raw = std::ffi::OsString::from("/usr/bin/chromium-headless-shell");
        let resolved = system_chromium_path_from_value(Some(raw.as_os_str()));
        assert_eq!(
            resolved,
            Some(PathBuf::from("/usr/bin/chromium-headless-shell"))
        );
    }

    #[test]
    fn system_chromium_path_returns_none_when_unset() {
        assert_eq!(system_chromium_path_from_value(None), None);
    }

    #[test]
    fn system_chromium_path_treats_empty_as_unset() {
        // Compose's `${VAR:-}` substitution exports an empty variable when
        // the operator left it blank. It must count as unset so the launcher
        // falls back to the fetcher instead of passing an empty path on.
        let raw = std::ffi::OsString::from("");
        let resolved = system_chromium_path_from_value(Some(raw.as_os_str()));
        assert_eq!(resolved, None);
    }

    #[test]
    fn default_launch_options_are_headless() {
        // Headless is the default asserted here; `Headed` remains available
        // through its explicit constructor.
        assert_eq!(LaunchOptions::default().mode, BrowserMode::Headless);
        assert_eq!(LaunchOptions::headless().mode, BrowserMode::Headless);
        assert_eq!(LaunchOptions::headed().mode, BrowserMode::Headed);
    }

    // Regression: when another Arc outlives `Handle::close`, the previous
    // implementation awaited the driver task forever because the handler
    // stream never ends by itself. Aborting the driver unblocks shutdown.
    #[tokio::test]
    async fn close_or_abort_returns_when_arc_is_shared() {
        use std::sync::atomic::{AtomicBool, Ordering};
        use std::time::Duration;

        let arc = Arc::new(());
        let _keepalive = Arc::clone(&arc); // forces try_unwrap to fail
        let driver = tokio::spawn(std::future::pending::<()>());
        let on_owned_ran = Arc::new(AtomicBool::new(false));

        let flag = Arc::clone(&on_owned_ran);
        let shutdown = close_or_abort(arc, driver, move |_| async move {
            flag.store(true, Ordering::Release)
        });

        tokio::time::timeout(Duration::from_secs(2), shutdown)
            .await
            .expect("close_or_abort must not hang when driver is pending and Arc is shared");
        assert!(
            !on_owned_ran.load(Ordering::Acquire),
            "on_owned must not run when the Arc is still shared"
        );
    }

    #[tokio::test]
    async fn close_or_abort_runs_on_owned_when_arc_is_unique() {
        use std::sync::atomic::{AtomicBool, Ordering};

        let arc = Arc::new(());
        let driver = tokio::spawn(async {}); // completes immediately
        let on_owned_ran = Arc::new(AtomicBool::new(false));

        let flag = Arc::clone(&on_owned_ran);
        close_or_abort(arc, driver, move |_| async move {
            flag.store(true, Ordering::Release)
        })
        .await;

        assert!(
            on_owned_ran.load(Ordering::Acquire),
            "on_owned must run when the Arc is unique"
        );
    }
}
|
||||||
500
backend/src/crawler/browser_manager.rs
Normal file
500
backend/src/crawler/browser_manager.rs
Normal file
@@ -0,0 +1,500 @@
|
|||||||
|
//! Lazy-launch / idle-teardown Chromium manager for the daemon.
|
||||||
|
//!
|
||||||
|
//! The first worker that calls [`BrowserManager::acquire`] triggers a real
|
||||||
|
//! Chromium launch (and the `on_launch` hook — used to re-inject the
|
||||||
|
//! PHPSESSID cookie on every fresh process). Each acquire bumps an active
|
||||||
|
//! counter; the returned [`BrowserLease`] decrements it on drop.
|
||||||
|
//!
|
||||||
|
//! When the active counter hits zero, a background reaper task waits
|
||||||
|
//! `idle_timeout`. If still zero on wake, it closes Chromium and clears the
|
||||||
|
//! cached handle. The next acquire re-launches.
|
||||||
|
//!
|
||||||
|
//! `idle_timeout = Duration::ZERO` disables the reaper — Chromium stays alive
|
||||||
|
//! until [`BrowserManager::shutdown`].
|
||||||
|
|
||||||
|
use std::ops::Deref;
|
||||||
|
use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use anyhow::Context;
|
||||||
|
use chromiumoxide::browser::Browser;
|
||||||
|
use futures_util::future::BoxFuture;
|
||||||
|
use tokio::sync::{Mutex, Notify};
|
||||||
|
use tokio::task::JoinHandle;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
|
|
||||||
|
use crate::crawler::browser::{self, LaunchOptions};
|
||||||
|
|
||||||
|
/// Hook invoked on every fresh launch with the new browser. Typically used
|
||||||
|
/// to re-inject PHPSESSID + run the session probe. Errors abort the
|
||||||
|
/// `acquire` that triggered the launch — the next acquire will re-launch.
|
||||||
|
pub type OnLaunch =
|
||||||
|
Arc<dyn Fn(Arc<Browser>) -> BoxFuture<'static, anyhow::Result<()>> + Send + Sync>;
|
||||||
|
|
||||||
|
/// Returns an `OnLaunch` that does nothing — useful when no session is
|
||||||
|
/// configured (e.g. CLI metadata-only runs).
|
||||||
|
pub fn noop_on_launch() -> OnLaunch {
|
||||||
|
Arc::new(|_| Box::pin(async { Ok(()) }))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decoupled active-lease tracker. Owns the atomic counter and the idle
|
||||||
|
/// notifier so the wiring is unit-testable without standing up a real
|
||||||
|
/// `BrowserManager` (which would require launching Chromium).
|
||||||
|
#[derive(Default)]
|
||||||
|
pub(crate) struct ActiveTracker {
|
||||||
|
counter: AtomicUsize,
|
||||||
|
idle_signal: Notify,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ActiveTracker {
|
||||||
|
pub(crate) fn new() -> Arc<Self> {
|
||||||
|
Arc::new(Self::default())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn acquire(self: &Arc<Self>) {
|
||||||
|
self.counter.fetch_add(1, Ordering::AcqRel);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn release(self: &Arc<Self>) {
|
||||||
|
if self.counter.fetch_sub(1, Ordering::AcqRel) == 1 {
|
||||||
|
self.idle_signal.notify_one();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn current(&self) -> usize {
|
||||||
|
self.counter.load(Ordering::Acquire)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn idle_signal(&self) -> &Notify {
|
||||||
|
&self.idle_signal
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Phases of a coordinated browser restart. `acquire()` parks whenever the
/// phase is not [`RestartPhase::Healthy`], so no new navigation starts in the
/// middle of a restart; long-lived lease holders (the metadata pass)
/// cooperate by checking [`BrowserManager::is_restart_pending`] at safe
/// boundaries.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RestartPhase {
    /// Normal operation: acquires go through.
    Healthy,
    /// A restart was requested; new acquires park while in-flight leases
    /// drain.
    Draining,
    /// Chromium is being closed and relaunched.
    Restarting,
}
|
||||||
|
|
||||||
|
// `u8` encodings of `RestartPhase`, used for storage in an `AtomicU8`.
const PHASE_HEALTHY: u8 = 0;
const PHASE_DRAINING: u8 = 1;
const PHASE_RESTARTING: u8 = 2;
|
||||||
|
|
||||||
|
pub struct BrowserManager {
|
||||||
|
inner: Mutex<Inner>,
|
||||||
|
active: Arc<ActiveTracker>,
|
||||||
|
launch_opts: LaunchOptions,
|
||||||
|
idle_timeout: Duration,
|
||||||
|
on_launch: OnLaunch,
|
||||||
|
/// Coarse lifecycle phase (one of the `PHASE_*` constants).
|
||||||
|
phase: AtomicU8,
|
||||||
|
/// Woken when the phase returns to `Healthy` so parked acquires resume.
|
||||||
|
resume: Notify,
|
||||||
|
/// Serialises coordinated restarts so concurrent requests collapse into
|
||||||
|
/// a single relaunch.
|
||||||
|
restart_lock: Mutex<()>,
|
||||||
|
/// Result of the most recent relaunch, so a caller that coalesced into
|
||||||
|
/// an in-progress restart reports that restart's real outcome instead
|
||||||
|
/// of a blind success.
|
||||||
|
last_restart_ok: AtomicBool,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Inner {
|
||||||
|
handle: Option<browser::Handle>,
|
||||||
|
shared: Option<Arc<Browser>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BrowserManager {
|
||||||
|
pub fn new(
|
||||||
|
launch_opts: LaunchOptions,
|
||||||
|
idle_timeout: Duration,
|
||||||
|
on_launch: OnLaunch,
|
||||||
|
) -> Arc<Self> {
|
||||||
|
Arc::new(Self {
|
||||||
|
inner: Mutex::new(Inner {
|
||||||
|
handle: None,
|
||||||
|
shared: None,
|
||||||
|
}),
|
||||||
|
active: ActiveTracker::new(),
|
||||||
|
launch_opts,
|
||||||
|
idle_timeout,
|
||||||
|
on_launch,
|
||||||
|
phase: AtomicU8::new(PHASE_HEALTHY),
|
||||||
|
resume: Notify::new(),
|
||||||
|
restart_lock: Mutex::new(()),
|
||||||
|
last_restart_ok: AtomicBool::new(true),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Current restart phase.
|
||||||
|
pub fn phase(&self) -> RestartPhase {
|
||||||
|
match self.phase.load(Ordering::Acquire) {
|
||||||
|
PHASE_DRAINING => RestartPhase::Draining,
|
||||||
|
PHASE_RESTARTING => RestartPhase::Restarting,
|
||||||
|
_ => RestartPhase::Healthy,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set_phase(&self, phase: RestartPhase) {
|
||||||
|
let v = match phase {
|
||||||
|
RestartPhase::Healthy => PHASE_HEALTHY,
|
||||||
|
RestartPhase::Draining => PHASE_DRAINING,
|
||||||
|
RestartPhase::Restarting => PHASE_RESTARTING,
|
||||||
|
};
|
||||||
|
self.phase.store(v, Ordering::Release);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether a coordinated restart is in progress. Long-lived lease
|
||||||
|
/// holders poll this at safe boundaries and yield their lease so the
|
||||||
|
/// drain can complete promptly.
|
||||||
|
pub fn is_restart_pending(&self) -> bool {
|
||||||
|
self.phase() != RestartPhase::Healthy
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Launch Chromium into `guard`, running the `on_launch` hook before
|
||||||
|
/// publishing the handle so a probe failure doesn't leave a half-
|
||||||
|
/// initialised browser behind.
|
||||||
|
async fn launch_into(&self, guard: &mut Inner) -> anyhow::Result<()> {
|
||||||
|
let handle = browser::launch(self.launch_opts.clone())
|
||||||
|
.await
|
||||||
|
.context("BrowserManager: launch chromium")?;
|
||||||
|
let shared = handle.shared();
|
||||||
|
if let Err(e) = (self.on_launch)(Arc::clone(&shared)).await {
|
||||||
|
let _ = handle.close().await;
|
||||||
|
return Err(e.context("BrowserManager: on_launch hook failed"));
|
||||||
|
}
|
||||||
|
guard.handle = Some(handle);
|
||||||
|
guard.shared = Some(shared);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Acquire a shared browser lease. The first acquire after a teardown
|
||||||
|
/// launches a fresh Chromium (and runs `on_launch`); subsequent acquires
|
||||||
|
/// while a process is alive just bump the counter and clone the `Arc`.
|
||||||
|
pub async fn acquire(&self) -> anyhow::Result<BrowserLease> {
|
||||||
|
// Park while a coordinated restart is draining/relaunching so no new
|
||||||
|
// navigation starts against a browser that's about to be torn down.
|
||||||
|
// The short sleep fallback guarantees liveness even if a `resume`
|
||||||
|
// notification is missed (classic Notify lost-wakeup).
|
||||||
|
while self.phase() != RestartPhase::Healthy {
|
||||||
|
tokio::select! {
|
||||||
|
_ = self.resume.notified() => {}
|
||||||
|
_ = tokio::time::sleep(Duration::from_millis(100)) => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let mut guard = self.inner.lock().await;
|
||||||
|
if guard.handle.is_none() {
|
||||||
|
self.launch_into(&mut guard).await?;
|
||||||
|
}
|
||||||
|
let browser = guard
|
||||||
|
.shared
|
||||||
|
.as_ref()
|
||||||
|
.expect("shared set above")
|
||||||
|
.clone();
|
||||||
|
self.active.acquire();
|
||||||
|
Ok(BrowserLease {
|
||||||
|
browser,
|
||||||
|
active: Arc::clone(&self.active),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Coordinated restart: block new acquires, wait for in-flight leases
|
||||||
|
/// to drain (up to `drain_deadline`, then force), close + relaunch
|
||||||
|
/// Chromium (re-running `on_launch` → re-inject session + probe), then
|
||||||
|
/// resume parked acquirers. Concurrent calls collapse into one
|
||||||
|
/// relaunch. The phase is always returned to `Healthy` — even if the
|
||||||
|
/// relaunch errors — so a failed restart never permanently wedges
|
||||||
|
/// acquisition (the next acquire retries the launch lazily).
|
||||||
|
pub async fn coordinated_restart(&self, drain_deadline: Duration) -> anyhow::Result<()> {
|
||||||
|
// Dedup: if a restart is already running, wait for it and report
|
||||||
|
// that restart's real outcome (not a blind success).
|
||||||
|
let _restart_guard = match self.restart_lock.try_lock() {
|
||||||
|
Ok(g) => g,
|
||||||
|
Err(_) => {
|
||||||
|
let _ = self.restart_lock.lock().await;
|
||||||
|
return if self.last_restart_ok.load(Ordering::Acquire) {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(anyhow::anyhow!("a concurrent coordinated browser restart failed"))
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
self.set_phase(RestartPhase::Draining);
|
||||||
|
await_drain(&self.active, drain_deadline).await;
|
||||||
|
|
||||||
|
self.set_phase(RestartPhase::Restarting);
|
||||||
|
let relaunch = {
|
||||||
|
let mut guard = self.inner.lock().await;
|
||||||
|
guard.shared = None;
|
||||||
|
if let Some(handle) = guard.handle.take() {
|
||||||
|
let _ = handle.close().await;
|
||||||
|
}
|
||||||
|
self.launch_into(&mut guard).await
|
||||||
|
};
|
||||||
|
|
||||||
|
self.last_restart_ok.store(relaunch.is_ok(), Ordering::Release);
|
||||||
|
self.set_phase(RestartPhase::Healthy);
|
||||||
|
self.resume.notify_waiters();
|
||||||
|
match &relaunch {
|
||||||
|
Ok(()) => tracing::info!("BrowserManager: coordinated restart complete"),
|
||||||
|
Err(e) => tracing::error!(error = ?e, "BrowserManager: coordinated restart relaunch failed"),
|
||||||
|
}
|
||||||
|
relaunch.context("coordinated_restart: relaunch")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Forcefully close the cached browser regardless of active count.
|
||||||
|
/// Used on daemon shutdown. After this returns the next acquire will
|
||||||
|
/// re-launch from scratch.
|
||||||
|
pub async fn shutdown(&self) {
|
||||||
|
let mut guard = self.inner.lock().await;
|
||||||
|
guard.shared = None;
|
||||||
|
if let Some(handle) = guard.handle.take() {
|
||||||
|
let _ = handle.close().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mark the cached browser handle as unhealthy. The next `acquire`
|
||||||
|
/// will re-launch Chromium from scratch.
|
||||||
|
///
|
||||||
|
/// Same semantics as `shutdown` — the difference is intent:
|
||||||
|
/// `shutdown` runs once at daemon teardown, while `invalidate` is a
|
||||||
|
/// recovery hook callers fire after a CDP / connection / navigation
|
||||||
|
/// failure that suggests the underlying process has died. Calling
|
||||||
|
/// this while other workers still hold leases is safe — their
|
||||||
|
/// outstanding CDP operations will return channel-closed errors
|
||||||
|
/// and those workers will then re-acquire (re-launching Chromium).
|
||||||
|
///
|
||||||
|
/// Idempotent: calling on an already-invalidated manager is a
|
||||||
|
/// no-op.
|
||||||
|
pub async fn invalidate(&self) {
|
||||||
|
let mut guard = self.inner.lock().await;
|
||||||
|
guard.shared = None;
|
||||||
|
if let Some(handle) = guard.handle.take() {
|
||||||
|
let _ = handle.close().await;
|
||||||
|
tracing::warn!("BrowserManager: handle invalidated — next acquire will relaunch");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn idle_timeout(&self) -> Duration {
|
||||||
|
self.idle_timeout
|
||||||
|
}
|
||||||
|
|
||||||
|
fn active(&self) -> Arc<ActiveTracker> {
|
||||||
|
Arc::clone(&self.active)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Wait for the active-lease count to reach zero, up to `deadline`. Wakes
|
||||||
|
/// on the tracker's idle signal and re-checks on a short poll so a missed
|
||||||
|
/// signal can't strand the drain. Returns when drained or when the
|
||||||
|
/// deadline elapses (the caller then force-restarts). Extracted as a free
|
||||||
|
/// fn so the timing logic is unit-testable without launching Chromium.
|
||||||
|
async fn await_drain(active: &Arc<ActiveTracker>, deadline: Duration) {
|
||||||
|
let start = tokio::time::Instant::now();
|
||||||
|
while active.current() > 0 {
|
||||||
|
let Some(remaining) = deadline.checked_sub(start.elapsed()) else {
|
||||||
|
tracing::warn!(
|
||||||
|
active = active.current(),
|
||||||
|
"coordinated_restart: drain deadline exceeded — forcing relaunch"
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
let nap = remaining.min(Duration::from_millis(250));
|
||||||
|
tokio::select! {
|
||||||
|
_ = active.idle_signal().notified() => {}
|
||||||
|
_ = tokio::time::sleep(nap) => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Background reaper. Returns immediately when `idle_timeout == 0`.
|
||||||
|
/// Otherwise spawns a task that:
|
||||||
|
/// 1. Waits on `idle_signal` (woken when active hits zero).
|
||||||
|
/// 2. Sleeps `idle_timeout`.
|
||||||
|
/// 3. Re-checks the counter under the mutex — if still zero, takes the
|
||||||
|
/// handle and closes it.
|
||||||
|
///
|
||||||
|
/// Repeats forever until `cancel` fires.
|
||||||
|
pub fn spawn_idle_reaper(mgr: Arc<BrowserManager>, cancel: CancellationToken) -> JoinHandle<()> {
|
||||||
|
tokio::spawn(async move {
|
||||||
|
if mgr.idle_timeout().is_zero() {
|
||||||
|
// Block until cancellation, then exit.
|
||||||
|
cancel.cancelled().await;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let active = mgr.active();
|
||||||
|
loop {
|
||||||
|
tokio::select! {
|
||||||
|
_ = cancel.cancelled() => return,
|
||||||
|
_ = active.idle_signal().notified() => {}
|
||||||
|
}
|
||||||
|
if active.current() > 0 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
tokio::select! {
|
||||||
|
_ = cancel.cancelled() => return,
|
||||||
|
_ = tokio::time::sleep(mgr.idle_timeout()) => {}
|
||||||
|
}
|
||||||
|
let mut guard = mgr.inner.lock().await;
|
||||||
|
if active.current() > 0 {
|
||||||
|
// A worker grabbed a lease during the sleep — abort teardown.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let handle = guard.handle.take();
|
||||||
|
guard.shared = None;
|
||||||
|
drop(guard);
|
||||||
|
if let Some(h) = handle {
|
||||||
|
let _ = h.close().await;
|
||||||
|
tracing::info!("BrowserManager: idle teardown — Chromium closed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A worker-side handle that keeps the browser alive while in scope.
|
||||||
|
/// `Deref<Target = Browser>` so callers can pass `&*lease` to APIs that
|
||||||
|
/// expect `&Browser`.
|
||||||
|
pub struct BrowserLease {
|
||||||
|
browser: Arc<Browser>,
|
||||||
|
active: Arc<ActiveTracker>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Deref for BrowserLease {
|
||||||
|
type Target = Browser;
|
||||||
|
fn deref(&self) -> &Browser {
|
||||||
|
&self.browser
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for BrowserLease {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
self.active.release();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::AtomicBool;

    /// Manager with the reaper disabled and a no-op hook. Nothing is
    /// launched unless `acquire` is called.
    fn idle_manager() -> Arc<BrowserManager> {
        BrowserManager::new(
            crate::crawler::browser::LaunchOptions::default(),
            Duration::ZERO,
            noop_on_launch(),
        )
    }

    #[test]
    fn noop_on_launch_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>(_: &T) {}
        let hook = noop_on_launch();
        assert_send_sync(&hook);
    }

    /// `invalidate` with no cached handle never touches Chromium, so it can
    /// be exercised in a unit test; that no-op path is exactly the one whose
    /// idempotence is checked here.
    #[tokio::test]
    async fn invalidate_is_a_noop_when_no_handle_cached() {
        let mgr = idle_manager();
        // Two back-to-back invalidates must both complete; the second
        // would hang or panic if the first had left torn state.
        mgr.invalidate().await;
        mgr.invalidate().await;
    }

    #[tokio::test]
    async fn await_drain_returns_immediately_when_already_idle() {
        let active = ActiveTracker::new();
        let start = tokio::time::Instant::now();
        await_drain(&active, Duration::from_secs(5)).await;
        let elapsed = start.elapsed();
        assert!(elapsed < Duration::from_millis(200), "no wait when idle");
    }

    #[tokio::test]
    async fn await_drain_completes_when_lease_released() {
        let active = ActiveTracker::new();
        active.acquire();
        let releaser = Arc::clone(&active);
        let bg = tokio::spawn(async move {
            tokio::time::sleep(Duration::from_millis(100)).await;
            releaser.release();
        });
        // Generous deadline: the drain should end shortly after the release,
        // not at the deadline.
        let start = tokio::time::Instant::now();
        await_drain(&active, Duration::from_secs(5)).await;
        assert!(start.elapsed() < Duration::from_secs(2), "drained on release");
        assert_eq!(active.current(), 0);
        bg.await.unwrap();
    }

    #[tokio::test]
    async fn await_drain_force_returns_after_deadline_when_stuck() {
        let active = ActiveTracker::new();
        active.acquire(); // never released
        let start = tokio::time::Instant::now();
        await_drain(&active, Duration::from_millis(300)).await;
        let elapsed = start.elapsed();
        assert!(elapsed >= Duration::from_millis(250), "waited ~deadline: {elapsed:?}");
        assert!(elapsed < Duration::from_secs(2), "but not forever: {elapsed:?}");
        assert_eq!(active.current(), 1, "still held — caller force-restarts");
    }

    #[test]
    fn phase_transitions_reflect_is_restart_pending() {
        let mgr = idle_manager();
        assert_eq!(mgr.phase(), RestartPhase::Healthy);
        assert!(!mgr.is_restart_pending());

        mgr.set_phase(RestartPhase::Draining);
        assert!(mgr.is_restart_pending());

        mgr.set_phase(RestartPhase::Restarting);
        assert!(mgr.is_restart_pending());

        mgr.set_phase(RestartPhase::Healthy);
        assert!(!mgr.is_restart_pending());
    }

    #[tokio::test]
    async fn active_tracker_signals_idle_only_on_zero_transition() {
        let tracker = ActiveTracker::new();
        let signaled = Arc::new(AtomicBool::new(false));

        let waiter_flag = Arc::clone(&signaled);
        let waiter_tracker = Arc::clone(&tracker);
        tokio::spawn(async move {
            waiter_tracker.idle_signal().notified().await;
            waiter_flag.store(true, Ordering::Release);
        });

        tracker.acquire();
        tracker.acquire();
        assert_eq!(tracker.current(), 2);

        tracker.release();
        assert_eq!(tracker.current(), 1);
        tokio::time::sleep(Duration::from_millis(20)).await;
        assert!(!signaled.load(Ordering::Acquire), "no idle signal at count 1");

        tracker.release();
        tokio::time::sleep(Duration::from_millis(20)).await;
        assert_eq!(tracker.current(), 0);
        assert!(
            signaled.load(Ordering::Acquire),
            "idle signal fires on 1 -> 0 transition"
        );
    }
}
|
||||||
795
backend/src/crawler/content.rs
Normal file
795
backend/src/crawler/content.rs
Normal file
@@ -0,0 +1,795 @@
|
|||||||
|
//! Chapter content sync — fetch a logged-in chapter page, extract its
|
||||||
|
//! image URLs in `pageN` order, download each to storage, and atomically
|
||||||
|
//! persist a `pages` row per image plus the chapter's `page_count`.
|
||||||
|
//!
|
||||||
|
//! Only chapters belonging to a manga someone has bookmarked are
|
||||||
|
//! candidates. The crawler scans bookmarks at the start of each run and
|
||||||
|
//! enqueues unfetched chapters; the API also enqueues at bookmark-time
|
||||||
|
//! so users get instant feedback. Both feed into the same queue and
|
||||||
|
//! dedup by chapter id.
|
||||||
|
|
||||||
|
// Implementation lands in the next commits in this branch. Module is
|
||||||
|
// declared so other crates can `use crawler::content` without breaking
|
||||||
|
// builds while iteration is in progress.
|
||||||
|
|
||||||
|
use anyhow::Context;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::crawler::detect::PageError;
|
||||||
|
use crate::crawler::rate_limit::HostRateLimiters;
|
||||||
|
use crate::crawler::safety::{fetch_bytes_capped, looks_like_image, DownloadAllowlist};
|
||||||
|
use crate::crawler::session::{self, ChapterProbe};
|
||||||
|
use crate::storage::Storage;
|
||||||
|
|
||||||
|
/// Parse the chapter page DOM and return the page images in `pageN`
|
||||||
|
/// order. Filters out the loader `<img class="loading">` and any
|
||||||
|
/// `<img>` without a numeric `id="pageN"`.
|
||||||
|
///
|
||||||
|
/// Reader pages don't render the site's `#logo` element, so the
|
||||||
|
/// universal logo-sentinel can't apply here — instead we assert
|
||||||
|
/// `a#pic_container` is present. Its absence means the response is the
|
||||||
|
/// transient broken-page response (or a redirect to some other layout)
|
||||||
|
/// and the caller should retry.
|
||||||
|
pub fn parse_chapter_pages(html: &str) -> Result<Vec<ChapterImage>, PageError> {
|
||||||
|
let doc = scraper::Html::parse_document(html);
|
||||||
|
let container_sel = scraper::Selector::parse("a#pic_container").unwrap();
|
||||||
|
if doc.select(&container_sel).next().is_none() {
|
||||||
|
return Err(PageError::transient("reader: a#pic_container missing"));
|
||||||
|
}
|
||||||
|
let sel = scraper::Selector::parse("a#pic_container img:not(.loading)").unwrap();
|
||||||
|
let mut pages: Vec<ChapterImage> = doc
|
||||||
|
.select(&sel)
|
||||||
|
.filter_map(|img| {
|
||||||
|
let id = img.value().id()?;
|
||||||
|
let n: i32 = id.strip_prefix("page")?.parse().ok()?;
|
||||||
|
let src = img.value().attr("src")?.trim().to_string();
|
||||||
|
if src.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some(ChapterImage { page_number: n, url: src })
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
pages.sort_by_key(|p| p.page_number);
|
||||||
|
Ok(pages)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// One page image discovered on a reader page by `parse_chapter_pages`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ChapterImage {
    /// The `N` parsed from the element's `id="pageN"` attribute.
    pub page_number: i32,
    /// The trimmed `src` attribute exactly as it appears in the markup; it
    /// may be relative and is resolved against the chapter URL before
    /// download.
    pub url: String,
}
|
||||||
|
|
||||||
|
/// Result of syncing one chapter. Callers use it for logging and for
/// exit-code decisions.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SyncOutcome {
    /// Every image was downloaded and stored and the chapter row was
    /// updated; `pages` is the number of pages persisted.
    Fetched { pages: usize },
    /// The chapter already had `page_count > 0`, so nothing was done.
    /// Only produced when `force_refetch` is not set.
    Skipped,
    /// The session probe failed mid-sync (avatar selector missing on the
    /// chapter page). The caller should abort the whole crawler run.
    SessionExpired,
}
|
||||||
|
|
||||||
|
/// Total page-fetch attempts allowed per chapter when TOR is configured.
///
/// The value counts the first attempt too: `3` means at most 3 fetches
/// with 2 NEWNYM signals in between. Without TOR the caller uses a budget
/// of 1 instead (single attempt, no retry, no recircuit — bit-for-bit
/// pre-TOR behavior).
const CHAPTER_RECIRCUIT_MAX_ATTEMPTS: u32 = 3;
|
||||||
|
|
||||||
|
/// What [`fetch_chapter_html_with_recircuit`] ended with. The success
/// variant carries the reader HTML; the two failure variants correspond to
/// the failure modes `sync_chapter_content` already had.
#[derive(Debug)]
enum ChapterFetchOutcome {
    /// The final reader HTML, classified as `ChapterProbe::Ok`.
    Ok(String),
    /// The probe was still `ChapterProbe::Unauthenticated` once the
    /// recircuit budget ran out (or there was no budget to begin with).
    /// The caller returns `SyncOutcome::SessionExpired`.
    SessionExpired,
    /// The probe was still `ChapterProbe::Transient` once the recircuit
    /// budget ran out (or there was no budget to begin with). The caller
    /// bails so the dispatcher does exponential backoff.
    PersistentTransient,
}
|
||||||
|
|
||||||
|
/// Single rate-limited Chromium navigation to the chapter URL,
|
||||||
|
/// returning the page HTML. Extracted from `sync_chapter_content` so
|
||||||
|
/// the recircuit loop can call it once per attempt.
|
||||||
|
async fn fetch_chapter_html_once(
|
||||||
|
browser: &chromiumoxide::Browser,
|
||||||
|
rate: &HostRateLimiters,
|
||||||
|
source_url: &str,
|
||||||
|
) -> anyhow::Result<String> {
|
||||||
|
rate.wait_for(source_url).await?;
|
||||||
|
let page = browser
|
||||||
|
.new_page(source_url)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("open chapter page {source_url}"))?;
|
||||||
|
crate::crawler::nav::wait_for_nav(&page)
|
||||||
|
.await
|
||||||
|
.context("wait for chapter nav")?;
|
||||||
|
// Best-effort wait for the reader marker — same partial-render
|
||||||
|
// race that bit the chapter-list parser can hit here. Timeout is
|
||||||
|
// not an error; the chapter probe + parser sentinels still catch
|
||||||
|
// real failures.
|
||||||
|
let _ = crate::crawler::nav::wait_for_selector(
|
||||||
|
&page,
|
||||||
|
"a#pic_container",
|
||||||
|
crate::crawler::nav::SELECTOR_TIMEOUT,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let html = page.content().await.context("read chapter html")?;
|
||||||
|
page.close().await.ok();
|
||||||
|
Ok(html)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pure-over-IO loop: fetch + classify, up to `max_attempts` total
|
||||||
|
/// fetches. Between attempts, `recircuit` is invoked (a no-op when
|
||||||
|
/// TOR isn't configured). `max_attempts = 1` collapses to the
|
||||||
|
/// original single-shot behavior — `Unauthenticated` →
|
||||||
|
/// `SessionExpired`, `Transient` → `PersistentTransient` on the first
|
||||||
|
/// hit, no recircuit.
|
||||||
|
///
|
||||||
|
/// Semantics match [`crate::crawler::detect::retry_on_transient`] and
|
||||||
|
/// [`run_session_probe_loop`]: `N` is **total attempts including the
|
||||||
|
/// first**, so `N = 3` means 3 fetches and up to 2 NEWNYM calls.
|
||||||
|
/// `Unauthenticated` and `Transient` share the budget — the loop
|
||||||
|
/// doesn't distinguish, so a sequence like Transient → Unauth → Ok
|
||||||
|
/// counts as 3 attempts.
|
||||||
|
async fn fetch_chapter_html_with_recircuit<F, Fut, R, RFut>(
|
||||||
|
mut fetch: F,
|
||||||
|
mut recircuit: R,
|
||||||
|
max_attempts: u32,
|
||||||
|
source_url_for_msg: &str,
|
||||||
|
) -> anyhow::Result<ChapterFetchOutcome>
|
||||||
|
where
|
||||||
|
F: FnMut() -> Fut,
|
||||||
|
Fut: std::future::Future<Output = anyhow::Result<String>>,
|
||||||
|
R: FnMut() -> RFut,
|
||||||
|
RFut: std::future::Future<Output = ()>,
|
||||||
|
{
|
||||||
|
debug_assert!(max_attempts >= 1, "max_attempts must be at least 1");
|
||||||
|
let mut attempt = 0u32;
|
||||||
|
loop {
|
||||||
|
attempt += 1;
|
||||||
|
let html = fetch().await?;
|
||||||
|
match session::classify_chapter_probe(&html) {
|
||||||
|
ChapterProbe::Ok => return Ok(ChapterFetchOutcome::Ok(html)),
|
||||||
|
ChapterProbe::Unauthenticated => {
|
||||||
|
if attempt >= max_attempts {
|
||||||
|
return Ok(ChapterFetchOutcome::SessionExpired);
|
||||||
|
}
|
||||||
|
tracing::warn!(
|
||||||
|
attempt,
|
||||||
|
max = max_attempts,
|
||||||
|
url = source_url_for_msg,
|
||||||
|
"chapter probe Unauthenticated; signaling TOR NEWNYM and retrying"
|
||||||
|
);
|
||||||
|
recircuit().await;
|
||||||
|
}
|
||||||
|
ChapterProbe::Transient => {
|
||||||
|
if attempt >= max_attempts {
|
||||||
|
return Ok(ChapterFetchOutcome::PersistentTransient);
|
||||||
|
}
|
||||||
|
tracing::warn!(
|
||||||
|
attempt,
|
||||||
|
max = max_attempts,
|
||||||
|
url = source_url_for_msg,
|
||||||
|
"chapter probe Transient; signaling TOR NEWNYM and retrying"
|
||||||
|
);
|
||||||
|
recircuit().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetch one chapter's images and persist them. Each image is streamed to
|
||||||
|
/// storage as it's fetched (peak memory ≈ one image, not the whole
|
||||||
|
/// chapter); the page rows + `page_count` are then written in one short
|
||||||
|
/// transaction. On any failure the chapter stays at `page_count = 0` (no
|
||||||
|
/// partial rows) and the blobs already written are deleted best-effort by
|
||||||
|
/// [`cleanup_orphans`], so a retry starts clean.
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
pub async fn sync_chapter_content(
|
||||||
|
browser: &chromiumoxide::Browser,
|
||||||
|
db: &PgPool,
|
||||||
|
storage: &dyn Storage,
|
||||||
|
http: &reqwest::Client,
|
||||||
|
rate: &HostRateLimiters,
|
||||||
|
chapter_id: Uuid,
|
||||||
|
manga_id: Uuid,
|
||||||
|
source_url: &str,
|
||||||
|
force_refetch: bool,
|
||||||
|
allowlist: &DownloadAllowlist,
|
||||||
|
max_image_bytes: usize,
|
||||||
|
tor: Option<&crate::crawler::tor::TorController>,
|
||||||
|
// Optional live-status sink for the realtime page counter. The daemon
|
||||||
|
// dispatcher passes the shared handle (the chapter has already been
|
||||||
|
// registered via `begin_chapter`); the CLI / admin resync pass `None`.
|
||||||
|
progress: Option<&crate::crawler::status::StatusHandle>,
|
||||||
|
) -> anyhow::Result<SyncOutcome> {
|
||||||
|
// Skip if already fetched, unless caller explicitly forces.
|
||||||
|
if !force_refetch {
|
||||||
|
let (page_count,): (i32,) =
|
||||||
|
sqlx::query_as("SELECT page_count FROM chapters WHERE id = $1")
|
||||||
|
.bind(chapter_id)
|
||||||
|
.fetch_one(db)
|
||||||
|
.await
|
||||||
|
.context("read chapter page_count")?;
|
||||||
|
if page_count > 0 {
|
||||||
|
return Ok(SyncOutcome::Skipped);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fetch + classify. With TOR configured, allow up to
|
||||||
|
// CHAPTER_RECIRCUIT_MAX_ATTEMPTS total page fetches with NEWNYM
|
||||||
|
// between each. Without TOR, collapse to 1 attempt (no retry, no
|
||||||
|
// recircuit) — matches the pre-TOR single-shot behavior bit-for-bit.
|
||||||
|
let max_attempts = if tor.is_some() { CHAPTER_RECIRCUIT_MAX_ATTEMPTS } else { 1 };
|
||||||
|
let html = match fetch_chapter_html_with_recircuit(
|
||||||
|
|| fetch_chapter_html_once(browser, rate, source_url),
|
||||||
|
|| async {
|
||||||
|
if let Some(t) = tor {
|
||||||
|
if let Err(e) = t.new_identity().await {
|
||||||
|
tracing::warn!(error = %e, "TOR NEWNYM failed; continuing with same circuit");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
max_attempts,
|
||||||
|
source_url,
|
||||||
|
)
|
||||||
|
.await?
|
||||||
|
{
|
||||||
|
ChapterFetchOutcome::Ok(html) => html,
|
||||||
|
ChapterFetchOutcome::SessionExpired => return Ok(SyncOutcome::SessionExpired),
|
||||||
|
ChapterFetchOutcome::PersistentTransient => {
|
||||||
|
// Surface as a typed Err so the dispatcher path runs
|
||||||
|
// ack_failed with exponential backoff (rather than the
|
||||||
|
// session-expired sticky flag).
|
||||||
|
anyhow::bail!(
|
||||||
|
"chapter page at {source_url} returned a transient response after \
|
||||||
|
{max_attempts} attempt(s); will retry"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let images = parse_chapter_pages(&html)
|
||||||
|
.with_context(|| format!("parse chapter pages at {source_url}"))?;
|
||||||
|
if images.is_empty() {
|
||||||
|
anyhow::bail!("no page images parsed from {source_url}");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resolve image URLs against the chapter URL (they may be relative).
|
||||||
|
let base = reqwest::Url::parse(source_url).context("parse chapter URL")?;
|
||||||
|
|
||||||
|
// Stream each image straight to storage as it's fetched, capping peak
|
||||||
|
// memory at a single image rather than the whole chapter. Track the
|
||||||
|
// keys written so they can be rolled back if a later page (or the
|
||||||
|
// final DB commit) fails — preserving the all-or-nothing guarantee
|
||||||
|
// without holding a DB transaction open across the network puts
|
||||||
|
// (which matters once `Storage` is backed by S3).
|
||||||
|
let total = images.len();
|
||||||
|
// Publish the now-known page total so the dashboard shows "0/N".
|
||||||
|
if let Some(p) = progress {
|
||||||
|
p.set_chapter_pages(chapter_id, 0, Some(total));
|
||||||
|
}
|
||||||
|
let mut written_keys: Vec<String> = Vec::with_capacity(total);
|
||||||
|
let mut stored: Vec<StoredPage> = Vec::with_capacity(total);
|
||||||
|
for img in &images {
|
||||||
|
match download_and_store_page(
|
||||||
|
storage,
|
||||||
|
http,
|
||||||
|
rate,
|
||||||
|
&base,
|
||||||
|
source_url,
|
||||||
|
manga_id,
|
||||||
|
chapter_id,
|
||||||
|
img,
|
||||||
|
allowlist,
|
||||||
|
max_image_bytes,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(page) => {
|
||||||
|
written_keys.push(page.storage_key.clone());
|
||||||
|
stored.push(page);
|
||||||
|
// Live page counter: push the climbing count to subscribers.
|
||||||
|
if let Some(p) = progress {
|
||||||
|
p.set_chapter_pages(chapter_id, stored.len(), Some(total));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
cleanup_orphans(storage, &written_keys).await;
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Short transaction: page rows + page_count only, no network I/O. On
|
||||||
|
// failure, roll back the stored bytes so the chapter stays at
|
||||||
|
// page_count=0 and is retried cleanly next run.
|
||||||
|
if let Err(e) = persist_pages(db, chapter_id, &stored).await {
|
||||||
|
cleanup_orphans(storage, &written_keys).await;
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(SyncOutcome::Fetched { pages: stored.len() })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A page image whose bytes are already in storage but which has no DB row
/// yet. It holds exactly what `persist_pages` binds into the `pages`
/// insert.
pub(crate) struct StoredPage {
    /// Page number within the chapter.
    page_number: i32,
    /// Key the image bytes were written under.
    storage_key: String,
    /// Content type recorded for the image.
    content_type: String,
}
|
||||||
|
|
||||||
|
/// Download a single page image, validate it's really an image, and write
|
||||||
|
/// it to storage. Returns the storage key + content type. Does not touch
|
||||||
|
/// the DB — persistence is batched into one short transaction afterward.
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
async fn download_and_store_page(
|
||||||
|
storage: &dyn Storage,
|
||||||
|
http: &reqwest::Client,
|
||||||
|
rate: &HostRateLimiters,
|
||||||
|
base: &reqwest::Url,
|
||||||
|
source_url: &str,
|
||||||
|
manga_id: Uuid,
|
||||||
|
chapter_id: Uuid,
|
||||||
|
img: &ChapterImage,
|
||||||
|
allowlist: &DownloadAllowlist,
|
||||||
|
max_image_bytes: usize,
|
||||||
|
) -> anyhow::Result<StoredPage> {
|
||||||
|
let url = base
|
||||||
|
.join(&img.url)
|
||||||
|
.with_context(|| format!("join image URL {} onto {source_url}", img.url))?;
|
||||||
|
rate.wait_for(url.as_str()).await?;
|
||||||
|
let bytes = fetch_bytes_capped(http, url.as_str(), Some(source_url), allowlist, max_image_bytes)
|
||||||
|
.await?;
|
||||||
|
// Reject any non-image response: the only valid output of an image URL
|
||||||
|
// is an image. `infer` returns None on truncated bytes too, which also
|
||||||
|
// wants to be a failure not a silent `.bin` extension.
|
||||||
|
if !looks_like_image(&bytes) {
|
||||||
|
anyhow::bail!(
|
||||||
|
"image URL {url} returned non-image bytes \
|
||||||
|
(first 16: {:?}); refusing to store as binary blob",
|
||||||
|
&bytes.get(..16.min(bytes.len()))
|
||||||
|
);
|
||||||
|
}
|
||||||
|
let ext = infer::get(&bytes)
|
||||||
|
.map(|k| k.extension())
|
||||||
|
.expect("looks_like_image asserted infer succeeded");
|
||||||
|
let key = format!(
|
||||||
|
"mangas/{manga_id}/chapters/{chapter_id}/pages/{:04}.{ext}",
|
||||||
|
img.page_number
|
||||||
|
);
|
||||||
|
storage
|
||||||
|
.put(&key, &bytes)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("put {key}"))?;
|
||||||
|
Ok(StoredPage {
|
||||||
|
page_number: img.page_number,
|
||||||
|
storage_key: key,
|
||||||
|
content_type: format!("image/{ext}"),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Persist the page rows + chapter `page_count` in one short transaction.
|
||||||
|
/// `(chapter_id, page_number)` is unique so re-runs are idempotent.
|
||||||
|
pub(crate) async fn persist_pages(
|
||||||
|
db: &PgPool,
|
||||||
|
chapter_id: Uuid,
|
||||||
|
stored: &[StoredPage],
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let mut tx = db.begin().await.context("open chapter sync tx")?;
|
||||||
|
for page in stored {
|
||||||
|
sqlx::query(
|
||||||
|
"INSERT INTO pages (chapter_id, page_number, storage_key, content_type)
|
||||||
|
VALUES ($1, $2, $3, $4)
|
||||||
|
ON CONFLICT (chapter_id, page_number) DO UPDATE
|
||||||
|
SET storage_key = EXCLUDED.storage_key,
|
||||||
|
content_type = EXCLUDED.content_type",
|
||||||
|
)
|
||||||
|
.bind(chapter_id)
|
||||||
|
.bind(page.page_number)
|
||||||
|
.bind(&page.storage_key)
|
||||||
|
.bind(&page.content_type)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("insert page row {}", page.page_number))?;
|
||||||
|
}
|
||||||
|
sqlx::query("UPDATE chapters SET page_count = $1 WHERE id = $2")
|
||||||
|
.bind(stored.len() as i32)
|
||||||
|
.bind(chapter_id)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await
|
||||||
|
.context("update page_count")?;
|
||||||
|
tx.commit().await.context("commit chapter sync")?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Best-effort delete of partially-written page blobs after a chapter sync
|
||||||
|
/// fails, so a retry doesn't accumulate orphans. Errors are logged, not
|
||||||
|
/// raised — a leftover blob is harmless and a future reaper can sweep it.
|
||||||
|
pub(crate) async fn cleanup_orphans(storage: &dyn Storage, keys: &[String]) {
|
||||||
|
for key in keys {
|
||||||
|
if let Err(e) = storage.delete(key).await {
|
||||||
|
tracing::warn!(
|
||||||
|
%key,
|
||||||
|
error = ?e,
|
||||||
|
"failed to delete orphaned page blob after chapter sync failure"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Suppress unused-import warning for `session::registrable_domain`
|
||||||
|
// until the bin/crawler wiring lands in this branch and uses it
|
||||||
|
// through this module.
|
||||||
|
#[allow(dead_code)]
|
||||||
|
fn _keep_session_in_scope() {
|
||||||
|
let _ = session::registrable_domain;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::storage::LocalStorage;

    // --- cleanup_orphans -------------------------------------------------

    #[tokio::test]
    async fn cleanup_orphans_deletes_written_keys() {
        let tmp = tempfile::tempdir().unwrap();
        let store = LocalStorage::new(tmp.path());
        let written: Vec<String> = ["0001", "0002"]
            .iter()
            .map(|page| format!("mangas/m/chapters/c/pages/{page}.jpg"))
            .collect();

        for key in &written {
            store.put(key, b"\xff\xd8\xff\xe0 jpeg-ish").await.unwrap();
            assert!(store.exists(key).await.unwrap());
        }

        cleanup_orphans(&store, &written).await;

        for key in &written {
            assert!(!store.exists(key).await.unwrap(), "{key} should be deleted");
        }
    }

    #[tokio::test]
    async fn cleanup_orphans_tolerates_missing_keys() {
        // Cleanup is best-effort: a key that never reached storage (e.g. the
        // put itself failed) must not turn into an error or a panic.
        let tmp = tempfile::tempdir().unwrap();
        let store = LocalStorage::new(tmp.path());
        let never_written = ["never/written.jpg".to_string()];
        cleanup_orphans(&store, &never_written).await;
    }

    // --- persist_pages ---------------------------------------------------

    #[sqlx::test(migrations = "./migrations")]
    async fn persist_pages_inserts_rows_and_sets_page_count(pool: PgPool) {
        /// Number of `pages` rows currently stored for `chapter_id`.
        async fn page_rows(pool: &PgPool, chapter_id: Uuid) -> i64 {
            sqlx::query_scalar("SELECT COUNT(*) FROM pages WHERE chapter_id = $1")
                .bind(chapter_id)
                .fetch_one(pool)
                .await
                .unwrap()
        }

        let manga_id = Uuid::new_v4();
        let chapter_id = Uuid::new_v4();
        sqlx::query("INSERT INTO mangas (id, title) VALUES ($1, 'T')")
            .bind(manga_id)
            .execute(&pool)
            .await
            .unwrap();
        sqlx::query("INSERT INTO chapters (id, manga_id, number) VALUES ($1, $2, 1)")
            .bind(chapter_id)
            .bind(manga_id)
            .execute(&pool)
            .await
            .unwrap();

        let stored: Vec<StoredPage> = (1..=2)
            .map(|n| StoredPage {
                page_number: n,
                storage_key: format!("k/{n:04}.jpg"),
                content_type: "image/jpeg".into(),
            })
            .collect();

        persist_pages(&pool, chapter_id, &stored).await.unwrap();

        let page_count: i32 = sqlx::query_scalar("SELECT page_count FROM chapters WHERE id = $1")
            .bind(chapter_id)
            .fetch_one(&pool)
            .await
            .unwrap();
        assert_eq!(page_count, 2);
        assert_eq!(page_rows(&pool, chapter_id).await, 2);

        // Second run (the force-refetch path): the row count must not grow.
        persist_pages(&pool, chapter_id, &stored).await.unwrap();
        assert_eq!(
            page_rows(&pool, chapter_id).await,
            2,
            "re-run is idempotent via ON CONFLICT"
        );
    }

    // --- parse_chapter_pages ---------------------------------------------

    #[test]
    fn parse_chapter_pages_skips_loader_and_sorts_by_id() {
        // The loader image, two real pages in reverse order, an image with
        // no id, and an image whose id is not `pageN`.
        let html = r#"
            <html><body id="body"><a id="pic_container">
            <img class="loading" src="/images/ajax-loader2.gif">
            <img id="page2" class="page2" src="https://cdn/2.jpg">
            <img id="page1" class="page1" src="https://cdn/1.jpg">
            <img src="https://cdn/orphan.jpg">
            <img id="not-a-page" src="https://cdn/not-a-page.jpg">
            </a></body></html>
        "#;
        let pages = parse_chapter_pages(html).expect("parse");
        assert_eq!(pages.len(), 2);

        let (first, second) = (&pages[0], &pages[1]);
        assert_eq!(first.page_number, 1);
        assert_eq!(first.url, "https://cdn/1.jpg");
        assert_eq!(second.page_number, 2);
        assert_eq!(second.url, "https://cdn/2.jpg");
    }

    #[test]
    fn parse_chapter_pages_drops_images_without_src() {
        let html = r#"
            <a id="pic_container">
            <img id="page1" src="">
            <img id="page2" src="https://cdn/2.jpg">
            </a>
        "#;
        let pages = parse_chapter_pages(html).expect("parse");
        assert_eq!(pages.len(), 1);
        assert_eq!(pages[0].page_number, 2);
    }

    #[test]
    fn parse_chapter_pages_handles_three_digit_page_ids() {
        let html = r#"
            <a id="pic_container">
            <img id="page126" src="https://cdn/126.jpg">
            <img id="page9" src="https://cdn/9.jpg">
            <img id="page50" src="https://cdn/50.jpg">
            </a>
        "#;
        let pages = parse_chapter_pages(html).expect("parse");
        let numbers: Vec<i32> = pages.iter().map(|p| p.page_number).collect();
        assert_eq!(numbers, vec![9, 50, 126]);
    }

    #[test]
    fn parse_chapter_pages_returns_transient_when_container_missing() {
        // The reader never renders #logo, so the universal logo sentinel
        // is unusable here and a#pic_container is the reader-specific
        // marker instead. The broken-page response trips it.
        let html = "<html><body>\
                    <p>we're sorry, the request file are not found.</p>\
                    </body></html>";
        let err = parse_chapter_pages(html).expect_err("expected Transient");
        assert!(err.is_transient(), "got non-transient: {err}");
    }

    // --- fetch_chapter_html_with_recircuit -------------------------------

    const OK_HTML: &str = r#"<html><body><a id="pic_container"><img id="page1" src="x"/></a></body></html>"#;
    const UNAUTH_HTML: &str = r#"<html><body><header><div id="logo">x</div></header><main>please log in</main></body></html>"#;
    const TRANSIENT_HTML: &str = "<html><body><p>we're sorry, the request file are not found.</p></body></html>";

    /// URL handed to the loop under test.
    const CHAPTER_URL: &str = "https://example/c";

    /// Runs the recircuit loop against a scripted list of responses: the
    /// i-th fetch returns `script[i]`, and the final entry repeats once the
    /// script is exhausted. `script` must not be empty.
    ///
    /// Returns the loop's result together with how many fetches and how
    /// many recircuits it performed.
    async fn run_scripted(
        script: &[&str],
        max_attempts: u32,
    ) -> (anyhow::Result<ChapterFetchOutcome>, u32, u32) {
        let mut fetches = 0u32;
        let mut recircuits = 0u32;
        let result = fetch_chapter_html_with_recircuit(
            || {
                let idx = (fetches as usize).min(script.len() - 1);
                fetches += 1;
                let response: anyhow::Result<String> = Ok(script[idx].to_string());
                std::future::ready(response)
            },
            || {
                recircuits += 1;
                std::future::ready(())
            },
            max_attempts,
            CHAPTER_URL,
        )
        .await;
        (result, fetches, recircuits)
    }

    #[tokio::test]
    async fn recircuit_loop_ok_first_attempt() {
        let (result, fetches, recircuits) = run_scripted(&[OK_HTML], 3).await;
        let outcome = result.expect("ok");
        assert!(matches!(outcome, ChapterFetchOutcome::Ok(_)));
        assert_eq!(fetches, 1);
        assert_eq!(recircuits, 0);
    }

    #[tokio::test]
    async fn recircuit_loop_unauth_with_single_attempt_returns_session_expired() {
        // A budget of 1 is the TOR-disabled case: the first Unauthenticated
        // response fails fast.
        let (result, fetches, recircuits) = run_scripted(&[UNAUTH_HTML], 1).await;
        let outcome = result.expect("ok-result");
        assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired));
        assert_eq!(fetches, 1);
        assert_eq!(recircuits, 0, "no recircuit when budget is 1 (TOR disabled)");
    }

    #[tokio::test]
    async fn recircuit_loop_unauth_then_ok_within_budget() {
        // A budget of 3 allows up to 3 fetches with 2 recircuits in between;
        // here the second fetch already succeeds.
        let (result, fetches, recircuits) = run_scripted(&[UNAUTH_HTML, OK_HTML], 3).await;
        let outcome = result.expect("ok");
        assert!(matches!(outcome, ChapterFetchOutcome::Ok(_)));
        assert_eq!(fetches, 2);
        assert_eq!(recircuits, 1);
    }

    #[tokio::test]
    async fn recircuit_loop_unauth_exhausts_budget_returns_session_expired() {
        let (result, fetches, recircuits) = run_scripted(&[UNAUTH_HTML], 3).await;
        let outcome = result.expect("ok-result");
        assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired));
        assert_eq!(fetches, 3, "max_attempts=3 → 3 fetches total");
        assert_eq!(recircuits, 2, "2 recircuits between 3 fetches");
    }

    #[tokio::test]
    async fn recircuit_loop_transient_then_ok_within_budget() {
        let script = [TRANSIENT_HTML, TRANSIENT_HTML, OK_HTML];
        let (result, fetches, recircuits) = run_scripted(&script, 3).await;
        let outcome = result.expect("ok");
        assert!(matches!(outcome, ChapterFetchOutcome::Ok(_)));
        assert_eq!(fetches, 3);
        assert_eq!(recircuits, 2);
    }

    #[tokio::test]
    async fn recircuit_loop_transient_exhausts_budget_returns_persistent() {
        let (result, fetches, recircuits) = run_scripted(&[TRANSIENT_HTML], 3).await;
        let outcome = result.expect("ok-result");
        assert!(matches!(outcome, ChapterFetchOutcome::PersistentTransient));
        assert_eq!(fetches, 3, "max_attempts=3 → 3 fetches total");
        assert_eq!(recircuits, 2, "2 recircuits between 3 fetches");
    }

    #[tokio::test]
    async fn recircuit_loop_mixed_transient_then_unauth_then_ok_shares_budget() {
        // Audit-prompted regression: every outcome draws from one attempt
        // counter. Transient is attempt 1, Unauth attempt 2, Ok attempt 3.
        let script = [TRANSIENT_HTML, UNAUTH_HTML, OK_HTML];
        let (result, fetches, recircuits) = run_scripted(&script, 3).await;
        let outcome = result.expect("ok");
        assert!(matches!(outcome, ChapterFetchOutcome::Ok(_)));
        assert_eq!(fetches, 3);
        assert_eq!(recircuits, 2);
    }

    #[tokio::test]
    async fn recircuit_loop_propagates_fetch_errors() {
        let mut fetches = 0u32;
        let err = fetch_chapter_html_with_recircuit(
            || {
                fetches += 1;
                let failure: anyhow::Result<String> = Err(anyhow::anyhow!("nav timeout"));
                std::future::ready(failure)
            },
            || std::future::ready(()),
            3,
            CHAPTER_URL,
        )
        .await
        .expect_err("fetch error bubbles");
        assert_eq!(fetches, 1);
        assert!(format!("{err:#}").contains("nav timeout"));
    }
}
|
||||||
744
backend/src/crawler/daemon.rs
Normal file
744
backend/src/crawler/daemon.rs
Normal file
@@ -0,0 +1,744 @@
|
|||||||
|
//! In-process crawler daemon.
|
||||||
|
//!
|
||||||
|
//! Owns a cron task that fires a daily metadata pass and N worker tasks
|
||||||
|
//! that drain `SyncChapterContent` jobs from `crawler_jobs`. The dispatch
|
||||||
|
//! seams ([`MetadataPass`], [`ChapterDispatcher`]) are traits so tests can
|
||||||
|
//! inject stubs without standing up a real Chromium / `Source` impl.
|
||||||
|
//!
|
||||||
|
//! ## Cron
|
||||||
|
//!
|
||||||
|
//! Each tick:
|
||||||
|
//! 1. Acquire a Postgres advisory lock on a dedicated pool connection
|
||||||
|
//! (multi-replica safety). Skip the tick on contention.
|
||||||
|
//! 2. Call [`MetadataPass::run`] (typically `pipeline::run_metadata_pass`).
|
||||||
|
//! 3. Enqueue `SyncChapterContent` jobs for any bookmarked manga whose
|
||||||
|
//! chapters still have `page_count = 0`.
|
||||||
|
//! 4. Reap `done` jobs older than `retention_days`.
|
||||||
|
//! 5. Persist `last_metadata_tick_at` and release the lock.
|
||||||
|
//!
|
||||||
|
//! If the last persisted tick is older than the most recent scheduled slot
|
||||||
|
//! (e.g. backend was down at midnight), the daemon fires immediately on
|
||||||
|
//! startup before resuming the regular schedule.
|
||||||
|
//!
|
||||||
|
//! ## Workers
|
||||||
|
//!
|
||||||
|
//! Each worker leases one chapter-content job at a time, dispatches via the
|
||||||
|
//! [`ChapterDispatcher`], and acks `done` / `failed` / re-`pending` based on
|
||||||
|
//! the outcome. A `SessionExpired` outcome flips the sticky
|
||||||
|
//! `session_expired` flag — all workers idle while it's set (until operator
|
||||||
|
//! restart with a refreshed PHPSESSID).
|
||||||
|
//!
|
||||||
|
//! Worker dispatch is wrapped in `catch_unwind` so a panicking handler
|
||||||
|
//! marks the job failed instead of taking down the worker task.
|
||||||
|
|
||||||
|
use std::panic::AssertUnwindSafe;
|
||||||
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use chrono::{DateTime, Datelike, NaiveTime, TimeZone, Timelike, Utc};
|
||||||
|
use chrono_tz::Tz;
|
||||||
|
use futures_util::FutureExt;
|
||||||
|
use serde_json::json;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tokio::task::JoinSet;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
|
|
||||||
|
use crate::crawler::content::SyncOutcome;
|
||||||
|
use crate::crawler::jobs::{self, JobPayload, Lease, KIND_SYNC_CHAPTER_CONTENT};
|
||||||
|
use crate::crawler::pipeline;
|
||||||
|
use crate::crawler::status::{Phase, StatusHandle};
|
||||||
|
|
||||||
|
/// Key used for the cron tick's Postgres advisory lock
/// (`pg_try_advisory_lock` / `pg_advisory_unlock`).
///
/// The eight ASCII bytes of "MANGALRD" read as a big-endian `i64`, i.e.
/// `0x4D41_4E47_414C_5244`. It is a compile-time constant rather than a
/// config value so that every replica agrees on the lock identity.
pub const CRON_LOCK_KEY: i64 = i64::from_be_bytes(*b"MANGALRD");
|
||||||
|
|
||||||
|
/// `crawler_state.key` under which the timestamp of the last cron tick is
/// persisted (read by `read_last_tick`, written by `write_last_tick`).
const STATE_KEY_LAST_TICK: &str = "last_metadata_tick_at";
|
||||||
|
|
||||||
|
/// Lease window requested from `jobs::lease` and re-requested on every
/// renewal. Deliberately short: the per-job heartbeat (see
/// [`WorkerContext::process_lease`]) keeps extending it, so a job that is
/// slow but healthy does not lapse and get picked up by another worker.
const LEASE_DURATION: Duration = Duration::from_secs(60);

/// Interval between heartbeat renewals while a job is running. One third
/// of [`LEASE_DURATION`] (20 s), which tolerates two missed beats before
/// the lease would expire.
const LEASE_HEARTBEAT: Duration = Duration::from_secs(LEASE_DURATION.as_secs() / 3);
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
pub trait MetadataPass: Send + Sync {
|
||||||
|
async fn run(&self) -> anyhow::Result<pipeline::MetadataStats>;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
pub trait ChapterDispatcher: Send + Sync {
|
||||||
|
async fn dispatch(&self, payload: JobPayload) -> anyhow::Result<SyncOutcome>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Configuration for [`spawn`]. Use `None` for `metadata_pass` to disable
|
||||||
|
/// the cron entirely (worker-pool-only mode — useful when only the
|
||||||
|
/// bookmark-triggered enqueue path is wanted).
|
||||||
|
pub struct DaemonConfig {
|
||||||
|
pub metadata_pass: Option<Arc<dyn MetadataPass>>,
|
||||||
|
pub dispatcher: Arc<dyn ChapterDispatcher>,
|
||||||
|
pub chapter_workers: usize,
|
||||||
|
pub daily_at: NaiveTime,
|
||||||
|
pub tz: Tz,
|
||||||
|
pub retention_days: u32,
|
||||||
|
pub session_expired: Arc<AtomicBool>,
|
||||||
|
/// Live status surface updated by the cron + workers.
|
||||||
|
pub status: StatusHandle,
|
||||||
|
/// Hard upper bound on a single job's dispatch. A job that exceeds it
|
||||||
|
/// is acked failed (exponential backoff) rather than wedging a worker
|
||||||
|
/// forever. Must exceed [`LEASE_HEARTBEAT`] and the realistic
|
||||||
|
/// single-job runtime.
|
||||||
|
pub job_timeout: Duration,
|
||||||
|
/// Tasks that should run alongside the cron + workers and be cancelled
|
||||||
|
/// on shutdown. Used to hand the daemon ownership of the browser
|
||||||
|
/// manager's idle reaper.
|
||||||
|
pub extra_tasks: Vec<tokio::task::JoinHandle<()>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct DaemonHandle {
|
||||||
|
cancel: CancellationToken,
|
||||||
|
join: JoinSet<()>,
|
||||||
|
extra: Vec<tokio::task::JoinHandle<()>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DaemonHandle {
|
||||||
|
/// Trigger shutdown and await all worker / cron / extra tasks.
|
||||||
|
pub async fn shutdown(mut self) {
|
||||||
|
self.cancel.cancel();
|
||||||
|
while self.join.join_next().await.is_some() {}
|
||||||
|
for task in self.extra.drain(..) {
|
||||||
|
let _ = task.await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cancellation token that drives shutdown — exposed so callers
|
||||||
|
/// (`app::spawn_crawler_daemon`) can hand the same token to auxiliary
|
||||||
|
/// tasks (e.g. the BrowserManager idle reaper) and have them stop on
|
||||||
|
/// the daemon's signal.
|
||||||
|
pub fn cancel_token(&self) -> CancellationToken {
|
||||||
|
self.cancel.clone()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Spawn the daemon. Returns immediately; tasks run in the background.
|
||||||
|
/// Pass an external [`CancellationToken`] so auxiliary tasks (e.g. a
|
||||||
|
/// BrowserManager idle reaper) can share the same shutdown signal —
|
||||||
|
/// typically created in the caller, cloned into both spawns.
|
||||||
|
pub fn spawn(pool: PgPool, cancel: CancellationToken, cfg: DaemonConfig) -> DaemonHandle {
|
||||||
|
let mut join = JoinSet::new();
|
||||||
|
|
||||||
|
let DaemonConfig {
|
||||||
|
metadata_pass,
|
||||||
|
dispatcher,
|
||||||
|
chapter_workers,
|
||||||
|
daily_at,
|
||||||
|
tz,
|
||||||
|
retention_days,
|
||||||
|
session_expired,
|
||||||
|
status,
|
||||||
|
job_timeout,
|
||||||
|
extra_tasks,
|
||||||
|
} = cfg;
|
||||||
|
|
||||||
|
if let Some(metadata) = metadata_pass {
|
||||||
|
let ctx = CronContext {
|
||||||
|
pool: pool.clone(),
|
||||||
|
cancel: cancel.clone(),
|
||||||
|
daily_at,
|
||||||
|
tz,
|
||||||
|
retention_days,
|
||||||
|
metadata,
|
||||||
|
status: status.clone(),
|
||||||
|
};
|
||||||
|
join.spawn(async move { ctx.run().await });
|
||||||
|
} else {
|
||||||
|
tracing::info!("crawler daemon: no metadata_pass — cron disabled");
|
||||||
|
}
|
||||||
|
|
||||||
|
for worker_id in 0..chapter_workers.max(1) {
|
||||||
|
let ctx = WorkerContext {
|
||||||
|
pool: pool.clone(),
|
||||||
|
cancel: cancel.clone(),
|
||||||
|
dispatcher: Arc::clone(&dispatcher),
|
||||||
|
session_expired: Arc::clone(&session_expired),
|
||||||
|
status: status.clone(),
|
||||||
|
job_timeout,
|
||||||
|
id: worker_id,
|
||||||
|
};
|
||||||
|
join.spawn(async move { ctx.run().await });
|
||||||
|
}
|
||||||
|
|
||||||
|
DaemonHandle {
|
||||||
|
cancel,
|
||||||
|
join,
|
||||||
|
extra: extra_tasks,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Cron
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
struct CronContext {
|
||||||
|
pool: PgPool,
|
||||||
|
cancel: CancellationToken,
|
||||||
|
daily_at: NaiveTime,
|
||||||
|
tz: Tz,
|
||||||
|
retention_days: u32,
|
||||||
|
metadata: Arc<dyn MetadataPass>,
|
||||||
|
status: StatusHandle,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CronContext {
|
||||||
|
async fn run(self) {
|
||||||
|
// On startup, fire immediately if the most recent slot has already
|
||||||
|
// passed and we never recorded a tick for it.
|
||||||
|
let now = Utc::now();
|
||||||
|
let mut catchup = match read_last_tick(&self.pool).await {
|
||||||
|
Ok(Some(last)) => previous_fire(now, self.daily_at, self.tz) > last,
|
||||||
|
Ok(None) => true,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(?e, "cron: read_last_tick failed; assuming no catch-up");
|
||||||
|
false
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
loop {
|
||||||
|
if catchup {
|
||||||
|
tracing::info!("cron: catch-up tick (missed scheduled slot)");
|
||||||
|
self.run_tick().await;
|
||||||
|
catchup = false;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Recompute next-fire from now() each iteration so clock jumps
|
||||||
|
// (NTP step, suspend/resume) don't strand us on a stale instant.
|
||||||
|
let next = next_fire(Utc::now(), self.daily_at, self.tz);
|
||||||
|
let wait = (next - Utc::now()).to_std().unwrap_or(Duration::ZERO);
|
||||||
|
self.status
|
||||||
|
.set_phase(Phase::Idle {
|
||||||
|
next_fire: Some(next),
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
tracing::info!(
|
||||||
|
next_fire_utc = %next.to_rfc3339(),
|
||||||
|
wait_seconds = wait.as_secs(),
|
||||||
|
"cron: sleeping until next slot"
|
||||||
|
);
|
||||||
|
tokio::select! {
|
||||||
|
_ = tokio::time::sleep(wait) => {}
|
||||||
|
_ = self.cancel.cancelled() => {
|
||||||
|
tracing::info!("cron: shutdown");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.run_tick().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn run_tick(&self) {
|
||||||
|
let mut conn = match self.pool.acquire().await {
|
||||||
|
Ok(c) => c,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::error!(?e, "cron: acquire conn failed; skipping tick");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
// pg_try_advisory_lock is session-scoped — we must hold the same
|
||||||
|
// connection for the unlock or the call silently no-ops on a
|
||||||
|
// different connection from the pool.
|
||||||
|
let acquired: bool = sqlx::query_scalar("SELECT pg_try_advisory_lock($1)")
|
||||||
|
.bind(CRON_LOCK_KEY)
|
||||||
|
.fetch_one(&mut *conn)
|
||||||
|
.await
|
||||||
|
.unwrap_or(false);
|
||||||
|
if !acquired {
|
||||||
|
tracing::info!("cron: tick skipped — another replica holds the lock");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Panic-isolate the tick body the same way `process_lease` does
|
||||||
|
// for worker dispatch. Without this, a panic in metadata.run
|
||||||
|
// (or any of the follow-on steps) would kill the cron task and
|
||||||
|
// no future tick would ever run — workers would keep going but
|
||||||
|
// no new metadata work would be scheduled until daemon restart.
|
||||||
|
// The advisory unlock below runs unconditionally so a panicked
|
||||||
|
// tick doesn't leave the lock held for another replica.
|
||||||
|
let metadata = &self.metadata;
|
||||||
|
let pool = &self.pool;
|
||||||
|
let retention_days = self.retention_days;
|
||||||
|
let status = &self.status;
|
||||||
|
let body = async move {
|
||||||
|
match metadata.run().await {
|
||||||
|
Ok(stats) => {
|
||||||
|
status.record_pass(&stats, Utc::now()).await;
|
||||||
|
tracing::info!(?stats, "cron: metadata pass done");
|
||||||
|
}
|
||||||
|
Err(e) => tracing::error!(?e, "cron: metadata pass failed"),
|
||||||
|
}
|
||||||
|
match pipeline::enqueue_bookmarked_pending(pool).await {
|
||||||
|
Ok(summary) => {
|
||||||
|
tracing::info!(?summary, "cron: enqueued bookmarked-pending");
|
||||||
|
}
|
||||||
|
Err(e) => tracing::error!(?e, "cron: enqueue_bookmarked_pending failed"),
|
||||||
|
}
|
||||||
|
match jobs::reap_done(pool, retention_days).await {
|
||||||
|
Ok(n) => tracing::info!(reaped = n, "cron: done-job reaper finished"),
|
||||||
|
Err(e) => tracing::error!(?e, "cron: done-job reaper failed"),
|
||||||
|
}
|
||||||
|
if let Err(e) = write_last_tick(pool, Utc::now()).await {
|
||||||
|
tracing::warn!(?e, "cron: persist last_metadata_tick_at failed");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if let Err(_panic) = AssertUnwindSafe(body).catch_unwind().await {
|
||||||
|
tracing::error!("cron: tick body panicked — continuing");
|
||||||
|
}
|
||||||
|
|
||||||
|
let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
|
||||||
|
.bind(CRON_LOCK_KEY)
|
||||||
|
.execute(&mut *conn)
|
||||||
|
.await;
|
||||||
|
drop(conn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Workers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
struct WorkerContext {
|
||||||
|
pool: PgPool,
|
||||||
|
cancel: CancellationToken,
|
||||||
|
dispatcher: Arc<dyn ChapterDispatcher>,
|
||||||
|
session_expired: Arc<AtomicBool>,
|
||||||
|
status: StatusHandle,
|
||||||
|
job_timeout: Duration,
|
||||||
|
id: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl WorkerContext {
|
||||||
|
async fn run(self) {
|
||||||
|
loop {
|
||||||
|
if self.cancel.is_cancelled() {
|
||||||
|
tracing::info!(worker = self.id, "worker: shutdown");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if self.session_expired.load(Ordering::Acquire) {
|
||||||
|
tokio::select! {
|
||||||
|
_ = tokio::time::sleep(Duration::from_secs(30)) => continue,
|
||||||
|
_ = self.cancel.cancelled() => return,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let leases = match jobs::lease(
|
||||||
|
&self.pool,
|
||||||
|
Some(KIND_SYNC_CHAPTER_CONTENT),
|
||||||
|
1,
|
||||||
|
LEASE_DURATION,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(v) => v,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(worker = self.id, ?e, "worker: lease failed");
|
||||||
|
tokio::select! {
|
||||||
|
_ = tokio::time::sleep(Duration::from_secs(5)) => continue,
|
||||||
|
_ = self.cancel.cancelled() => return,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let Some(lease) = leases.into_iter().next() else {
|
||||||
|
tokio::select! {
|
||||||
|
_ = tokio::time::sleep(Duration::from_secs(1)) => continue,
|
||||||
|
_ = self.cancel.cancelled() => return,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
self.process_lease(lease).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn process_lease(&self, lease: Lease) {
|
||||||
|
// Consumer-side dedup safety net: if the chapter already has pages
|
||||||
|
// (because a force-refetch race or a job that was re-enqueued
|
||||||
|
// after a previous one finished), ack done without re-fetching.
|
||||||
|
if let JobPayload::SyncChapterContent { chapter_id, .. } = &lease.payload {
|
||||||
|
let page_count = crate::repo::chapter::page_count(&self.pool, *chapter_id)
|
||||||
|
.await
|
||||||
|
.ok()
|
||||||
|
.flatten();
|
||||||
|
if matches!(page_count, Some(n) if n > 0) {
|
||||||
|
let _ = jobs::ack_done(&self.pool, lease.id).await;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Heartbeat: keep the lease fresh while the (potentially long)
|
||||||
|
// dispatch runs, so a slow-but-healthy job is never re-leased and
|
||||||
|
// never inflates `attempts` toward `max_attempts`. Stops itself
|
||||||
|
// once the job is no longer ours (renew returns false).
|
||||||
|
let heartbeat = {
|
||||||
|
let hb_pool = self.pool.clone();
|
||||||
|
let hb_id = lease.id;
|
||||||
|
tokio::spawn(async move {
|
||||||
|
loop {
|
||||||
|
tokio::time::sleep(LEASE_HEARTBEAT).await;
|
||||||
|
match jobs::renew(&hb_pool, hb_id, LEASE_DURATION).await {
|
||||||
|
Ok(true) => {}
|
||||||
|
Ok(false) => break,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(lease_id = %hb_id, ?e, "heartbeat renew failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
};
|
||||||
|
|
||||||
|
// The "currently crawling" chapter (with its live page count) is
|
||||||
|
// registered by the dispatcher itself (RealChapterDispatcher) so it
|
||||||
|
// carries the manga/chapter identity + page progress and is removed
|
||||||
|
// via an RAII guard on every exit path.
|
||||||
|
|
||||||
|
// Outer timeout: a dispatch that exceeds `job_timeout` is acked
|
||||||
|
// failed (exponential backoff) rather than wedging the worker.
|
||||||
|
let dispatch = AssertUnwindSafe(self.dispatcher.dispatch(lease.payload.clone()))
|
||||||
|
.catch_unwind();
|
||||||
|
let outcome = tokio::time::timeout(self.job_timeout, dispatch).await;
|
||||||
|
heartbeat.abort();
|
||||||
|
|
||||||
|
let outcome = match outcome {
|
||||||
|
Ok(o) => o,
|
||||||
|
Err(_elapsed) => {
|
||||||
|
tracing::warn!(
|
||||||
|
worker = self.id,
|
||||||
|
lease_id = %lease.id,
|
||||||
|
timeout_secs = self.job_timeout.as_secs(),
|
||||||
|
"worker: dispatch timed out — ack failed"
|
||||||
|
);
|
||||||
|
let _ = jobs::ack_failed(
|
||||||
|
&self.pool,
|
||||||
|
lease.id,
|
||||||
|
"dispatch timed out",
|
||||||
|
lease.attempts,
|
||||||
|
lease.max_attempts,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
match outcome {
|
||||||
|
Ok(Ok(SyncOutcome::Fetched { .. } | SyncOutcome::Skipped)) => {
|
||||||
|
let _ = jobs::ack_done(&self.pool, lease.id).await;
|
||||||
|
}
|
||||||
|
Ok(Ok(SyncOutcome::SessionExpired)) => {
|
||||||
|
tracing::error!(
|
||||||
|
worker = self.id,
|
||||||
|
lease_id = %lease.id,
|
||||||
|
"session expired — workers will idle until restart"
|
||||||
|
);
|
||||||
|
self.session_expired.store(true, Ordering::Release);
|
||||||
|
// Push the session-expired flip to live status subscribers.
|
||||||
|
self.status.poke();
|
||||||
|
let _ = jobs::release(&self.pool, lease.id).await;
|
||||||
|
}
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
tracing::warn!(
|
||||||
|
worker = self.id,
|
||||||
|
lease_id = %lease.id,
|
||||||
|
error = ?e,
|
||||||
|
"worker: dispatch error — ack failed"
|
||||||
|
);
|
||||||
|
let _ = jobs::ack_failed(
|
||||||
|
&self.pool,
|
||||||
|
lease.id,
|
||||||
|
&format!("{e:#}"),
|
||||||
|
lease.attempts,
|
||||||
|
lease.max_attempts,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
Err(_panic) => {
|
||||||
|
tracing::error!(
|
||||||
|
worker = self.id,
|
||||||
|
lease_id = %lease.id,
|
||||||
|
"worker: dispatcher panicked — ack failed"
|
||||||
|
);
|
||||||
|
let _ = jobs::ack_failed(
|
||||||
|
&self.pool,
|
||||||
|
lease.id,
|
||||||
|
"worker panicked",
|
||||||
|
lease.attempts,
|
||||||
|
lease.max_attempts,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Cron timing primitives
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Compute the next UTC instant when `daily_at` (interpreted in `tz`) will
|
||||||
|
/// fire, strictly after `now`. Handles DST gaps (spring-forward) by
|
||||||
|
/// advancing past the gap; on DST overlap (fall-back) picks the later
|
||||||
|
/// instant so the job runs once, not twice.
|
||||||
|
pub fn next_fire(now: DateTime<Utc>, daily_at: NaiveTime, tz: Tz) -> DateTime<Utc> {
|
||||||
|
let now_local = now.with_timezone(&tz);
|
||||||
|
// Start with today's slot in the local TZ.
|
||||||
|
let mut candidate = local_at(now_local.date_naive(), daily_at, tz);
|
||||||
|
// If today's slot is in the past (or now), roll forward day-by-day.
|
||||||
|
while candidate <= now {
|
||||||
|
let next_day = candidate
|
||||||
|
.with_timezone(&tz)
|
||||||
|
.date_naive()
|
||||||
|
.succ_opt()
|
||||||
|
.unwrap_or_else(|| {
|
||||||
|
// Defensive: succ_opt only fails at chrono's max date.
|
||||||
|
chrono::NaiveDate::from_ymd_opt(
|
||||||
|
candidate.year(),
|
||||||
|
candidate.month(),
|
||||||
|
candidate.day(),
|
||||||
|
)
|
||||||
|
.expect("valid date")
|
||||||
|
});
|
||||||
|
candidate = local_at(next_day, daily_at, tz);
|
||||||
|
}
|
||||||
|
candidate
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The most recent fire instant at or before `now`. Used to detect missed
|
||||||
|
/// slots after a restart.
|
||||||
|
pub fn previous_fire(now: DateTime<Utc>, daily_at: NaiveTime, tz: Tz) -> DateTime<Utc> {
|
||||||
|
let now_local = now.with_timezone(&tz);
|
||||||
|
let today = local_at(now_local.date_naive(), daily_at, tz);
|
||||||
|
if today <= now {
|
||||||
|
return today;
|
||||||
|
}
|
||||||
|
let yesterday = now_local
|
||||||
|
.date_naive()
|
||||||
|
.pred_opt()
|
||||||
|
.expect("a day before now");
|
||||||
|
local_at(yesterday, daily_at, tz)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Resolve a local date+time to a UTC instant in `tz`, navigating DST
|
||||||
|
/// edges deterministically:
|
||||||
|
/// - `LocalResult::Single` → that instant.
|
||||||
|
/// - `LocalResult::Ambiguous(_, latest)` → the later instant (fall-back
|
||||||
|
/// hour). Picking latest means a daily job fires once across the
|
||||||
|
/// repeated hour, not twice.
|
||||||
|
/// - `LocalResult::None` → spring-forward gap. Advance the local time
|
||||||
|
/// by 1 minute and try again, repeating up to 120 times (so the worst
|
||||||
|
/// case is still well inside an hour-long gap).
|
||||||
|
fn local_at(date: chrono::NaiveDate, time: NaiveTime, tz: Tz) -> DateTime<Utc> {
|
||||||
|
use chrono::LocalResult;
|
||||||
|
for offset_minutes in 0..120 {
|
||||||
|
let mut t = time;
|
||||||
|
if offset_minutes > 0 {
|
||||||
|
let added = chrono::NaiveTime::from_num_seconds_from_midnight_opt(
|
||||||
|
((time.num_seconds_from_midnight() as i64 + offset_minutes * 60) % 86_400) as u32,
|
||||||
|
0,
|
||||||
|
)
|
||||||
|
.unwrap_or(time);
|
||||||
|
t = added;
|
||||||
|
}
|
||||||
|
let naive = date.and_time(t);
|
||||||
|
match tz.from_local_datetime(&naive) {
|
||||||
|
LocalResult::Single(dt) => return dt.with_timezone(&Utc),
|
||||||
|
LocalResult::Ambiguous(_, latest) => return latest.with_timezone(&Utc),
|
||||||
|
LocalResult::None => continue,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Should be unreachable — DST gaps are always less than an hour.
|
||||||
|
Utc.from_utc_datetime(&date.and_time(time))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// crawler_state I/O
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async fn read_last_tick(pool: &PgPool) -> sqlx::Result<Option<DateTime<Utc>>> {
|
||||||
|
let row: Option<serde_json::Value> = sqlx::query_scalar(
|
||||||
|
"SELECT value FROM crawler_state WHERE key = $1",
|
||||||
|
)
|
||||||
|
.bind(STATE_KEY_LAST_TICK)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(row.and_then(|v| {
|
||||||
|
v.get("at")
|
||||||
|
.and_then(|s| s.as_str())
|
||||||
|
.and_then(|s| DateTime::parse_from_rfc3339(s).ok())
|
||||||
|
.map(|dt| dt.with_timezone(&Utc))
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn write_last_tick(pool: &PgPool, at: DateTime<Utc>) -> sqlx::Result<()> {
|
||||||
|
sqlx::query(
|
||||||
|
"INSERT INTO crawler_state (key, value, updated_at) \
|
||||||
|
VALUES ($1, $2, now()) \
|
||||||
|
ON CONFLICT (key) DO UPDATE \
|
||||||
|
SET value = EXCLUDED.value, updated_at = now()",
|
||||||
|
)
|
||||||
|
.bind(STATE_KEY_LAST_TICK)
|
||||||
|
.bind(json!({ "at": at.to_rfc3339() }))
|
||||||
|
.execute(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Test helpers (not gated on cfg(test) — integration tests in tests/ dir
|
||||||
|
// need them too).
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
pub mod test_support {
|
||||||
|
//! Lightweight stubs the daemon tests use. Public because integration
|
||||||
|
//! tests live outside this module.
|
||||||
|
use super::*;
|
||||||
|
use std::sync::atomic::AtomicUsize;
|
||||||
|
|
||||||
|
pub struct CountingMetadataPass {
|
||||||
|
pub count: AtomicUsize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for CountingMetadataPass {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
count: AtomicUsize::new(0),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl MetadataPass for CountingMetadataPass {
|
||||||
|
async fn run(&self) -> anyhow::Result<pipeline::MetadataStats> {
|
||||||
|
self.count.fetch_add(1, Ordering::AcqRel);
|
||||||
|
Ok(pipeline::MetadataStats::default())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub type DispatchFn = Arc<
|
||||||
|
dyn Fn(JobPayload) -> futures_util::future::BoxFuture<'static, anyhow::Result<SyncOutcome>>
|
||||||
|
+ Send
|
||||||
|
+ Sync,
|
||||||
|
>;
|
||||||
|
|
||||||
|
pub struct StubDispatcher {
|
||||||
|
pub handler: DispatchFn,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl ChapterDispatcher for StubDispatcher {
|
||||||
|
async fn dispatch(&self, payload: JobPayload) -> anyhow::Result<SyncOutcome> {
|
||||||
|
(self.handler)(payload).await
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn always_done() -> Arc<StubDispatcher> {
|
||||||
|
Arc::new(StubDispatcher {
|
||||||
|
handler: Arc::new(|_| Box::pin(async { Ok(SyncOutcome::Fetched { pages: 1 }) })),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn panicking_dispatcher() -> Arc<StubDispatcher> {
|
||||||
|
Arc::new(StubDispatcher {
|
||||||
|
handler: Arc::new(|_| Box::pin(async { panic!("intentional dispatcher panic") })),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a UTC instant at whole-minute precision.
    fn dt_utc(y: i32, mo: u32, d: u32, h: u32, mi: u32) -> DateTime<Utc> {
        Utc.with_ymd_and_hms(y, mo, d, h, mi, 0).unwrap()
    }

    #[test]
    fn next_fire_in_utc_at_midnight_advances_one_day() {
        let now = dt_utc(2026, 5, 25, 12, 0); // noon UTC
        let at = NaiveTime::from_hms_opt(0, 0, 0).unwrap();
        let next = next_fire(now, at, Tz::UTC);
        // Next midnight is May 26 00:00 UTC.
        assert_eq!(next, dt_utc(2026, 5, 26, 0, 0));
    }

    #[test]
    fn next_fire_before_today_slot_returns_today() {
        let now = dt_utc(2026, 5, 25, 23, 0); // 23:00 UTC
        let at = NaiveTime::from_hms_opt(23, 30, 0).unwrap();
        let next = next_fire(now, at, Tz::UTC);
        assert_eq!(next, dt_utc(2026, 5, 25, 23, 30));
    }

    #[test]
    fn next_fire_skips_spring_forward_gap_in_europe_berlin() {
        // 2024-03-31: clocks jump 02:00 -> 03:00 in Berlin (CET -> CEST),
        // so local 02:30 does not exist that day. We compute `next_fire`
        // at 2024-03-31 00:30 UTC (= 01:30 CET, i.e. just before the gap).
        let now = dt_utc(2024, 3, 31, 0, 30); // 01:30 local Berlin (CET = UTC+1)
        let at = NaiveTime::from_hms_opt(2, 30, 0).unwrap();
        let next = next_fire(now, at, Tz::Europe__Berlin);
        // The 1-minute probe walks 02:30, 02:31, ... (all inside the gap)
        // and lands on the first valid local time, 03:00 CEST, which is
        // 01:00 UTC (CEST = UTC+2).
        assert!(next > now);
        assert_eq!(next, dt_utc(2024, 3, 31, 1, 0));
    }

    #[test]
    fn next_fire_on_fall_back_picks_later_instant() {
        // 2024-10-27: clocks jump 03:00 -> 02:00 (CEST -> CET) in Berlin.
        // 02:30 happens twice on that day. We pick the later one.
        let now = dt_utc(2024, 10, 26, 12, 0); // day before, noon UTC
        let at = NaiveTime::from_hms_opt(2, 30, 0).unwrap();
        let next = next_fire(now, at, Tz::Europe__Berlin);
        // First 02:30 local is 00:30 UTC (CEST = UTC+2).
        // Second 02:30 local is 01:30 UTC (CET = UTC+1).
        // We expect the later instant: 01:30 UTC on 2024-10-27.
        assert_eq!(next, dt_utc(2024, 10, 27, 1, 30));
    }

    #[test]
    fn previous_fire_returns_today_when_now_is_after_slot() {
        let now = dt_utc(2026, 5, 25, 12, 0); // noon UTC
        let at = NaiveTime::from_hms_opt(0, 0, 0).unwrap();
        let prev = previous_fire(now, at, Tz::UTC);
        assert_eq!(prev, dt_utc(2026, 5, 25, 0, 0));
    }

    #[test]
    fn previous_fire_returns_yesterday_when_now_is_before_today_slot() {
        let now = dt_utc(2026, 5, 25, 8, 0); // 08:00 UTC
        let at = NaiveTime::from_hms_opt(23, 30, 0).unwrap();
        let prev = previous_fire(now, at, Tz::UTC);
        assert_eq!(prev, dt_utc(2026, 5, 24, 23, 30));
    }

    /// Documents the panic-isolation pattern `run_tick` now relies on:
    /// `AssertUnwindSafe(...).catch_unwind().await` must yield `Err(_)`
    /// when the wrapped future panics, so the surrounding loop (or in
    /// our case, the unconditional advisory-unlock that follows) keeps
    /// running. The shape of this test mirrors the production callsite.
    #[tokio::test]
    async fn assert_unwind_safe_catches_a_panicking_future() {
        let result = AssertUnwindSafe(async {
            panic!("boom");
        })
        .catch_unwind()
        .await;
        assert!(result.is_err(), "panicking future must yield Err");
    }
}
|
||||||
362
backend/src/crawler/detect.rs
Normal file
362
backend/src/crawler/detect.rs
Normal file
@@ -0,0 +1,362 @@
|
|||||||
|
//! Transient-page detection.
|
||||||
|
//!
|
||||||
|
//! The target site occasionally responds with a 403 + tiny "we're sorry,
|
||||||
|
//! the request file are not found" body on pages that actually exist.
|
||||||
|
//! Selectors on that body match nothing, which is indistinguishable from
|
||||||
|
//! a genuinely empty page unless we look for the broken-page markers
|
||||||
|
//! explicitly. The same shape covers full-site outages: 5xx pages,
|
||||||
|
//! Cloudflare interstitials, and "site is down" placeholders all share
|
||||||
|
//! the trait that the normal layout (`#logo` in the header) is absent.
|
||||||
|
//!
|
||||||
|
//! Helpers here are split into two signals so callers can compose them:
|
||||||
|
//! - [`is_broken_page_body`]: pattern-match on the known broken-page
|
||||||
|
//! string. Works for *any* page on the site, including the reader,
|
||||||
|
//! which doesn't render `#logo`.
|
||||||
|
//! - [`has_logo_sentinel`]: assert `#logo` is in the parsed DOM. Site-
|
||||||
|
//! structural marker — present on the manga list, manga detail,
|
||||||
|
//! chapter-list, and login probe pages. **Not** present on the reader,
|
||||||
|
//! so callers in the reader path must rely on the body signature only.
|
||||||
|
//!
|
||||||
|
//! [`PageError::Transient`] is the typed signal returned by parser and
|
||||||
|
//! navigate wrappers. Job handlers map it to "reschedule with backoff"
|
||||||
|
//! rather than the per-page silent skip the parsers used to do.
|
||||||
|
|
||||||
|
use std::future::Future;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
/// Prose fragment that identifies the site's broken-page body.
///
/// Stored in lowercase because it is matched case-insensitively against
/// the response body. The kaomoji that accompanies the sentence on the
/// real page is deliberately left out of the marker: that part is more
/// likely to change than the prose.
const BROKEN_PAGE_MARKER: &str = "we're sorry, the request file are not found";
|
||||||
|
|
||||||
|
/// Outcome of a page fetch or parse when the caller wants to
|
||||||
|
/// distinguish "site/page is transiently broken — retry later" from
|
||||||
|
/// other errors. `Transient` is the only retry-friendly variant; every
|
||||||
|
/// other failure mode stays as `anyhow::Error` and is treated as today.
|
||||||
|
#[derive(Debug, Error)]
|
||||||
|
pub enum PageError {
|
||||||
|
/// Page came back but the site signaled trouble — broken-page body
|
||||||
|
/// signature, structural sentinel missing, etc. Caller should
|
||||||
|
/// reschedule this fetch rather than treat it as data.
|
||||||
|
#[error("transient page error: {reason}")]
|
||||||
|
Transient { reason: String },
|
||||||
|
#[error(transparent)]
|
||||||
|
Other(#[from] anyhow::Error),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PageError {
|
||||||
|
pub fn transient(reason: impl Into<String>) -> Self {
|
||||||
|
Self::Transient { reason: reason.into() }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_transient(&self) -> bool {
|
||||||
|
matches!(self, Self::Transient { .. })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true when the response body matches the known broken-page
|
||||||
|
/// template. Case-insensitive substring match — small bodies (~150B)
|
||||||
|
/// make the scan trivially fast, and the broken page is always tiny so
|
||||||
|
/// false positives on a real catalog page are not a concern.
|
||||||
|
pub fn is_broken_page_body(html: &str) -> bool {
|
||||||
|
html.to_ascii_lowercase().contains(BROKEN_PAGE_MARKER)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true when the parsed document contains `#logo` — the site's
|
||||||
|
/// header logo element, present on every full-layout page and absent on
|
||||||
|
/// the broken-page response and on the reader.
|
||||||
|
pub fn has_logo_sentinel(doc: &scraper::Html) -> bool {
|
||||||
|
let sel = scraper::Selector::parse("#logo").expect("#logo is a valid selector");
|
||||||
|
doc.select(&sel).next().is_some()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Retry `op` up to `max_attempts` times whenever it returns
|
||||||
|
/// [`PageError::Transient`], sleeping `delay` between attempts.
|
||||||
|
/// Non-transient errors short-circuit immediately. Used by discover-loop
|
||||||
|
/// callers so a single broken page doesn't drop the whole walk — the
|
||||||
|
/// caller can fall back on the job system's retry/backoff once the
|
||||||
|
/// inline budget is exhausted.
|
||||||
|
pub async fn retry_on_transient<F, Fut, T>(
|
||||||
|
op: F,
|
||||||
|
max_attempts: u32,
|
||||||
|
delay: Duration,
|
||||||
|
) -> Result<T, PageError>
|
||||||
|
where
|
||||||
|
F: FnMut() -> Fut,
|
||||||
|
Fut: Future<Output = Result<T, PageError>>,
|
||||||
|
{
|
||||||
|
retry_on_transient_with_hook(op, max_attempts, delay, || async {}).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Like [`retry_on_transient`] but invokes `on_retry` between a
|
||||||
|
/// transient failure and the subsequent sleep+retry. The hook does
|
||||||
|
/// **not** fire on the first attempt, after a non-transient error, or
|
||||||
|
/// after the final attempt (no retry follows). Hook failures are not
|
||||||
|
/// propagated — return `()` from the future and log inside if needed.
|
||||||
|
///
|
||||||
|
/// Wire the TOR controller's `new_identity` here to rotate circuits
|
||||||
|
/// between page-fetch retries; see [`crate::crawler::tor`].
|
||||||
|
pub async fn retry_on_transient_with_hook<F, Fut, T, H, HFut>(
|
||||||
|
mut op: F,
|
||||||
|
max_attempts: u32,
|
||||||
|
delay: Duration,
|
||||||
|
mut on_retry: H,
|
||||||
|
) -> Result<T, PageError>
|
||||||
|
where
|
||||||
|
F: FnMut() -> Fut,
|
||||||
|
Fut: Future<Output = Result<T, PageError>>,
|
||||||
|
H: FnMut() -> HFut,
|
||||||
|
HFut: Future<Output = ()>,
|
||||||
|
{
|
||||||
|
debug_assert!(max_attempts >= 1, "max_attempts must be at least 1");
|
||||||
|
let mut attempt = 0u32;
|
||||||
|
loop {
|
||||||
|
attempt += 1;
|
||||||
|
match op().await {
|
||||||
|
Ok(v) => return Ok(v),
|
||||||
|
Err(e) if !e.is_transient() => return Err(e),
|
||||||
|
Err(e) if attempt >= max_attempts => return Err(e),
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
attempt,
|
||||||
|
max_attempts,
|
||||||
|
error = %e,
|
||||||
|
"transient error; running on-retry hook and sleeping before retry"
|
||||||
|
);
|
||||||
|
on_retry().await;
|
||||||
|
tokio::time::sleep(delay).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unit tests for broken-page detection, the logo sentinel, `PageError`,
// and the transient-retry helpers (with and without the on-retry hook).
#[cfg(test)]
mod tests {
    use super::*;

    // The marker is found inside a full broken-page document, kaomoji and all.
    #[test]
    fn broken_page_body_matches_exact_template() {
        let html = "<html><head></head><body>\
                    <p>we're sorry, the request file are not found. Σ(っ°Д °;)っ</p>\
                    </body></html>";
        assert!(is_broken_page_body(html));
    }

    #[test]
    fn broken_page_body_is_case_insensitive() {
        let html = "<p>WE'RE SORRY, THE REQUEST FILE ARE NOT FOUND.</p>";
        assert!(is_broken_page_body(html));
    }

    #[test]
    fn broken_page_body_does_not_match_normal_listing() {
        let html = "<html><body><div id='logo'></div>\
                    <ul><li>Manga A</li><li>Manga B</li></ul></body></html>";
        assert!(!is_broken_page_body(html));
    }

    #[test]
    fn broken_page_body_does_not_match_empty_string() {
        assert!(!is_broken_page_body(""));
    }

    #[test]
    fn logo_sentinel_present_on_normal_page() {
        let doc = scraper::Html::parse_document(
            "<html><body><div id='logo'>Site</div><main>...</main></body></html>",
        );
        assert!(has_logo_sentinel(&doc));
    }

    #[test]
    fn logo_sentinel_absent_on_broken_page() {
        let doc = scraper::Html::parse_document(
            "<html><head></head><body>\
             <p>we're sorry, the request file are not found.</p></body></html>",
        );
        assert!(!has_logo_sentinel(&doc));
    }

    #[test]
    fn logo_sentinel_absent_on_empty_document() {
        let doc = scraper::Html::parse_document("");
        assert!(!has_logo_sentinel(&doc));
    }

    // Pins both the variant and the exact `Display` output.
    #[test]
    fn page_error_transient_constructor_sets_reason() {
        let e = PageError::transient("logo missing");
        assert!(e.is_transient());
        assert_eq!(e.to_string(), "transient page error: logo missing");
    }

    #[test]
    fn page_error_other_is_not_transient() {
        let e: PageError = anyhow::anyhow!("something else").into();
        assert!(!e.is_transient());
    }

    // Two transient failures followed by a success: three calls in total.
    #[tokio::test]
    async fn retry_returns_ok_after_a_transient_streak() {
        let mut attempt = 0u32;
        let result: Result<i32, PageError> = retry_on_transient(
            || {
                attempt += 1;
                let n = attempt;
                async move {
                    if n < 3 {
                        Err(PageError::transient("not yet"))
                    } else {
                        Ok(42)
                    }
                }
            },
            5,
            Duration::from_millis(0),
        )
        .await;
        assert_eq!(result.unwrap(), 42);
        assert_eq!(attempt, 3);
    }

    #[tokio::test]
    async fn retry_gives_up_after_max_attempts_on_persistent_transient() {
        let mut attempt = 0u32;
        let result: Result<i32, PageError> = retry_on_transient(
            || {
                attempt += 1;
                async { Err(PageError::transient("always")) }
            },
            3,
            Duration::from_millis(0),
        )
        .await;
        let err = result.expect_err("expected Transient");
        assert!(err.is_transient());
        assert_eq!(attempt, 3, "retried max_attempts times, no more");
    }

    #[tokio::test]
    async fn retry_does_not_retry_non_transient_errors() {
        let mut attempt = 0u32;
        let result: Result<i32, PageError> = retry_on_transient(
            || {
                attempt += 1;
                async { Err(PageError::Other(anyhow::anyhow!("permanent"))) }
            },
            5,
            Duration::from_millis(0),
        )
        .await;
        assert!(result.is_err());
        assert!(!result.unwrap_err().is_transient());
        assert_eq!(attempt, 1, "non-transient must fail immediately");
    }

    // The 60s delay would stall the test if a sleep happened on success.
    #[tokio::test]
    async fn retry_returns_ok_on_first_attempt_without_sleeping() {
        let mut attempt = 0u32;
        let result: Result<i32, PageError> = retry_on_transient(
            || {
                attempt += 1;
                async { Ok(7) }
            },
            5,
            Duration::from_secs(60),
        )
        .await;
        assert_eq!(result.unwrap(), 7);
        assert_eq!(attempt, 1);
    }

    #[tokio::test]
    async fn hook_fires_once_between_transient_and_success() {
        let mut attempt = 0u32;
        let mut hook_calls = 0u32;
        let result: Result<i32, PageError> = retry_on_transient_with_hook(
            || {
                attempt += 1;
                let n = attempt;
                async move {
                    if n < 2 {
                        Err(PageError::transient("once"))
                    } else {
                        Ok(99)
                    }
                }
            },
            5,
            Duration::from_millis(0),
            || {
                hook_calls += 1;
                async {}
            },
        )
        .await;
        assert_eq!(result.unwrap(), 99);
        assert_eq!(attempt, 2);
        assert_eq!(hook_calls, 1, "hook fires exactly once between attempts");
    }

    #[tokio::test]
    async fn hook_does_not_fire_when_first_attempt_succeeds() {
        let mut hook_calls = 0u32;
        let result: Result<i32, PageError> = retry_on_transient_with_hook(
            || async { Ok(1) },
            5,
            Duration::from_millis(0),
            || {
                hook_calls += 1;
                async {}
            },
        )
        .await;
        assert!(result.is_ok());
        assert_eq!(hook_calls, 0);
    }

    #[tokio::test]
    async fn hook_does_not_fire_after_non_transient_error() {
        let mut hook_calls = 0u32;
        let result: Result<i32, PageError> = retry_on_transient_with_hook(
            || async { Err(PageError::Other(anyhow::anyhow!("permanent"))) },
            5,
            Duration::from_millis(0),
            || {
                hook_calls += 1;
                async {}
            },
        )
        .await;
        assert!(result.is_err());
        assert_eq!(hook_calls, 0, "non-transient must short-circuit before hook");
    }

    #[tokio::test]
    async fn hook_does_not_fire_after_final_failed_attempt() {
        // With max_attempts=3 and three persistent transients, the hook
        // should run twice (between 1→2 and 2→3) — never a third time,
        // because no retry follows attempt 3.
        let mut attempt = 0u32;
        let mut hook_calls = 0u32;
        let result: Result<i32, PageError> = retry_on_transient_with_hook(
            || {
                attempt += 1;
                async { Err(PageError::transient("always")) }
            },
            3,
            Duration::from_millis(0),
            || {
                hook_calls += 1;
                async {}
            },
        )
        .await;
        assert!(result.is_err());
        assert_eq!(attempt, 3);
        assert_eq!(hook_calls, 2, "hook fires N-1 times for N attempts that all fail transient");
    }
}
|
||||||
15
backend/src/crawler/diff.rs
Normal file
15
backend/src/crawler/diff.rs
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
//! Change-detection rules between the source and our DB.
|
||||||
|
//!
|
||||||
|
//! | Event | Signal |
|
||||||
|
//! |--------------------|----------------------------------------------------------------------------------------|
|
||||||
|
//! | New manga | `(source_id, source_manga_key)` not in `manga_sources` |
|
||||||
|
//! | Updated metadata | freshly computed `metadata_hash` differs from the stored one |
|
||||||
|
//! | Dropped manga | `last_seen_at < discover_run_started_at` for N consecutive successful discover runs |
|
||||||
|
//! | New chapter | `(source_id, source_chapter_key)` not in `chapter_sources` |
|
||||||
|
//! | Dropped chapter | present in DB but absent from the latest `fetch_chapter_list` for the same manga |
|
||||||
|
//!
|
||||||
|
//! Dropped is always a soft flag (`dropped_at`), never a row delete —
|
||||||
|
//! restoring is a matter of clearing the flag if the source brings the
|
||||||
|
//! item back.
|
||||||
|
//!
|
||||||
|
//! Scaffold only — implementations land once `repo::crawler` exists.
|
||||||
371
backend/src/crawler/jobs.rs
Normal file
371
backend/src/crawler/jobs.rs
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
//! Persistent job queue and its job kinds.
|
||||||
|
//!
|
||||||
|
//! Backed by Postgres (the `crawler_jobs` table). Workers lease rows
|
||||||
|
//! with `SELECT ... FOR UPDATE SKIP LOCKED`, heartbeat via
|
||||||
|
//! `leased_until`, and ack by transitioning to `done` (or backoff /
|
||||||
|
//! `dead`). Handlers are idempotent so a crash mid-run is recoverable
|
||||||
|
//! by replay.
|
||||||
|
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
/// Work item carried by a queue row. [`enqueue`] serializes it to JSON
/// for the `payload` column and [`lease`] decodes it again.
///
/// Serde tags the JSON internally with a `kind` field holding the
/// snake_case variant name (e.g. `sync_chapter_content`), which is what
/// `lease()`'s `kind_filter` compares against.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum JobPayload {
    /// Fetch one manga's detail page, upsert metadata, enqueue
    /// `SyncChapterList`.
    SyncManga {
        source_id: String,
        source_manga_key: String,
    },
    /// Diff the chapter list, enqueue `SyncChapterContent` for new
    /// chapters, soft-drop vanished ones.
    SyncChapterList {
        source_id: String,
        manga_id: Uuid,
        source_manga_key: String,
    },
    /// Download a single chapter's page images into storage.
    SyncChapterContent {
        source_id: String,
        chapter_id: Uuid,
        source_chapter_key: String,
    },
}
|
||||||
|
|
||||||
|
/// Lifecycle state of a `crawler_jobs` row, mapped to a Postgres `text`
/// value in snake_case (`pending`, `running`, ...).
///
/// NOTE(review): the queries in this module only ever write `pending`,
/// `running`, `done` and `dead`; confirm whether `failed` is still set
/// anywhere.
#[derive(Clone, Copy, Debug, sqlx::Type, Serialize, Deserialize)]
#[sqlx(type_name = "text", rename_all = "snake_case")]
#[serde(rename_all = "snake_case")]
pub enum JobState {
    /// Waiting to be leased (new, released, or scheduled for a retry).
    Pending,
    /// Currently leased by a worker.
    Running,
    /// Completed successfully.
    Done,
    Failed,
    /// Failed with its attempts used up; `ack_failed` stops retrying it.
    Dead,
}
|
||||||
|
|
||||||
|
/// Kind discriminator stored in `payload->>'kind'`. Public so callers
/// (daemon worker, bookmark hook) can filter `lease()` to a single kind
/// without re-spelling the literal.
///
/// Must stay equal to the snake_case serde tag of
/// [`JobPayload::SyncChapterContent`].
pub const KIND_SYNC_CHAPTER_CONTENT: &str = "sync_chapter_content";
|
||||||
|
|
||||||
|
/// Outcome of [`enqueue`].
#[derive(Debug)]
pub enum EnqueueResult {
    /// A new row was inserted; carries its id.
    Inserted(Uuid),
    /// The insert hit a conflict and was dropped (`ON CONFLICT DO NOTHING`),
    /// so no row was created.
    Skipped,
}
|
||||||
|
|
||||||
|
/// One job handed to a worker by [`lease`].
#[derive(Debug, Clone)]
pub struct Lease {
    /// Id of the `crawler_jobs` row; pass it to `renew` / `ack_*` / `release`.
    pub id: Uuid,
    /// Decoded job payload.
    pub payload: JobPayload,
    /// Attempt count *after* this lease incremented it (first lease is 1).
    pub attempts: i32,
    /// The row's `max_attempts` value; `ack_failed` compares `attempts`
    /// against it.
    pub max_attempts: i32,
}
|
||||||
|
|
||||||
|
/// Un-jittered exponential backoff used for `ack_failed` retries.
///
/// `attempts` is the post-increment count reported by `lease()`: the
/// first failure (`attempts == 1`) waits 60s and every further attempt
/// doubles that, up to a ceiling of one hour so a retry is never
/// scheduled absurdly far out. Zero or negative counts are treated like
/// the first attempt. Jitter is layered on separately by [`apply_jitter`].
fn backoff_base(attempts: i32) -> Duration {
    const FIRST_DELAY_SECS: u64 = 60;
    const CEILING_SECS: u64 = 3600;

    // Number of doublings past the first attempt, bounded so the power
    // below cannot overflow.
    let doublings = (attempts.max(1) - 1).min(20) as u32;
    let uncapped = FIRST_DELAY_SECS.saturating_mul(2u64.pow(doublings));
    Duration::from_secs(uncapped.min(CEILING_SECS))
}
|
||||||
|
|
||||||
|
/// Apply ±20% jitter to a backoff duration.
///
/// `jitter` is a fraction, normally in `[0.0, 1.0)` (e.g.
/// `rand::random::<f64>()`), mapped linearly to a multiplier in
/// `[0.8, 1.2]`. Out-of-range values, infinities included, are clamped to
/// the nearest bound. NaN is treated as the neutral midpoint (multiplier
/// 1.0): `f64::clamp` would pass NaN through, and the resulting NaN
/// product casts to a zero-second backoff, i.e. an immediate retry.
///
/// The result is rounded to whole seconds. Pure so the bounds stay
/// unit-testable. Spreading retries avoids a thundering herd when a
/// source outage fails many jobs at once.
fn apply_jitter(base: Duration, jitter: f64) -> Duration {
    let frac = if jitter.is_nan() {
        0.5
    } else {
        jitter.clamp(0.0, 1.0)
    };
    let mult = 0.8 + 0.4 * frac; // [0.8, 1.2]
    Duration::from_secs((base.as_secs_f64() * mult).round() as u64)
}
|
||||||
|
|
||||||
|
/// Jittered exponential backoff for `ack_failed`. Wraps [`backoff_base`]
|
||||||
|
/// with a random ±20% spread.
|
||||||
|
fn backoff_for(attempts: i32) -> Duration {
|
||||||
|
apply_jitter(backoff_base(attempts), rand::random::<f64>())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Insert a new pending job. For `SyncChapterContent` payloads the
|
||||||
|
/// partial unique index `crawler_jobs_chapter_content_dedup_idx` blocks
|
||||||
|
/// a second `(pending|running)` insert per chapter_id, returning
|
||||||
|
/// `Skipped`. The slot frees again once the previous job leaves the
|
||||||
|
/// in-flight states (done/failed/dead), so a re-enqueue after a force
|
||||||
|
/// refetch succeeds.
|
||||||
|
pub async fn enqueue(pool: &PgPool, payload: &JobPayload) -> sqlx::Result<EnqueueResult> {
|
||||||
|
let json = serde_json::to_value(payload).expect("JobPayload is always serializable");
|
||||||
|
let id: Option<Uuid> = sqlx::query_scalar(
|
||||||
|
"INSERT INTO crawler_jobs (payload) VALUES ($1) \
|
||||||
|
ON CONFLICT DO NOTHING RETURNING id",
|
||||||
|
)
|
||||||
|
.bind(json)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(match id {
|
||||||
|
Some(id) => EnqueueResult::Inserted(id),
|
||||||
|
None => EnqueueResult::Skipped,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Lease up to `max` rows whose `state` is `pending`, or `running` with
|
||||||
|
/// an expired `leased_until` (the crashed-worker recovery path). The
|
||||||
|
/// inner CTE uses `FOR UPDATE SKIP LOCKED` so concurrent leasers don't
|
||||||
|
/// block each other and each row is handed to exactly one worker.
|
||||||
|
///
|
||||||
|
/// `kind_filter` matches against `payload->>'kind'`; `None` means
|
||||||
|
/// any kind.
|
||||||
|
///
|
||||||
|
/// Ties on `scheduled_at` (the common case: a cron batch enqueues
|
||||||
|
/// everything with the same default `now()`) break by `created_at`, so
|
||||||
|
/// jobs come off the queue in insertion order. The enqueue paths insert
|
||||||
|
/// chapter-content jobs in ascending `chapters.number` order, so this
|
||||||
|
/// tiebreaker is what propagates that intent through to dequeue.
|
||||||
|
pub async fn lease(
|
||||||
|
pool: &PgPool,
|
||||||
|
kind_filter: Option<&str>,
|
||||||
|
max: i64,
|
||||||
|
lease_duration: Duration,
|
||||||
|
) -> sqlx::Result<Vec<Lease>> {
|
||||||
|
let lease_ms: i64 = lease_duration.as_millis().min(i64::MAX as u128) as i64;
|
||||||
|
let rows: Vec<(Uuid, serde_json::Value, i32, i32)> = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
WITH leased AS (
|
||||||
|
SELECT id FROM crawler_jobs
|
||||||
|
WHERE (state = 'pending' OR (state = 'running' AND leased_until < now()))
|
||||||
|
AND scheduled_at <= now()
|
||||||
|
AND ($1::text IS NULL OR payload->>'kind' = $1)
|
||||||
|
ORDER BY scheduled_at, created_at
|
||||||
|
LIMIT $2
|
||||||
|
FOR UPDATE SKIP LOCKED
|
||||||
|
)
|
||||||
|
UPDATE crawler_jobs j
|
||||||
|
SET state = 'running',
|
||||||
|
attempts = j.attempts + 1,
|
||||||
|
leased_until = now() + ($3::bigint || ' milliseconds')::interval,
|
||||||
|
updated_at = now()
|
||||||
|
FROM leased l
|
||||||
|
WHERE j.id = l.id
|
||||||
|
RETURNING j.id, j.payload, j.attempts, j.max_attempts
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(kind_filter)
|
||||||
|
.bind(max)
|
||||||
|
.bind(lease_ms)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let mut leases = Vec::with_capacity(rows.len());
|
||||||
|
for (id, payload_json, attempts, max_attempts) in rows {
|
||||||
|
let payload: JobPayload = serde_json::from_value(payload_json).map_err(|e| {
|
||||||
|
sqlx::Error::Decode(format!("invalid JobPayload JSON for job {id}: {e}").into())
|
||||||
|
})?;
|
||||||
|
leases.push(Lease {
|
||||||
|
id,
|
||||||
|
payload,
|
||||||
|
attempts,
|
||||||
|
max_attempts,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Ok(leases)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extend the lease on a still-owned `running` job. Returns `true` if the
|
||||||
|
/// row was updated (we still hold the lease), `false` if the job is no
|
||||||
|
/// longer `running` (re-leased after a missed heartbeat, or already
|
||||||
|
/// acked) — the caller's heartbeat loop should stop. The `state =
|
||||||
|
/// 'running'` guard mirrors [`ack_done`]'s rationale.
|
||||||
|
///
|
||||||
|
/// This is the heartbeat primitive: a worker renews periodically while a
|
||||||
|
/// long-but-healthy job runs so `leased_until` never lapses, which would
|
||||||
|
/// otherwise let another worker steal the in-flight job and spuriously
|
||||||
|
/// inflate `attempts` toward `max_attempts`.
|
||||||
|
pub async fn renew(
|
||||||
|
pool: &PgPool,
|
||||||
|
lease_id: Uuid,
|
||||||
|
lease_duration: Duration,
|
||||||
|
) -> sqlx::Result<bool> {
|
||||||
|
let lease_ms: i64 = lease_duration.as_millis().min(i64::MAX as u128) as i64;
|
||||||
|
let res = sqlx::query(
|
||||||
|
"UPDATE crawler_jobs \
|
||||||
|
SET leased_until = now() + ($2::bigint || ' milliseconds')::interval, \
|
||||||
|
updated_at = now() \
|
||||||
|
WHERE id = $1 AND state = 'running'",
|
||||||
|
)
|
||||||
|
.bind(lease_id)
|
||||||
|
.bind(lease_ms)
|
||||||
|
.execute(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(res.rows_affected() > 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mark a leased job as successfully completed. The `state = 'running'`
|
||||||
|
/// predicate guards against a late ack from a worker whose lease expired
|
||||||
|
/// and was already re-leased by another worker: without it, the late ack
|
||||||
|
/// would clobber the new lease's `state` and `leased_until`. `rows_affected
|
||||||
|
/// == 0` means we lost the lease — surfaced as a warn rather than an
|
||||||
|
/// error because the new lease holder is doing real work; the late ack
|
||||||
|
/// just has to step aside.
|
||||||
|
pub async fn ack_done(pool: &PgPool, lease_id: Uuid) -> sqlx::Result<()> {
|
||||||
|
let res = sqlx::query(
|
||||||
|
"UPDATE crawler_jobs \
|
||||||
|
SET state = 'done', leased_until = NULL, updated_at = now() \
|
||||||
|
WHERE id = $1 AND state = 'running'",
|
||||||
|
)
|
||||||
|
.bind(lease_id)
|
||||||
|
.execute(pool)
|
||||||
|
.await?;
|
||||||
|
if res.rows_affected() == 0 {
|
||||||
|
tracing::warn!(
|
||||||
|
%lease_id,
|
||||||
|
"ack_done: lease no longer running — likely re-leased by another worker; skipping update"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mark a leased job as failed. If the current attempt count has reached
|
||||||
|
/// `max_attempts` the job is terminally dead and stops retrying;
|
||||||
|
/// otherwise it goes back to `pending` with `scheduled_at` pushed into
|
||||||
|
/// the future by the exponential backoff. See [`ack_done`] for the
|
||||||
|
/// `state = 'running'` guard rationale.
|
||||||
|
pub async fn ack_failed(
|
||||||
|
pool: &PgPool,
|
||||||
|
lease_id: Uuid,
|
||||||
|
error: &str,
|
||||||
|
attempts: i32,
|
||||||
|
max_attempts: i32,
|
||||||
|
) -> sqlx::Result<()> {
|
||||||
|
let res = if attempts >= max_attempts {
|
||||||
|
sqlx::query(
|
||||||
|
"UPDATE crawler_jobs \
|
||||||
|
SET state = 'dead', last_error = $2, leased_until = NULL, updated_at = now() \
|
||||||
|
WHERE id = $1 AND state = 'running'",
|
||||||
|
)
|
||||||
|
.bind(lease_id)
|
||||||
|
.bind(error)
|
||||||
|
.execute(pool)
|
||||||
|
.await?
|
||||||
|
} else {
|
||||||
|
let backoff_ms: i64 = backoff_for(attempts).as_millis().min(i64::MAX as u128) as i64;
|
||||||
|
sqlx::query(
|
||||||
|
"UPDATE crawler_jobs \
|
||||||
|
SET state = 'pending', last_error = $2, leased_until = NULL, \
|
||||||
|
scheduled_at = now() + ($3::bigint || ' milliseconds')::interval, \
|
||||||
|
updated_at = now() \
|
||||||
|
WHERE id = $1 AND state = 'running'",
|
||||||
|
)
|
||||||
|
.bind(lease_id)
|
||||||
|
.bind(error)
|
||||||
|
.bind(backoff_ms)
|
||||||
|
.execute(pool)
|
||||||
|
.await?
|
||||||
|
};
|
||||||
|
if res.rows_affected() == 0 {
|
||||||
|
tracing::warn!(
|
||||||
|
%lease_id,
|
||||||
|
"ack_failed: lease no longer running — likely re-leased by another worker; skipping update"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return a leased job to `pending` without burning a retry attempt.
|
||||||
|
/// Used on graceful shutdown and on session-expired aborts where the
|
||||||
|
/// failure isn't the job's fault. See [`ack_done`] for the
|
||||||
|
/// `state = 'running'` guard rationale — important here because
|
||||||
|
/// `attempts - 1` would otherwise spuriously decrement the new lease's
|
||||||
|
/// attempt count.
|
||||||
|
pub async fn release(pool: &PgPool, lease_id: Uuid) -> sqlx::Result<()> {
|
||||||
|
let res = sqlx::query(
|
||||||
|
"UPDATE crawler_jobs \
|
||||||
|
SET state = 'pending', leased_until = NULL, \
|
||||||
|
attempts = GREATEST(0, attempts - 1), updated_at = now() \
|
||||||
|
WHERE id = $1 AND state = 'running'",
|
||||||
|
)
|
||||||
|
.bind(lease_id)
|
||||||
|
.execute(pool)
|
||||||
|
.await?;
|
||||||
|
if res.rows_affected() == 0 {
|
||||||
|
tracing::warn!(
|
||||||
|
%lease_id,
|
||||||
|
"release: lease no longer running — likely re-leased by another worker; skipping update"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Delete `done` jobs whose `updated_at` is older than `retention_days`
|
||||||
|
/// days. `0` disables the reaper without touching the table. Returns the
|
||||||
|
/// number of rows removed.
|
||||||
|
pub async fn reap_done(pool: &PgPool, retention_days: u32) -> sqlx::Result<u64> {
|
||||||
|
if retention_days == 0 {
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
let result = sqlx::query(
|
||||||
|
"DELETE FROM crawler_jobs \
|
||||||
|
WHERE state = 'done' \
|
||||||
|
AND updated_at < now() - ($1::bigint || ' days')::interval",
|
||||||
|
)
|
||||||
|
.bind(retention_days as i64)
|
||||||
|
.execute(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(result.rows_affected())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unit tests for the pure backoff helpers; the SQL-backed queue functions
// are not exercised here.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn backoff_base_grows_exponentially_and_caps_at_one_hour() {
        // attempts == 1 → 60s, doubling each step.
        assert_eq!(backoff_base(1), Duration::from_secs(60));
        assert_eq!(backoff_base(2), Duration::from_secs(120));
        assert_eq!(backoff_base(3), Duration::from_secs(240));
        assert_eq!(backoff_base(4), Duration::from_secs(480));
        assert_eq!(backoff_base(5), Duration::from_secs(960));
        assert_eq!(backoff_base(6), Duration::from_secs(1920));
        // 7th: 60 * 64 = 3840 → capped to 3600.
        assert_eq!(backoff_base(7), Duration::from_secs(3600));
        assert_eq!(backoff_base(20), Duration::from_secs(3600));
        // Garbage / zero / negatives stay sane.
        assert_eq!(backoff_base(0), Duration::from_secs(60));
        assert_eq!(backoff_base(-5), Duration::from_secs(60));
    }

    #[test]
    fn apply_jitter_stays_within_plus_minus_twenty_percent() {
        let base = Duration::from_secs(100);
        // Lower bound (jitter = 0.0) → 0.8x.
        assert_eq!(apply_jitter(base, 0.0), Duration::from_secs(80));
        // Midpoint (jitter = 0.5) → 1.0x.
        assert_eq!(apply_jitter(base, 0.5), Duration::from_secs(100));
        // Upper end (jitter → 1.0) → ~1.2x.
        assert_eq!(apply_jitter(base, 1.0), Duration::from_secs(120));
        // Out-of-range inputs are clamped, never panic.
        assert_eq!(apply_jitter(base, -3.0), Duration::from_secs(80));
        assert_eq!(apply_jitter(base, 9.0), Duration::from_secs(120));
    }

    #[test]
    fn backoff_for_random_jitter_stays_in_band() {
        // The production wrapper draws its own randomness; assert the
        // result for a mid-range attempt always lands within the jitter
        // band of the base, across many draws.
        let base = backoff_base(3).as_secs_f64(); // 240s
        for _ in 0..1000 {
            let v = backoff_for(3).as_secs_f64();
            // The ±1.0 slack absorbs rounding to whole seconds.
            assert!(
                v >= base * 0.8 - 1.0 && v <= base * 1.2 + 1.0,
                "jittered backoff {v} outside band of base {base}"
            );
        }
    }
}
|
||||||
33
backend/src/crawler/mod.rs
Normal file
33
backend/src/crawler/mod.rs
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
//! Crawler subsystem.
|
||||||
|
//!
|
||||||
|
//! Runs as its own binary (`src/bin/crawler.rs`) and shares `domain`,
|
||||||
|
//! `repo`, and `storage` with the API binary. Layering mirrors the
|
||||||
|
//! `Storage` trait pattern: callers depend on the `source::Source`
|
||||||
|
//! trait, not on a concrete site; new sites plug in as additional
|
||||||
|
//! impls without touching the job runner.
|
||||||
|
//!
|
||||||
|
//! Submodules:
|
||||||
|
//! - [`browser`]: launches and pools Chromium via `chromiumoxide`.
|
||||||
|
//! First run downloads a known-good build via the `fetcher` feature.
|
||||||
|
//! - [`source`]: the `Source` trait. Per-site impls live alongside it.
|
||||||
|
//! - [`jobs`]: job kinds, queue wrapper, handler dispatch.
|
||||||
|
//! - [`diff`]: change detection — new / updated / dropped semantics.
|
||||||
|
|
||||||
|
pub mod browser;
|
||||||
|
pub mod browser_manager;
|
||||||
|
pub mod content;
|
||||||
|
pub mod daemon;
|
||||||
|
pub mod detect;
|
||||||
|
pub mod diff;
|
||||||
|
pub mod jobs;
|
||||||
|
pub mod nav;
|
||||||
|
pub mod pipeline;
|
||||||
|
pub mod rate_limit;
|
||||||
|
pub mod resync;
|
||||||
|
pub mod safety;
|
||||||
|
pub mod session;
|
||||||
|
pub mod session_control;
|
||||||
|
pub mod source;
|
||||||
|
pub mod status;
|
||||||
|
pub mod tor;
|
||||||
|
pub mod url_utils;
|
||||||
241
backend/src/crawler/nav.rs
Normal file
241
backend/src/crawler/nav.rs
Normal file
@@ -0,0 +1,241 @@
|
|||||||
|
//! Page navigation helpers — wrap `chromiumoxide` `wait_for_navigation`
|
||||||
|
//! with a timeout so a hung TLS handshake or a page that never fires
|
||||||
|
//! `load` cannot wedge a worker (or the cron metadata pass) forever.
|
||||||
|
//!
|
||||||
|
//! [`NAV_TIMEOUT`] is the global budget. Callers in the crawler use
|
||||||
|
//! [`wait_for_nav`] to get back a typed error so transient timeouts can
|
||||||
|
//! be reported separately from underlying CDP errors.
|
||||||
|
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use chromiumoxide::error::CdpError;
|
||||||
|
use chromiumoxide::Page;
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
/// Maximum wall-clock time we'll wait for a single page navigation. A
/// healthy Chromium reaches `load` in well under a second on the target
/// site; a 30-second cap is generous enough for slow TLS handshakes on
/// the first request after a fresh process while still catching real
/// hangs before they wedge the daemon.
///
/// Applied by [`wait_for_nav`]; [`wait_for_selector`] takes its own
/// budget as a parameter instead.
pub const NAV_TIMEOUT: Duration = Duration::from_secs(30);
|
||||||
|
|
||||||
|
/// Error returned by the navigation wait helpers. `Timeout` is the
/// transient signal callers translate into a retry-friendly error
/// ([`crate::crawler::detect::PageError::Transient`] in the source path,
/// a context'd anyhow elsewhere). `Cdp` carries the underlying
/// chromiumoxide error unchanged.
#[derive(Debug, Error)]
pub enum NavError {
    /// The wait exceeded its budget; carries the budget that was applied
    /// ([`NAV_TIMEOUT`] for [`wait_for_nav`], the caller's `timeout` for
    /// [`wait_for_selector`]).
    #[error("navigation timed out after {0:?}")]
    Timeout(Duration),
    /// The underlying CDP call failed before the budget ran out.
    #[error(transparent)]
    Cdp(#[from] CdpError),
}
|
||||||
|
|
||||||
|
/// Wait for the page's next navigation to complete, capped at
|
||||||
|
/// [`NAV_TIMEOUT`]. Replaces bare `page.wait_for_navigation().await`
|
||||||
|
/// throughout the crawler.
|
||||||
|
pub async fn wait_for_nav(page: &Page) -> Result<(), NavError> {
|
||||||
|
match tokio::time::timeout(NAV_TIMEOUT, page.wait_for_navigation()).await {
|
||||||
|
Err(_elapsed) => Err(NavError::Timeout(NAV_TIMEOUT)),
|
||||||
|
Ok(Err(e)) => Err(NavError::Cdp(e)),
|
||||||
|
Ok(Ok(_)) => Ok(()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Poll interval for [`wait_for_selector`]. 100ms is fast enough that a
|
||||||
|
/// page rendering in 200ms isn't held back noticeably, and slow enough
|
||||||
|
/// not to spam CDP with `find_element` calls on a page that's actually
|
||||||
|
/// taking its time.
|
||||||
|
const SELECTOR_POLL_INTERVAL: Duration = Duration::from_millis(100);
|
||||||
|
|
||||||
|
/// Wait until `selector` matches at least one element on `page`, or
|
||||||
|
/// `timeout` elapses. Used after a navigation to confirm a page-type-
|
||||||
|
/// specific marker is in the DOM before parsing — replaces the fixed
|
||||||
|
/// post-nav sleep that previously masked partial-render races.
|
||||||
|
///
|
||||||
|
/// chromiumoxide 0.7.0 has no built-in `wait_for_selector`, so we poll
|
||||||
|
/// `find_element` at [`SELECTOR_POLL_INTERVAL`] until success or budget
|
||||||
|
/// exhaustion. A failed `find_element` is *not* an error here — it just
|
||||||
|
/// means "not yet" — we only surface an error once the overall
|
||||||
|
/// `timeout` is up.
|
||||||
|
pub async fn wait_for_selector(
|
||||||
|
page: &Page,
|
||||||
|
selector: &str,
|
||||||
|
timeout: Duration,
|
||||||
|
) -> Result<(), NavError> {
|
||||||
|
let deadline = tokio::time::Instant::now() + timeout;
|
||||||
|
loop {
|
||||||
|
if page.find_element(selector).await.is_ok() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
if tokio::time::Instant::now() >= deadline {
|
||||||
|
return Err(NavError::Timeout(timeout));
|
||||||
|
}
|
||||||
|
let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
|
||||||
|
let sleep_for = SELECTOR_POLL_INTERVAL.min(remaining);
|
||||||
|
tokio::time::sleep(sleep_for).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Per-page-type budget for [`wait_for_selector`]. Shorter than
|
||||||
|
/// [`NAV_TIMEOUT`] because by the time we're waiting on a selector, the
|
||||||
|
/// page has already responded — we're only absorbing post-load JS
|
||||||
|
/// finishing its row injection, which on a healthy site takes well
|
||||||
|
/// under a second.
|
||||||
|
///
/// NOTE(review): nothing in the visible part of this module reads the
/// constant; callers are presumably expected to pass it as the `timeout`
/// argument of [`wait_for_selector`] — confirm at the call sites.
pub const SELECTOR_TIMEOUT: Duration = Duration::from_secs(10);
|
||||||
|
|
||||||
|
impl NavError {
|
||||||
|
/// Does this navigation error indicate the underlying Chromium
|
||||||
|
/// process has died or its CDP connection has dropped? Used by the
|
||||||
|
/// dispatcher to decide whether to invalidate the
|
||||||
|
/// [`crate::crawler::browser_manager::BrowserManager`] handle so
|
||||||
|
/// the next acquire re-launches.
|
||||||
|
///
|
||||||
|
/// Both variants count: a `Timeout` past [`NAV_TIMEOUT`] is in
|
||||||
|
/// practice always either a hung CDP transport or a wedged page
|
||||||
|
/// the browser can't recover from on its own, and a `Cdp` error
|
||||||
|
/// surfacing at the navigation layer means the chromium-facing
|
||||||
|
/// channel is the failing layer.
|
||||||
|
pub fn is_likely_browser_dead(&self) -> bool {
|
||||||
|
match self {
|
||||||
|
Self::Timeout(_) => true,
|
||||||
|
Self::Cdp(_) => true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Walk an `anyhow::Error` chain looking for typed evidence that the
|
||||||
|
/// chromium-facing layer is the failing one. Two markers count:
|
||||||
|
///
|
||||||
|
/// 1. A wrapped [`NavError`] flagged by [`NavError::is_likely_browser_dead`].
|
||||||
|
/// 2. A wrapped [`CdpError`] (via `anyhow::Error::from(CdpError)` at a
|
||||||
|
/// `Browser::new_page` call site, or any other direct CDP boundary).
|
||||||
|
///
|
||||||
|
/// Earlier versions also substring-matched the chain for "connection",
|
||||||
|
/// "closed", "channel", etc. as a fallback. That was too broad —
|
||||||
|
/// reqwest TCP-reset errors during CDN image downloads, sqlx
|
||||||
|
/// connection-pool errors, and similar non-browser failures contain
|
||||||
|
/// those words and triggered spurious chromium relaunches. The typed
|
||||||
|
/// downcasts cover every place we hand a chromium error to anyhow,
|
||||||
|
/// so the fallback is unnecessary.
|
||||||
|
pub fn anyhow_looks_browser_dead(err: &anyhow::Error) -> bool {
|
||||||
|
for cause in err.chain() {
|
||||||
|
if let Some(nav) = cause.downcast_ref::<NavError>() {
|
||||||
|
if nav.is_likely_browser_dead() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if cause.downcast_ref::<CdpError>().is_some() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::future::pending;

    /// Pins the primitive [`wait_for_nav`] is built on: wrapping a future
    /// that never resolves in `tokio::time::timeout` must produce
    /// `Elapsed` once the budget is spent. A real `Page` is impractical
    /// to stand up in a unit test, so the underlying behaviour is
    /// asserted instead.
    #[tokio::test(flavor = "current_thread", start_paused = true)]
    async fn timeout_elapses_on_a_future_that_never_resolves() {
        let budget = Duration::from_millis(50);
        let outcome = tokio::time::timeout(budget, pending::<()>()).await;
        assert!(outcome.is_err(), "expected Elapsed on a hung future");
    }

    #[test]
    fn nav_error_timeout_message_includes_duration() {
        let rendered = NavError::Timeout(Duration::from_secs(30)).to_string();
        assert_eq!(rendered, "navigation timed out after 30s");
    }

    #[test]
    fn timeout_is_treated_as_likely_browser_dead() {
        assert!(NavError::Timeout(NAV_TIMEOUT).is_likely_browser_dead());
    }

    #[test]
    fn anyhow_with_nav_timeout_in_chain_is_flagged() {
        // The NavError sits one level below the context message, so the
        // chain walk (not just the top-level error) has to find it.
        let nav_failure = NavError::Timeout(NAV_TIMEOUT);
        let wrapped: anyhow::Error =
            anyhow::Error::new(nav_failure).context("wait for chapter nav");
        assert!(anyhow_looks_browser_dead(&wrapped));
    }

    #[test]
    fn anyhow_with_cdp_error_in_chain_is_flagged() {
        // `Browser::new_page` failures are converted with
        // `anyhow::Error::from(CdpError)` at the navigate / dispatch call
        // sites; the chain walk plus a CdpError downcast is what detects
        // them. Any variant qualifies — `Serde` is simply the easiest one
        // to build without a browser.
        let parse_failure: serde_json::Error =
            serde_json::from_str::<i32>("not a number").unwrap_err();
        let wrapped: anyhow::Error =
            anyhow::Error::from(CdpError::Serde(parse_failure)).context("open chapter page");
        assert!(anyhow_looks_browser_dead(&wrapped));
    }

    #[test]
    fn anyhow_with_innocuous_parse_error_is_not_flagged() {
        let unrelated: anyhow::Error =
            anyhow::anyhow!("parse manga detail: chapter row regex did not match");
        assert!(!anyhow_looks_browser_dead(&unrelated));
    }

    #[test]
    fn anyhow_with_reqwest_style_connection_message_is_not_flagged() {
        // Regression guard for the removed substring fallback, which
        // treated any message containing "connection" or "closed" as a
        // dead browser. A CDN TCP reset during an image download or a
        // sqlx pool error would then burn a chromium relaunch although
        // the browser was healthy. Untyped strings like these must pass
        // through unflagged.
        let non_browser_messages = [
            "error sending request: connection reset by peer",
            "PoolTimedOut: timed out waiting for a connection",
            "request to https://cdn/x.jpg: connection closed before message completed",
            "transport error during image fetch",
        ];
        for msg in non_browser_messages {
            let untyped: anyhow::Error = anyhow::anyhow!("{msg}");
            assert!(
                !anyhow_looks_browser_dead(&untyped),
                "must not flag non-browser error: {msg}"
            );
        }
    }

    /// Companion to [`timeout_elapses_on_a_future_that_never_resolves`]
    /// for the polling shape used by [`wait_for_selector`]: the loop has
    /// to give up once the deadline passes instead of spinning on.
    #[tokio::test(flavor = "current_thread", start_paused = true)]
    async fn selector_polling_pattern_surrenders_at_deadline() {
        let budget = Duration::from_millis(300);
        let started = tokio::time::Instant::now();
        let deadline = started + budget;
        // Stand-in for a `find_element` that reports "not found" forever.
        let mut attempts = 0u32;
        let outcome: Result<(), NavError> = loop {
            attempts += 1;
            if tokio::time::Instant::now() >= deadline {
                break Err(NavError::Timeout(budget));
            }
            tokio::time::sleep(SELECTOR_POLL_INTERVAL).await;
        };
        assert!(matches!(outcome, Err(NavError::Timeout(_))));
        // 300ms at a 100ms interval is roughly three sleeping iterations
        // plus the final one that breaks out; the bound is loose because
        // the first attempt happens before any sleep.
        assert!(
            attempts >= 3,
            "expected at least 3 poll iterations, got {attempts}"
        );
    }
}
|
||||||
906
backend/src/crawler/pipeline.rs
Normal file
906
backend/src/crawler/pipeline.rs
Normal file
@@ -0,0 +1,906 @@
|
|||||||
|
//! Crawler pipeline — the reusable metadata pass and the enqueue helpers
|
||||||
|
//! that fan out chapter-content work. Shared between the daemon (cron tick)
|
||||||
|
//! and the CLI (`bin/crawler.rs`) so behavior stays in lockstep.
|
||||||
|
|
||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
|
use anyhow::Context;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::crawler::browser_manager::BrowserManager;
|
||||||
|
use crate::crawler::jobs::{self, EnqueueResult, JobPayload};
|
||||||
|
use crate::crawler::rate_limit::HostRateLimiters;
|
||||||
|
use crate::crawler::safety::{fetch_bytes_capped, looks_like_image, DownloadAllowlist};
|
||||||
|
use crate::crawler::source::target::TargetSource;
|
||||||
|
use crate::crawler::source::{FetchContext, Source, SourceMangaRef};
|
||||||
|
use crate::repo;
|
||||||
|
use crate::repo::crawler::UpsertStatus;
|
||||||
|
use crate::storage::Storage;
|
||||||
|
|
||||||
|
/// Coarse counters surfaced for logging at the end of a metadata pass.
|
||||||
|
#[derive(Debug, Default, Clone, Copy)]
|
||||||
|
pub struct MetadataStats {
|
||||||
|
/// Refs taken up for processing. Refs skipped as already handled in
/// the same pass are not counted.
pub discovered: usize,
|
||||||
|
/// Refs whose `upsert_manga_from_source` call succeeded.
pub upserted: usize,
|
||||||
|
/// Cover downloads that completed successfully.
pub covers_fetched: usize,
|
||||||
|
/// Refs abandoned before a successful upsert: `fetch_manga` failed,
/// the partial-render guard tripped, the prior-chapter-count lookup
/// failed, or the upsert itself failed.
pub mangas_failed: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decide whether the per-ref loop should stop on the manga just
|
||||||
|
/// processed. The walk halts only when (a) the previous run exited
|
||||||
|
/// cleanly — so the index tail is known to be caught up and we're not
|
||||||
|
/// in a recovery sweep — AND (b) this manga's metadata hash matched
|
||||||
|
/// storage (`Unchanged`) AND (c) the chapter sync confirmed zero new
|
||||||
|
/// chapters. A `None` chapter count (skip_chapters, or a chapter-sync
|
||||||
|
/// error we logged-and-swallowed) refuses the stop because we can't
|
||||||
|
/// verify the tail is unchanged from a single piece of evidence.
|
||||||
|
///
|
||||||
|
/// Pure function so the rule is unit-testable without the walker, DB,
|
||||||
|
/// or browser.
|
||||||
|
pub(crate) fn should_stop(
|
||||||
|
was_clean: bool,
|
||||||
|
status: UpsertStatus,
|
||||||
|
chapters_new: Option<usize>,
|
||||||
|
) -> bool {
|
||||||
|
was_clean
|
||||||
|
&& matches!(status, UpsertStatus::Unchanged)
|
||||||
|
&& chapters_new == Some(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decides whether the walk that just ended counts as a clean exit.
///
/// `true` means the recovery flag is written back to `completed: true`.
/// `false` leaves it at `false`, so the next tick treats this run as
/// crashed and performs a recovery sweep.
///
/// A clean exit is either reaching the end of the source
/// (`walked_to_completion`) or the intentional early stop
/// (`hit_stop_condition`).
///
/// The caller-imposed `CRAWLER_LIMIT` cap (`hit_limit`) is deliberately
/// *not* a parameter: a capped walk never reaches the catalog tail, so it
/// can never be clean. Leaving it out of the signature, rather than
/// writing `&& !hit_limit` inline, stops a later edit from slipping it
/// back into the truth table.
pub(crate) fn should_mark_clean_exit(
    walked_to_completion: bool,
    hit_stop_condition: bool,
) -> bool {
    if walked_to_completion {
        return true;
    }
    hit_stop_condition
}
|
||||||
|
|
||||||
|
/// Circuit breaker for the metadata walk: returns `true` once the run of
/// `consecutive` `fetch_manga` failures has reached `threshold`.
///
/// A `threshold` of 0 switches the breaker off (unbounded, the legacy
/// behaviour). When the breaker fires the caller must NOT mark a clean
/// exit, so that the next tick recovery-sweeps the catalog tail this pass
/// never reached.
///
/// Pure, so the rule is unit-testable without the walker.
pub(crate) fn should_abort_pass(consecutive: u32, threshold: u32) -> bool {
    match threshold {
        // Breaker disabled.
        0 => false,
        limit => consecutive >= limit,
    }
}
|
||||||
|
|
||||||
|
/// Runs the discover → fetch → upsert → cover → chapter-list-diff pipeline
|
||||||
|
/// for the target source. Pure metadata; chapter content is enqueued as
|
||||||
|
/// separate `SyncChapterContent` jobs by the caller after this returns.
|
||||||
|
///
|
||||||
|
/// `limit == 0` means no cap (full sweep up to the source's own bound).
|
||||||
|
/// `skip_chapters == true` is the "metadata-only" mode (parser doesn't
|
||||||
|
/// extract chapters, and `sync_manga_chapters` is skipped — otherwise an
|
||||||
|
/// empty chapter list would soft-drop existing rows). In this mode the
|
||||||
|
/// stop condition never fires because chapter freshness can't be
|
||||||
|
/// confirmed, so the walk always runs to end-of-source.
|
||||||
|
///
|
||||||
|
/// The walk is always newest-first. Steady-state runs stop on the first
|
||||||
|
/// manga where metadata is `Unchanged` AND chapter sync reports zero
|
||||||
|
/// new chapters — the source orders by `update_date DESC`, so anything
|
||||||
|
/// with a fresh chapter or fresh metadata is bumped to the top and will
|
||||||
|
/// be processed before we hit a fully-caught-up manga.
|
||||||
|
///
|
||||||
|
/// A per-source recovery flag stored in `crawler_state`
|
||||||
|
/// (`last_run_completed:<source_id>`) gates the early stop: it's set to
|
||||||
|
/// `false` right after `ensure_source` and back to `true` only when the
|
||||||
|
/// run exits via end-of-walk OR the intentional stop. A crash, panic,
|
||||||
|
/// or SIGKILL leaves the flag at `false`, so the next tick reads it,
|
||||||
|
/// recognizes the previous run did not exit cleanly, and walks the
|
||||||
|
/// full catalog (ignoring the stop condition) to re-cover anything the
|
||||||
|
/// crashed run missed past its crash point. Once that recovery sweep
|
||||||
|
/// reaches end-of-walk, steady-state resumes.
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
pub async fn run_metadata_pass(
|
||||||
|
browser_manager: &BrowserManager,
|
||||||
|
db: &PgPool,
|
||||||
|
storage: &dyn Storage,
|
||||||
|
http: &reqwest::Client,
|
||||||
|
rate: &HostRateLimiters,
|
||||||
|
start_url: &str,
|
||||||
|
limit: usize,
|
||||||
|
skip_chapters: bool,
|
||||||
|
allowlist: &DownloadAllowlist,
|
||||||
|
max_image_bytes: usize,
|
||||||
|
max_consecutive_failures: u32,
|
||||||
|
status: Option<&crate::crawler::status::StatusHandle>,
|
||||||
|
tor: Option<&crate::crawler::tor::TorController>,
|
||||||
|
) -> anyhow::Result<MetadataStats> {
|
||||||
|
let lease = browser_manager
|
||||||
|
.acquire()
|
||||||
|
.await
|
||||||
|
.context("acquire browser lease for metadata pass")?;
|
||||||
|
let browser_ref: &chromiumoxide::Browser = &lease;
|
||||||
|
if let Some(s) = status {
|
||||||
|
s.set_phase(crate::crawler::status::Phase::WalkingList).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
let source = {
|
||||||
|
let s = TargetSource::new(start_url.to_string());
|
||||||
|
if skip_chapters {
|
||||||
|
s.without_chapter_parsing()
|
||||||
|
} else {
|
||||||
|
s
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let ctx = FetchContext {
|
||||||
|
browser: browser_ref,
|
||||||
|
rate,
|
||||||
|
tor,
|
||||||
|
};
|
||||||
|
|
||||||
|
let source_id = source.id();
|
||||||
|
repo::crawler::ensure_source(
|
||||||
|
db,
|
||||||
|
source_id,
|
||||||
|
"Target Site",
|
||||||
|
&origin_of(start_url).unwrap_or_else(|| start_url.to_string()),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.context("ensure_source")?;
|
||||||
|
|
||||||
|
// Read BEFORE flipping to "in-flight" — a `false` here means the
|
||||||
|
// previous run didn't reach a clean exit, and this run must walk
|
||||||
|
// the full catalog (recovery sweep) instead of bailing on the
|
||||||
|
// first caught-up manga.
|
||||||
|
let was_clean = repo::crawler::last_run_completed_cleanly(db, source_id)
|
||||||
|
.await
|
||||||
|
.context("read last_run_completed_cleanly")?;
|
||||||
|
repo::crawler::mark_run_started(db, source_id)
|
||||||
|
.await
|
||||||
|
.context("mark_run_started")?;
|
||||||
|
|
||||||
|
let max_refs = (limit > 0).then_some(limit);
|
||||||
|
|
||||||
|
tracing::info!(was_clean, ?max_refs, "starting metadata pass");
|
||||||
|
let mut walker = source
|
||||||
|
.discover(&ctx)
|
||||||
|
.await
|
||||||
|
.context("discover failed")?;
|
||||||
|
|
||||||
|
let mut stats = MetadataStats::default();
|
||||||
|
// Run-scoped dedup of `source_manga_key`s already processed this pass.
|
||||||
|
// A shift in the source index causes the slot-last item of the page
|
||||||
|
// we just read to reappear at slot 0 of the next page; skipping it
|
||||||
|
// here prevents redundant fetch_manga + upsert and avoids spuriously
|
||||||
|
// tripping the stop condition with a re-confirm of an entry we
|
||||||
|
// already counted.
|
||||||
|
let mut seen: HashSet<String> = HashSet::new();
|
||||||
|
let mut walked_to_completion = false;
|
||||||
|
let mut hit_limit = false;
|
||||||
|
let mut hit_stop_condition = false;
|
||||||
|
// Circuit-breaker state: consecutive fetch_manga failures. A sustained
|
||||||
|
// run abort (source outage) leaves the pass un-clean → recovery sweep
|
||||||
|
// next tick.
|
||||||
|
let mut consecutive_failures = 0u32;
|
||||||
|
let mut hit_failure_breaker = false;
|
||||||
|
|
||||||
|
'outer: loop {
|
||||||
|
let batch = match walker.next_batch(&ctx).await? {
|
||||||
|
Some(b) => b,
|
||||||
|
None => {
|
||||||
|
walked_to_completion = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
for r in batch {
|
||||||
|
// Cooperative checkpoint: if a coordinated browser restart is
|
||||||
|
// pending, yield our (long-lived) lease so the drain can
|
||||||
|
// proceed instead of stalling for the rest of the walk. The
|
||||||
|
// pass exits un-clean, so the next tick recovery-sweeps the
|
||||||
|
// tail we didn't reach.
|
||||||
|
if browser_manager.is_restart_pending() {
|
||||||
|
tracing::info!(
|
||||||
|
"metadata pass: browser restart pending — yielding (recovery sweep next tick)"
|
||||||
|
);
|
||||||
|
break 'outer;
|
||||||
|
}
|
||||||
|
if max_refs.map(|m| stats.discovered >= m).unwrap_or(false) {
|
||||||
|
hit_limit = true;
|
||||||
|
tracing::info!(cap = ?max_refs, "max_results reached; halting walk");
|
||||||
|
break 'outer;
|
||||||
|
}
|
||||||
|
// Skip refs we've already *successfully* processed this pass.
|
||||||
|
// Checking `contains` here (rather than `insert`) keeps the key
|
||||||
|
// out of `seen` on failure paths below, so a transient fetch or
|
||||||
|
// upsert error gets a second chance if the ref reappears in
|
||||||
|
// another batch. Done *before* counting toward
|
||||||
|
// `stats.discovered` (the skipped ref did no work) and *before*
|
||||||
|
// touching the stop check (a `continue` here doesn't let a
|
||||||
|
// re-confirm trip the stop condition). The matching
|
||||||
|
// `seen.insert(...)` lives just after the successful upsert
|
||||||
|
// below.
|
||||||
|
if seen.contains(&r.source_manga_key) {
|
||||||
|
tracing::debug!(
|
||||||
|
key = %r.source_manga_key,
|
||||||
|
"skip already-seen key in this run"
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
stats.discovered += 1;
|
||||||
|
if let Some(s) = status {
|
||||||
|
s.set_phase(crate::crawler::status::Phase::FetchingMetadata {
|
||||||
|
index: stats.discovered,
|
||||||
|
total: max_refs,
|
||||||
|
title: r.title.clone(),
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
tracing::info!(
|
||||||
|
idx = stats.discovered,
|
||||||
|
key = %r.source_manga_key,
|
||||||
|
"fetching metadata"
|
||||||
|
);
|
||||||
|
let manga = match source.fetch_manga(&ctx, &r).await {
|
||||||
|
Ok(m) => {
|
||||||
|
consecutive_failures = 0;
|
||||||
|
m
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
key = %r.source_manga_key,
|
||||||
|
url = %r.url,
|
||||||
|
error = ?e,
|
||||||
|
"fetch_manga failed"
|
||||||
|
);
|
||||||
|
stats.mangas_failed += 1;
|
||||||
|
consecutive_failures += 1;
|
||||||
|
if should_abort_pass(consecutive_failures, max_consecutive_failures) {
|
||||||
|
hit_failure_breaker = true;
|
||||||
|
tracing::error!(
|
||||||
|
consecutive_failures,
|
||||||
|
threshold = max_consecutive_failures,
|
||||||
|
"metadata pass: too many consecutive fetch_manga failures; \
|
||||||
|
aborting (recovery sweep on next tick)"
|
||||||
|
);
|
||||||
|
break 'outer;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Partial-render guard: an empty chapter list paired with a
|
||||||
|
// prior count > 0 is overwhelmingly a chromium snapshot
|
||||||
|
// taken between the #chapter_table wrapper render and its
|
||||||
|
// rows render. The wait_for_selector wait in `navigate`
|
||||||
|
// narrows this window but cannot close it for slow renders
|
||||||
|
// beyond the selector budget. Treat as a transient failure
|
||||||
|
// here — skip upsert, skip seen.insert — so the next batch
|
||||||
|
// (or the next tick) retries. Skipped in `skip_chapters`
|
||||||
|
// mode because the parser is configured to return an empty
|
||||||
|
// Vec by design there.
|
||||||
|
if !skip_chapters && manga.chapters.is_empty() {
|
||||||
|
match repo::crawler::live_chapter_count_for_source_manga(
|
||||||
|
db, source_id, &r.source_manga_key,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(prior) if prior > 0 => {
|
||||||
|
tracing::warn!(
|
||||||
|
key = %r.source_manga_key,
|
||||||
|
url = %r.url,
|
||||||
|
prior_chapter_count = prior,
|
||||||
|
"fetch_manga returned empty chapters but prior count > 0; treating as partial-render transient and skipping"
|
||||||
|
);
|
||||||
|
stats.mangas_failed += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Ok(_) => {}
|
||||||
|
Err(e) => {
|
||||||
|
// DB lookup failed — fail safe: skip rather
|
||||||
|
// than risk a soft-drop on a manga whose prior
|
||||||
|
// count we couldn't confirm.
|
||||||
|
tracing::warn!(
|
||||||
|
key = %r.source_manga_key,
|
||||||
|
error = ?e,
|
||||||
|
"live_chapter_count_for_source_manga failed; skipping cautiously"
|
||||||
|
);
|
||||||
|
stats.mangas_failed += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let upsert = match repo::crawler::upsert_manga_from_source(
|
||||||
|
db, source_id, &r.url, &manga,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(u) => u,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::error!(
|
||||||
|
key = %r.source_manga_key,
|
||||||
|
error = ?e,
|
||||||
|
"upsert_manga_from_source failed"
|
||||||
|
);
|
||||||
|
stats.mangas_failed += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
stats.upserted += 1;
|
||||||
|
// Record success in the dedup set. Cover and chapter-sync
|
||||||
|
// failures below are non-fatal and don't roll this back —
|
||||||
|
// metadata is the durable source of truth for the dedup.
|
||||||
|
seen.insert(r.source_manga_key.clone());
|
||||||
|
tracing::info!(
|
||||||
|
key = %manga.source_manga_key,
|
||||||
|
manga_id = %upsert.manga_id,
|
||||||
|
status = ?upsert.status,
|
||||||
|
title = %manga.title,
|
||||||
|
"manga upserted"
|
||||||
|
);
|
||||||
|
|
||||||
|
// Cover image: download when missing in storage or when metadata
|
||||||
|
// signaled an update (cover URL is part of metadata_hash, so
|
||||||
|
// Updated implies the URL may have moved). Failures are non-fatal.
|
||||||
|
let needs_cover = upsert.cover_image_path.is_none()
|
||||||
|
|| matches!(upsert.status, repo::crawler::UpsertStatus::Updated);
|
||||||
|
if needs_cover {
|
||||||
|
if let Some(cover_url) = manga.cover_url.as_deref() {
|
||||||
|
if let Some(s) = status {
|
||||||
|
s.set_current_cover(Some(crate::crawler::status::CoverTarget {
|
||||||
|
manga_id: upsert.manga_id,
|
||||||
|
manga_title: manga.title.clone(),
|
||||||
|
}))
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
let cover_result = download_and_store_cover(
|
||||||
|
db,
|
||||||
|
storage,
|
||||||
|
http,
|
||||||
|
rate,
|
||||||
|
&r.url,
|
||||||
|
upsert.manga_id,
|
||||||
|
cover_url,
|
||||||
|
allowlist,
|
||||||
|
max_image_bytes,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
if let Some(s) = status {
|
||||||
|
s.set_current_cover(None).await;
|
||||||
|
}
|
||||||
|
match cover_result {
|
||||||
|
Ok(()) => stats.covers_fetched += 1,
|
||||||
|
Err(e) => tracing::warn!(
|
||||||
|
manga_id = %upsert.manga_id,
|
||||||
|
error = ?e,
|
||||||
|
"cover download failed"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Chapter sync. `chapters_new` feeds the stop check below:
|
||||||
|
// `None` (skip_chapters mode, or a logged-and-swallowed sync
|
||||||
|
// error) refuses to stop on this manga because we can't
|
||||||
|
// confirm "no new chapters."
|
||||||
|
let chapters_new: Option<usize> = if skip_chapters {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
match repo::crawler::sync_manga_chapters(
|
||||||
|
db,
|
||||||
|
source_id,
|
||||||
|
upsert.manga_id,
|
||||||
|
&manga.chapters,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(diff) => {
|
||||||
|
tracing::info!(
|
||||||
|
manga_id = %upsert.manga_id,
|
||||||
|
new = diff.new,
|
||||||
|
refreshed = diff.refreshed,
|
||||||
|
dropped = diff.dropped,
|
||||||
|
"chapters synced"
|
||||||
|
);
|
||||||
|
Some(diff.new)
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
manga_id = %upsert.manga_id,
|
||||||
|
error = ?e,
|
||||||
|
"chapter sync failed"
|
||||||
|
);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if should_stop(was_clean, upsert.status, chapters_new) {
|
||||||
|
hit_stop_condition = true;
|
||||||
|
tracing::info!(
|
||||||
|
key = %manga.source_manga_key,
|
||||||
|
"stop condition met (Unchanged metadata + 0 new chapters); halting walk"
|
||||||
|
);
|
||||||
|
break 'outer;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Recovery-flag write. Only on a clean exit (end-of-walk OR the
|
||||||
|
// intentional stop). `hit_limit` is a caller-imposed early break
|
||||||
|
// and does NOT count — the catalog tail wasn't reached, so a future
|
||||||
|
// tick still needs to walk past where we stopped. The truth table is
|
||||||
|
// pinned by `should_mark_clean_exit` so a future edit that adds
|
||||||
|
// `hit_limit` back into the disjunction trips its unit test. Flag-
|
||||||
|
// write errors are warned and swallowed: the run already did its
|
||||||
|
// work, and a stale `false` flag just buys a recovery sweep on the
|
||||||
|
// next tick.
|
||||||
|
let exited_cleanly = should_mark_clean_exit(walked_to_completion, hit_stop_condition);
|
||||||
|
if exited_cleanly {
|
||||||
|
if let Err(e) = repo::crawler::mark_run_completed(db, source_id).await {
|
||||||
|
tracing::warn!(error = ?e, "mark_run_completed failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
was_clean,
|
||||||
|
discovered = stats.discovered,
|
||||||
|
upserted = stats.upserted,
|
||||||
|
covers_fetched = stats.covers_fetched,
|
||||||
|
mangas_failed = stats.mangas_failed,
|
||||||
|
walked_to_completion,
|
||||||
|
hit_limit,
|
||||||
|
hit_stop_condition,
|
||||||
|
hit_failure_breaker,
|
||||||
|
exited_cleanly,
|
||||||
|
"metadata pass complete"
|
||||||
|
);
|
||||||
|
|
||||||
|
drop(lease);
|
||||||
|
Ok(stats)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Quarantine window for chapters whose latest `SyncChapterContent` job is
|
||||||
|
/// `dead`. The partial dedup index `crawler_jobs_chapter_content_dedup_idx`
|
||||||
|
/// only blocks `(pending|running)` duplicates, so without this gate a
|
||||||
|
/// permanently-failing chapter is re-enqueued every cron tick, burns
|
||||||
|
/// `max_attempts` retries, dies again, and spins forever. With the gate,
|
||||||
|
/// dead chapters get a week of silence before the next attempt — long
|
||||||
|
/// enough for a transient site issue to resolve, short enough that
|
||||||
|
/// permanent failures don't stay permanent if conditions change.
|
||||||
|
///
/// The value is bound as a `bigint` day count into the `NOT EXISTS`
/// filter of the enqueue queries, where the window is measured back from
/// `now()` against the dead job's `updated_at`.
const CHAPTER_DEAD_QUARANTINE_DAYS: i64 = 7;
|
||||||
|
|
||||||
|
/// Enqueue a `SyncChapterContent` job for every chapter of *any* bookmarked
|
||||||
|
/// manga that still has `page_count = 0` and a non-dropped source row.
|
||||||
|
/// Chapters whose latest job is `dead` within `CHAPTER_DEAD_QUARANTINE_DAYS`
|
||||||
|
/// are excluded to break the dead-letter spin.
|
||||||
|
/// Returns `(inserted, skipped)` counts. Dedup index handles repeats.
|
||||||
|
pub async fn enqueue_bookmarked_pending(pool: &PgPool) -> anyhow::Result<EnqueueSummary> {
|
||||||
|
let rows: Vec<(String, Uuid, String)> = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
SELECT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
|
||||||
|
FROM chapters c
|
||||||
|
JOIN bookmarks b ON b.manga_id = c.manga_id
|
||||||
|
JOIN chapter_sources cs ON cs.chapter_id = c.id
|
||||||
|
WHERE c.page_count = 0
|
||||||
|
AND cs.dropped_at IS NULL
|
||||||
|
AND NOT EXISTS (
|
||||||
|
SELECT 1 FROM crawler_jobs cj
|
||||||
|
WHERE cj.payload->>'kind' = 'sync_chapter_content'
|
||||||
|
AND cj.payload->>'chapter_id' = c.id::text
|
||||||
|
AND cj.state = 'dead'
|
||||||
|
AND cj.updated_at > now() - ($1::bigint || ' days')::interval
|
||||||
|
)
|
||||||
|
GROUP BY cs.source_id, c.id, cs.source_chapter_key, c.manga_id, c.number, c.created_at
|
||||||
|
ORDER BY c.manga_id, c.number ASC, c.created_at ASC
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(CHAPTER_DEAD_QUARANTINE_DAYS)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await
|
||||||
|
.context("query bookmarked-pending chapters")?;
|
||||||
|
|
||||||
|
let mut summary = EnqueueSummary::default();
|
||||||
|
for (source_id, chapter_id, source_chapter_key) in rows {
|
||||||
|
let payload = JobPayload::SyncChapterContent {
|
||||||
|
source_id,
|
||||||
|
chapter_id,
|
||||||
|
source_chapter_key,
|
||||||
|
};
|
||||||
|
match jobs::enqueue(pool, &payload).await {
|
||||||
|
Ok(EnqueueResult::Inserted(_)) => summary.inserted += 1,
|
||||||
|
Ok(EnqueueResult::Skipped) => summary.skipped += 1,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
%chapter_id,
|
||||||
|
error = ?e,
|
||||||
|
"enqueue chapter content failed"
|
||||||
|
);
|
||||||
|
summary.failed += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(summary)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enqueue chapter-content jobs for a *single* manga (the bookmark-create
|
||||||
|
/// hook). Same dedup semantics as [`enqueue_bookmarked_pending`], including
|
||||||
|
/// the dead-letter quarantine — a freshly bookmarked manga should not
|
||||||
|
/// burn retries on chapters that just died on the cron tick.
|
||||||
|
pub async fn enqueue_pending_for_manga(
|
||||||
|
pool: &PgPool,
|
||||||
|
manga_id: Uuid,
|
||||||
|
) -> anyhow::Result<EnqueueSummary> {
|
||||||
|
let rows: Vec<(String, Uuid, String)> = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
SELECT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
|
||||||
|
FROM chapters c
|
||||||
|
JOIN chapter_sources cs ON cs.chapter_id = c.id
|
||||||
|
WHERE c.manga_id = $1
|
||||||
|
AND c.page_count = 0
|
||||||
|
AND cs.dropped_at IS NULL
|
||||||
|
AND NOT EXISTS (
|
||||||
|
SELECT 1 FROM crawler_jobs cj
|
||||||
|
WHERE cj.payload->>'kind' = 'sync_chapter_content'
|
||||||
|
AND cj.payload->>'chapter_id' = c.id::text
|
||||||
|
AND cj.state = 'dead'
|
||||||
|
AND cj.updated_at > now() - ($2::bigint || ' days')::interval
|
||||||
|
)
|
||||||
|
GROUP BY cs.source_id, c.id, cs.source_chapter_key, c.number, c.created_at
|
||||||
|
ORDER BY c.number ASC, c.created_at ASC, cs.source_id
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(manga_id)
|
||||||
|
.bind(CHAPTER_DEAD_QUARANTINE_DAYS)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await
|
||||||
|
.context("query pending chapters for manga")?;
|
||||||
|
|
||||||
|
let mut summary = EnqueueSummary::default();
|
||||||
|
for (source_id, chapter_id, source_chapter_key) in rows {
|
||||||
|
let payload = JobPayload::SyncChapterContent {
|
||||||
|
source_id,
|
||||||
|
chapter_id,
|
||||||
|
source_chapter_key,
|
||||||
|
};
|
||||||
|
match jobs::enqueue(pool, &payload).await {
|
||||||
|
Ok(EnqueueResult::Inserted(_)) => summary.inserted += 1,
|
||||||
|
Ok(EnqueueResult::Skipped) => summary.skipped += 1,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
%chapter_id,
|
||||||
|
error = ?e,
|
||||||
|
"enqueue chapter content failed"
|
||||||
|
);
|
||||||
|
summary.failed += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(summary)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tally of what happened to each candidate row during an enqueue pass.
///
/// `PartialEq`/`Eq` are derived so callers and tests can compare whole
/// summaries with `assert_eq!`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct EnqueueSummary {
    /// Rows for which the enqueue call reported `EnqueueResult::Inserted`.
    pub inserted: usize,
    /// Rows for which the enqueue call reported `EnqueueResult::Skipped`.
    pub skipped: usize,
    /// Rows whose enqueue call returned an error (logged, not propagated).
    pub failed: usize,
}
|
||||||
|
|
||||||
|
/// Counters reported by one cover-backfill pass.
///
/// `PartialEq`/`Eq` are derived so callers and tests can compare whole
/// stats values with `assert_eq!`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct CoverBackfillStats {
    /// Missing-cover entries the pass looked at.
    pub considered: usize,
    /// Entries whose cover was downloaded and stored successfully.
    pub fetched: usize,
    /// Entries that failed at any step (detail fetch, missing cover URL,
    /// or the download itself).
    pub failed: usize,
}
|
||||||
|
|
||||||
|
/// How many mangas [`backfill_missing_covers`] handles per tick by default.
///
/// Covers are normally retried by the metadata pass whenever its walk
/// reaches the affected manga. The backfill only mops up what is left when
/// the early-stop optimisation ends the walk before it gets to a manga
/// whose cover failed on the first attempt. That backlog grows from
/// sporadic download failures rather than systematic misses, so a small
/// cap is sufficient.
pub const COVER_BACKFILL_DEFAULT_MAX: usize = 10;
|
||||||
|
|
||||||
|
/// Re-attempt cover downloads for mangas where `cover_image_path IS NULL`
|
||||||
|
/// but a live `manga_sources` row exists. Refetches the source detail
|
||||||
|
/// page (which is where the cover URL lives) and downloads the cover.
|
||||||
|
///
|
||||||
|
/// Bounded by `max_mangas` per call so a steady stream of failing covers
|
||||||
|
/// — e.g. a CDN host that's persistently 502 — can't monopolise a cron
|
||||||
|
/// tick. Orders by `manga_sources.last_seen_at DESC` so the freshest
|
||||||
|
/// missing-cover mangas are addressed first.
|
||||||
|
///
|
||||||
|
/// Failures are logged and counted, not raised: a single bad cover URL
|
||||||
|
/// must not stall every other backfill behind it.
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
pub async fn backfill_missing_covers(
|
||||||
|
browser_manager: &BrowserManager,
|
||||||
|
db: &PgPool,
|
||||||
|
storage: &dyn Storage,
|
||||||
|
http: &reqwest::Client,
|
||||||
|
rate: &HostRateLimiters,
|
||||||
|
max_mangas: usize,
|
||||||
|
allowlist: &DownloadAllowlist,
|
||||||
|
max_image_bytes: usize,
|
||||||
|
status: Option<&crate::crawler::status::StatusHandle>,
|
||||||
|
tor: Option<&crate::crawler::tor::TorController>,
|
||||||
|
) -> anyhow::Result<CoverBackfillStats> {
|
||||||
|
let mut stats = CoverBackfillStats::default();
|
||||||
|
if max_mangas == 0 {
|
||||||
|
return Ok(stats);
|
||||||
|
}
|
||||||
|
|
||||||
|
let entries = repo::crawler::list_missing_covers(db, max_mangas as i64)
|
||||||
|
.await
|
||||||
|
.context("list_missing_covers")?;
|
||||||
|
|
||||||
|
if entries.is_empty() {
|
||||||
|
return Ok(stats);
|
||||||
|
}
|
||||||
|
|
||||||
|
let lease = browser_manager
|
||||||
|
.acquire()
|
||||||
|
.await
|
||||||
|
.context("acquire browser lease for cover backfill")?;
|
||||||
|
let browser_ref: &chromiumoxide::Browser = &lease;
|
||||||
|
let ctx = FetchContext { browser: browser_ref, rate, tor };
|
||||||
|
|
||||||
|
let total = entries.len();
|
||||||
|
for (index, entry) in entries.into_iter().enumerate() {
|
||||||
|
stats.considered += 1;
|
||||||
|
if let Some(s) = status {
|
||||||
|
s.set_phase(crate::crawler::status::Phase::CoverBackfill { index, total })
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
// Metadata-only TargetSource: skip chapter-list parsing so a
|
||||||
|
// missing-cover refetch doesn't soft-drop chapters on a partial
|
||||||
|
// render. Cover URL alone is what we need.
|
||||||
|
let source = TargetSource::new(entry.source_url.clone()).without_chapter_parsing();
|
||||||
|
let r = SourceMangaRef {
|
||||||
|
source_manga_key: entry.source_manga_key.clone(),
|
||||||
|
title: String::new(),
|
||||||
|
url: entry.source_url.clone(),
|
||||||
|
};
|
||||||
|
let manga = match source.fetch_manga(&ctx, &r).await {
|
||||||
|
Ok(manga) => manga,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
manga_id = %entry.manga_id,
|
||||||
|
url = %entry.source_url,
|
||||||
|
error = ?e,
|
||||||
|
"cover backfill: fetch_manga failed"
|
||||||
|
);
|
||||||
|
stats.failed += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let Some(cover_url) = manga.cover_url.clone() else {
|
||||||
|
tracing::warn!(
|
||||||
|
manga_id = %entry.manga_id,
|
||||||
|
url = %entry.source_url,
|
||||||
|
"cover backfill: source returned no cover_url"
|
||||||
|
);
|
||||||
|
stats.failed += 1;
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
if let Some(s) = status {
|
||||||
|
s.set_current_cover(Some(crate::crawler::status::CoverTarget {
|
||||||
|
manga_id: entry.manga_id,
|
||||||
|
manga_title: manga.title.clone(),
|
||||||
|
}))
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
let cover_result = download_and_store_cover(
|
||||||
|
db,
|
||||||
|
storage,
|
||||||
|
http,
|
||||||
|
rate,
|
||||||
|
&entry.source_url,
|
||||||
|
entry.manga_id,
|
||||||
|
&cover_url,
|
||||||
|
allowlist,
|
||||||
|
max_image_bytes,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
if let Some(s) = status {
|
||||||
|
s.set_current_cover(None).await;
|
||||||
|
}
|
||||||
|
match cover_result {
|
||||||
|
Ok(()) => stats.fetched += 1,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
manga_id = %entry.manga_id,
|
||||||
|
url = %entry.source_url,
|
||||||
|
error = ?e,
|
||||||
|
"cover backfill: download failed"
|
||||||
|
);
|
||||||
|
stats.failed += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
drop(lease);
|
||||||
|
Ok(stats)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Download a cover image and persist its storage path. Local to the
|
||||||
|
/// pipeline because the CLI still calls it from its inline chapter-content
|
||||||
|
/// loop; once the worker pool fully replaces that path we can fold this
|
||||||
|
/// into `pipeline` proper.
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
pub(crate) async fn download_and_store_cover(
|
||||||
|
db: &PgPool,
|
||||||
|
storage: &dyn Storage,
|
||||||
|
http: &reqwest::Client,
|
||||||
|
rate: &HostRateLimiters,
|
||||||
|
manga_url: &str,
|
||||||
|
manga_id: Uuid,
|
||||||
|
cover_url: &str,
|
||||||
|
allowlist: &DownloadAllowlist,
|
||||||
|
max_image_bytes: usize,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let absolute = reqwest::Url::parse(manga_url)
|
||||||
|
.context("parse manga URL")?
|
||||||
|
.join(cover_url)
|
||||||
|
.context("join cover URL onto manga URL")?;
|
||||||
|
|
||||||
|
rate.wait_for(absolute.as_str()).await?;
|
||||||
|
let bytes = fetch_bytes_capped(
|
||||||
|
http,
|
||||||
|
absolute.as_str(),
|
||||||
|
Some(manga_url),
|
||||||
|
allowlist,
|
||||||
|
max_image_bytes,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
if !looks_like_image(&bytes) {
|
||||||
|
anyhow::bail!(
|
||||||
|
"cover URL {absolute} returned non-image bytes; refusing to store as binary blob"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
let ext = infer::get(&bytes)
|
||||||
|
.map(|k| k.extension())
|
||||||
|
.expect("looks_like_image asserted infer succeeded");
|
||||||
|
let key = format!("mangas/{manga_id}/cover.{ext}");
|
||||||
|
|
||||||
|
storage
|
||||||
|
.put(&key, &bytes)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("store cover at {key}"))?;
|
||||||
|
repo::manga::set_cover_image_path(db, manga_id, &key)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("update cover_image_path for {manga_id}"))?;
|
||||||
|
tracing::info!(
|
||||||
|
manga_id = %manga_id,
|
||||||
|
key = %key,
|
||||||
|
bytes = bytes.len(),
|
||||||
|
%absolute,
|
||||||
|
"cover stored"
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
use crate::crawler::url_utils::origin_of;
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn stop_condition_fires_on_unchanged_metadata_and_zero_new_chapters() {
        // Steady state: the previous run was clean, the metadata hash
        // matches, and the chapter list gained nothing. On a newest-first
        // index that proves the walk reached the caught-up tail.
        let fired = should_stop(true, UpsertStatus::Unchanged, Some(0));
        assert!(fired);
    }

    #[test]
    fn stop_condition_refuses_when_chapters_added() {
        // Unchanged metadata with new chapters means the source bumped the
        // manga because of those chapters; the rest of the index is still
        // ahead, so the walk keeps going.
        for added in [1, 42] {
            assert!(!should_stop(true, UpsertStatus::Unchanged, Some(added)));
        }
    }

    #[test]
    fn stop_condition_refuses_when_metadata_changed() {
        // Updated or New metadata never stops the walk, even with zero new
        // chapters: the metadata bump is itself what the walk follows.
        for status in [UpsertStatus::Updated, UpsertStatus::New] {
            assert!(!should_stop(true, status, Some(0)));
        }
    }

    #[test]
    fn stop_condition_refuses_when_chapter_count_unknown() {
        // `None` covers skip_chapters mode (the CLI metadata-only sweep) and
        // a chapter sync error that was logged and swallowed. Absence of
        // evidence is not "no new chapters", so the walk continues — which
        // is what a metadata-only operator wants anyway.
        let fired = should_stop(true, UpsertStatus::Unchanged, None);
        assert!(!fired);
    }

    #[test]
    fn stop_condition_disabled_in_recovery_mode() {
        // was_clean = false: the previous run did not exit cleanly, so the
        // catalog past its crash point may be un-synced. Walk to the end of
        // the source regardless of what individual mangas report.
        let cases = [
            (UpsertStatus::Unchanged, Some(0)),
            (UpsertStatus::Unchanged, Some(1)),
            (UpsertStatus::Updated, Some(0)),
            (UpsertStatus::New, None),
        ];
        for (status, new_chapters) in cases {
            assert!(!should_stop(false, status, new_chapters));
        }
    }

    #[test]
    fn abort_pass_fires_at_threshold_and_respects_disable() {
        // A threshold of 0 disables the rule entirely; below the threshold
        // the pass keeps going.
        for (failures, threshold) in [(0, 0), (100, 0), (9, 10)] {
            assert!(!should_abort_pass(failures, threshold));
        }
        // At or above the threshold the pass aborts.
        for (failures, threshold) in [(10, 10), (11, 10)] {
            assert!(should_abort_pass(failures, threshold));
        }
    }

    #[test]
    fn clean_exit_when_walked_to_completion() {
        // The walk reached the catalog tail, so the recovery flag may
        // safely flip back to `true`.
        let clean = should_mark_clean_exit(true, false);
        assert!(clean);
    }

    #[test]
    fn clean_exit_when_stop_condition_fired() {
        // The first Unchanged + 0-new-chapter manga is a complete steady-
        // state exit: everything newer was synced, and with source-side
        // `update_date DESC` ordering everything older is at least as
        // caught-up.
        let clean = should_mark_clean_exit(false, true);
        assert!(clean);
    }

    #[test]
    fn dirty_exit_when_neither_completion_nor_stop_fired() {
        // The walk ended for some other reason — notably the
        // caller-imposed `hit_limit` cap, the regression this test guards.
        // `should_mark_clean_exit` has no `hit_limit` parameter, so a
        // future edit adding `|| hit_limit` to the inline expression in
        // `run_metadata_pass` would also have to touch this helper and
        // would then trip this assertion.
        let clean = should_mark_clean_exit(false, false);
        assert!(!clean);
    }

    #[test]
    fn run_scoped_seen_set_skips_duplicate_source_manga_keys() {
        // Pins the per-ref loop contract: `contains` decides whether work
        // runs, and `insert` happens only on the success path (after the
        // upsert). A ref that failed and shows up again in the same pass
        // must be retried, which is why the loop is contains-then-insert
        // rather than insert-and-skip-on-collision.
        let first_key = "manga-a";
        let second_key = "manga-b";
        let mut seen: HashSet<String> = HashSet::new();

        // First sighting: nothing recorded yet, so the loop proceeds.
        assert!(!seen.contains(first_key), "first sighting is unseen");
        // A failed fetch_manga records nothing, so the key stays retryable.
        assert!(!seen.contains(first_key), "failed key is still retryable");

        // A successful upsert records the key; later sightings are skipped.
        seen.insert(first_key.to_owned());
        assert!(seen.contains(first_key), "successful key is now seen");

        // Keys are independent of one another.
        assert!(!seen.contains(second_key), "different key independent");
        seen.insert(second_key.to_owned());
        assert!(seen.contains(second_key));
        assert!(seen.contains(first_key), "first key still recorded");
    }
}
|
||||||
178
backend/src/crawler/rate_limit.rs
Normal file
178
backend/src/crawler/rate_limit.rs
Normal file
@@ -0,0 +1,178 @@
|
|||||||
|
//! Per-host request pacing.
|
||||||
|
//!
|
||||||
|
//! `RateLimiter` is a single-token bucket: each `wait().await` returns
|
||||||
|
//! immediately when at least `interval` has elapsed since the last call,
|
||||||
|
//! otherwise sleeps just enough to satisfy it. Uses
|
||||||
|
//! `tokio::time::Instant` so tests can run under `start_paused` virtual
|
||||||
|
//! time without sleeping for real.
|
||||||
|
//!
|
||||||
|
//! `HostRateLimiters` is the multi-host wrapper actually used by the
|
||||||
|
//! crawler — concurrent workers issuing requests to different origins
|
||||||
|
//! (catalog vs. CDN) don't contend on a shared budget; each host gets
|
||||||
|
//! its own bucket. `wait_for(url)` extracts the host, lazily creates a
|
||||||
|
//! limiter for it, and serializes only against other callers hitting
|
||||||
|
//! the same host.
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
use tokio::sync::Mutex;
|
||||||
|
use tokio::time::Instant;
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct RateLimiter {
|
||||||
|
interval: Duration,
|
||||||
|
last: Option<Instant>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RateLimiter {
|
||||||
|
pub fn new(interval: Duration) -> Self {
|
||||||
|
Self {
|
||||||
|
interval,
|
||||||
|
last: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn wait(&mut self) {
|
||||||
|
if let Some(last) = self.last {
|
||||||
|
let elapsed = last.elapsed();
|
||||||
|
if elapsed < self.interval {
|
||||||
|
tokio::time::sleep(self.interval - elapsed).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.last = Some(Instant::now());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Per-host rate limiter map. The outer `Mutex<HashMap>` is held only
|
||||||
|
/// during the entry-or-insert + Arc clone; the per-host `Mutex<RateLimiter>`
|
||||||
|
/// is held during the actual `wait().await`. So N workers calling
|
||||||
|
/// `wait_for(url)` on N different hosts contend nowhere except the brief
|
||||||
|
/// HashMap lookup; workers hitting the same host serialize on that
|
||||||
|
/// host's bucket.
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct HostRateLimiters {
|
||||||
|
default_interval: Duration,
|
||||||
|
overrides: HashMap<String, Duration>,
|
||||||
|
map: Mutex<HashMap<String, Arc<Mutex<RateLimiter>>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HostRateLimiters {
|
||||||
|
pub fn new(default_interval: Duration) -> Self {
|
||||||
|
Self {
|
||||||
|
default_interval,
|
||||||
|
overrides: HashMap::new(),
|
||||||
|
map: Mutex::new(HashMap::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set a per-host interval that overrides `default_interval`. Calls
|
||||||
|
/// after a host's limiter has been instantiated do *not* re-create
|
||||||
|
/// it — set all overrides before the first `wait_for` to that host.
|
||||||
|
pub fn with_override(mut self, host: impl Into<String>, interval: Duration) -> Self {
|
||||||
|
self.overrides.insert(host.into(), interval);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Block until the per-host budget allows the next request to
|
||||||
|
/// `url`'s host. Returns an error only when the URL has no host
|
||||||
|
/// (malformed input).
|
||||||
|
pub async fn wait_for(&self, url: &str) -> anyhow::Result<()> {
|
||||||
|
let host = host_of(url)
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("no host in url: {url}"))?;
|
||||||
|
let limiter = {
|
||||||
|
let mut map = self.map.lock().await;
|
||||||
|
map.entry(host.clone())
|
||||||
|
.or_insert_with(|| {
|
||||||
|
let interval = self
|
||||||
|
.overrides
|
||||||
|
.get(&host)
|
||||||
|
.copied()
|
||||||
|
.unwrap_or(self.default_interval);
|
||||||
|
Arc::new(Mutex::new(RateLimiter::new(interval)))
|
||||||
|
})
|
||||||
|
.clone()
|
||||||
|
};
|
||||||
|
limiter.lock().await.wait().await;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// `host_of` was duplicated across session/rate_limit/pipeline; the
|
||||||
|
// canonical version now lives in `crawler::url_utils`.
|
||||||
|
use crate::crawler::url_utils::host_of;
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Interval shared by the single-limiter tests.
    const TICK: Duration = Duration::from_millis(100);

    #[tokio::test(start_paused = true)]
    async fn first_call_does_not_sleep() {
        let mut limiter = RateLimiter::new(TICK);
        let started = Instant::now();
        limiter.wait().await;
        assert_eq!(started.elapsed(), Duration::ZERO);
    }

    #[tokio::test(start_paused = true)]
    async fn second_call_sleeps_to_fill_interval() {
        let mut limiter = RateLimiter::new(TICK);
        let started = Instant::now();
        limiter.wait().await;
        limiter.wait().await;
        // The first call was instant, so the second one owed the whole
        // 100ms interval.
        assert_eq!(started.elapsed(), Duration::from_millis(100));
    }

    #[tokio::test(start_paused = true)]
    async fn no_sleep_if_interval_already_elapsed() {
        let mut limiter = RateLimiter::new(TICK);
        limiter.wait().await;
        tokio::time::sleep(Duration::from_millis(250)).await;
        let started = Instant::now();
        limiter.wait().await;
        // 250ms have already gone by, so nothing is owed.
        assert_eq!(started.elapsed(), Duration::ZERO);
    }

    #[test]
    fn host_of_parses_scheme_path_and_port() {
        let expectations = [
            ("https://Example.com/path", Some("example.com")),
            ("http://cdn.foo.bar/img.jpg", Some("cdn.foo.bar")),
            ("http://localhost:8080/x", Some("localhost")),
            ("not a url", None),
        ];
        for (url, expected) in expectations {
            assert_eq!(host_of(url).as_deref(), expected);
        }
    }

    #[tokio::test(start_paused = true)]
    async fn host_rate_limiters_pace_per_host() {
        // Both hosts are paced at 100ms. Back-to-back calls to DIFFERENT
        // hosts fire immediately; a repeat call to the SAME host has to
        // wait out a full interval.
        let limiters = HostRateLimiters::new(Duration::from_millis(100));

        let started = Instant::now();
        limiters.wait_for("https://a.example/x").await.unwrap();
        limiters.wait_for("https://b.example/y").await.unwrap();
        assert_eq!(started.elapsed(), Duration::ZERO, "different hosts don't contend");

        let before_repeat = Instant::now();
        limiters.wait_for("https://a.example/x").await.unwrap();
        assert_eq!(
            before_repeat.elapsed(),
            Duration::from_millis(100),
            "second call to same host waits a full interval"
        );
    }

    #[tokio::test(start_paused = true)]
    async fn host_rate_limiters_honor_overrides() {
        let limiters = HostRateLimiters::new(Duration::from_millis(1000))
            .with_override("fast.example", Duration::from_millis(100));

        limiters.wait_for("https://fast.example/a").await.unwrap();
        let started = Instant::now();
        limiters.wait_for("https://fast.example/b").await.unwrap();
        assert_eq!(started.elapsed(), Duration::from_millis(100));
    }
}
|
||||||
279
backend/src/crawler/resync.rs
Normal file
279
backend/src/crawler/resync.rs
Normal file
@@ -0,0 +1,279 @@
|
|||||||
|
//! Admin-triggered resync of a single manga's metadata + cover, or a
|
||||||
|
//! single chapter's content.
|
||||||
|
//!
|
||||||
|
//! The cron tick already retries covers and chapter content on its own
|
||||||
|
//! schedule. This module exists for the operator-controlled path:
|
||||||
|
//! "this manga's metadata is stale / its cover never landed / this
|
||||||
|
//! chapter is broken — pull from source now, not at the next daily
|
||||||
|
//! tick." Wired into the admin API, never into the queue, so the work
|
||||||
|
//! happens synchronously with the HTTP request and the admin sees the
|
||||||
|
//! refreshed row in the response.
|
||||||
|
//!
|
||||||
|
//! Shares the daemon's [`BrowserManager`], rate limiter, HTTP client,
|
||||||
|
//! and TOR controller so a force resync respects the same per-host
|
||||||
|
//! pacing and recircuit budget the daily crawl uses — admin actions
|
||||||
|
//! must not let an operator accidentally hammer the source.
|
||||||
|
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use anyhow::Context;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::crawler::browser_manager::BrowserManager;
|
||||||
|
use crate::crawler::content::{self, SyncOutcome};
|
||||||
|
use crate::crawler::pipeline;
|
||||||
|
use crate::crawler::rate_limit::HostRateLimiters;
|
||||||
|
use crate::crawler::safety::DownloadAllowlist;
|
||||||
|
use crate::crawler::source::target::TargetSource;
|
||||||
|
use crate::crawler::source::{FetchContext, Source, SourceMangaRef};
|
||||||
|
use crate::crawler::tor::TorController;
|
||||||
|
use crate::repo;
|
||||||
|
use crate::repo::crawler::UpsertStatus;
|
||||||
|
use crate::storage::Storage;
|
||||||
|
|
||||||
|
/// Outcome of [`ResyncService::resync_manga`]. Mirrors the bits the
|
||||||
|
/// admin UI cares about — was the row actually re-upserted, did the
|
||||||
|
/// cover land — so the response can show "metadata refreshed, cover
|
||||||
|
/// re-downloaded" or "metadata unchanged" without a second round-trip.
|
||||||
|
#[derive(Debug, Clone, Copy)]
|
||||||
|
pub struct MangaResyncOutcome {
|
||||||
|
pub manga_id: Uuid,
|
||||||
|
pub metadata_status: UpsertStatus,
|
||||||
|
pub cover_fetched: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Outcome of [`ResyncService::resync_chapter`]. `Fetched(pages)` is the
|
||||||
|
/// success case; `Skipped` means the source row was already gone or the
|
||||||
|
/// chapter had no live source.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub enum ChapterResyncOutcome {
|
||||||
|
Fetched { chapter_id: Uuid, pages: usize },
|
||||||
|
Skipped { chapter_id: Uuid, reason: String },
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Service exposed by the daemon to the admin API. Optional on
|
||||||
|
/// [`AppState`] — `None` when the crawler daemon is disabled
|
||||||
|
/// (`CRAWLER_DAEMON=false`), in which case admin handlers return 503.
|
||||||
|
#[async_trait]
|
||||||
|
pub trait ResyncService: Send + Sync {
|
||||||
|
async fn resync_manga(&self, manga_id: Uuid) -> anyhow::Result<MangaResyncOutcome>;
|
||||||
|
async fn resync_chapter(&self, chapter_id: Uuid) -> anyhow::Result<ChapterResyncOutcome>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Errors with a stable shape so the API layer can map them to the
|
||||||
|
/// right HTTP status (404 vs 422 vs 5xx). Anything else surfaces as a
|
||||||
|
/// generic 500.
|
||||||
|
#[derive(Debug, thiserror::Error)]
|
||||||
|
pub enum ResyncError {
|
||||||
|
#[error("manga has no source to resync from")]
|
||||||
|
NoMangaSource,
|
||||||
|
#[error("chapter has no source to resync from")]
|
||||||
|
NoChapterSource,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct RealResyncService {
|
||||||
|
pub browser_manager: Arc<BrowserManager>,
|
||||||
|
pub db: PgPool,
|
||||||
|
pub storage: Arc<dyn Storage>,
|
||||||
|
pub http: reqwest::Client,
|
||||||
|
pub rate: Arc<HostRateLimiters>,
|
||||||
|
pub download_allowlist: DownloadAllowlist,
|
||||||
|
pub max_image_bytes: usize,
|
||||||
|
pub tor: Option<Arc<TorController>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl ResyncService for RealResyncService {
|
||||||
|
async fn resync_manga(&self, manga_id: Uuid) -> anyhow::Result<MangaResyncOutcome> {
|
||||||
|
// Pick the freshest live source row. Multi-source mangas
|
||||||
|
// (theoretical — only one Source impl today) get the row whose
|
||||||
|
// `last_seen_at` is newest; soft-dropped rows are skipped.
|
||||||
|
let row: Option<(String, String, String)> = sqlx::query_as(
|
||||||
|
"SELECT source_id, source_manga_key, source_url \
|
||||||
|
FROM manga_sources \
|
||||||
|
WHERE manga_id = $1 AND dropped_at IS NULL \
|
||||||
|
ORDER BY last_seen_at DESC \
|
||||||
|
LIMIT 1",
|
||||||
|
)
|
||||||
|
.bind(manga_id)
|
||||||
|
.fetch_optional(&self.db)
|
||||||
|
.await
|
||||||
|
.context("look up manga_sources for resync")?;
|
||||||
|
let Some((_source_id, source_manga_key, source_url)) = row else {
|
||||||
|
return Err(ResyncError::NoMangaSource.into());
|
||||||
|
};
|
||||||
|
|
||||||
|
let lease = self
|
||||||
|
.browser_manager
|
||||||
|
.acquire()
|
||||||
|
.await
|
||||||
|
.context("acquire browser lease for manga resync")?;
|
||||||
|
let browser_ref: &chromiumoxide::Browser = &lease;
|
||||||
|
let ctx = FetchContext {
|
||||||
|
browser: browser_ref,
|
||||||
|
rate: &self.rate,
|
||||||
|
tor: self.tor.as_deref(),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Parse chapters too — a force resync is "make this manga fully
|
||||||
|
// current," not just metadata. The full pipeline handles the
|
||||||
|
// partial-render guard for us; we replicate the same caution
|
||||||
|
// here by skipping the chapter sync when the parser returned
|
||||||
|
// empty but the manga previously had chapters.
|
||||||
|
let source = TargetSource::new(source_url.clone());
|
||||||
|
let r = SourceMangaRef {
|
||||||
|
source_manga_key: source_manga_key.clone(),
|
||||||
|
title: String::new(),
|
||||||
|
url: source_url.clone(),
|
||||||
|
};
|
||||||
|
let manga = source
|
||||||
|
.fetch_manga(&ctx, &r)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("fetch_manga during resync of {manga_id}"))?;
|
||||||
|
|
||||||
|
// Partial-render guard: same logic as run_metadata_pass.
|
||||||
|
let source_id = source.id();
|
||||||
|
if !manga.chapters.is_empty() || {
|
||||||
|
let prior = repo::crawler::live_chapter_count_for_source_manga(
|
||||||
|
&self.db,
|
||||||
|
source_id,
|
||||||
|
&source_manga_key,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap_or(0);
|
||||||
|
prior == 0
|
||||||
|
} {
|
||||||
|
// Either the new fetch surfaced chapters, or there were
|
||||||
|
// none before either — chapter sync is safe to run.
|
||||||
|
} else {
|
||||||
|
tracing::warn!(
|
||||||
|
%manga_id,
|
||||||
|
source_url = %source_url,
|
||||||
|
"resync_manga: fetch returned empty chapters but prior count > 0; skipping chapter sync to avoid soft-drop"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
let upsert = repo::crawler::upsert_manga_from_source(
|
||||||
|
&self.db,
|
||||||
|
source_id,
|
||||||
|
&source_url,
|
||||||
|
&manga,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("upsert_manga_from_source during resync of {manga_id}"))?;
|
||||||
|
|
||||||
|
// Cover refetch: force-download regardless of UpsertStatus.
|
||||||
|
// Admin clicked "resync" because they want the cover too.
|
||||||
|
let mut cover_fetched = false;
|
||||||
|
if let Some(cover_url) = manga.cover_url.as_deref() {
|
||||||
|
match pipeline::download_and_store_cover(
|
||||||
|
&self.db,
|
||||||
|
self.storage.as_ref(),
|
||||||
|
&self.http,
|
||||||
|
&self.rate,
|
||||||
|
&source_url,
|
||||||
|
upsert.manga_id,
|
||||||
|
cover_url,
|
||||||
|
&self.download_allowlist,
|
||||||
|
self.max_image_bytes,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(()) => cover_fetched = true,
|
||||||
|
Err(e) => tracing::warn!(
|
||||||
|
%manga_id,
|
||||||
|
error = ?e,
|
||||||
|
"resync_manga: cover download failed"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Chapter sync — only when the partial-render guard above
|
||||||
|
// didn't bail.
|
||||||
|
let prior_chapter_count = repo::crawler::live_chapter_count_for_source_manga(
|
||||||
|
&self.db,
|
||||||
|
source_id,
|
||||||
|
&source_manga_key,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap_or(0);
|
||||||
|
if !manga.chapters.is_empty() || prior_chapter_count == 0 {
|
||||||
|
match repo::crawler::sync_manga_chapters(
|
||||||
|
&self.db,
|
||||||
|
source_id,
|
||||||
|
upsert.manga_id,
|
||||||
|
&manga.chapters,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(diff) => tracing::info!(
|
||||||
|
%manga_id,
|
||||||
|
new = diff.new,
|
||||||
|
refreshed = diff.refreshed,
|
||||||
|
dropped = diff.dropped,
|
||||||
|
"resync_manga: chapters synced"
|
||||||
|
),
|
||||||
|
Err(e) => tracing::warn!(
|
||||||
|
%manga_id,
|
||||||
|
error = ?e,
|
||||||
|
"resync_manga: chapter sync failed"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
drop(lease);
|
||||||
|
Ok(MangaResyncOutcome {
|
||||||
|
manga_id: upsert.manga_id,
|
||||||
|
metadata_status: upsert.status,
|
||||||
|
cover_fetched,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn resync_chapter(&self, chapter_id: Uuid) -> anyhow::Result<ChapterResyncOutcome> {
|
||||||
|
let row = repo::chapter::dispatch_target(&self.db, chapter_id)
|
||||||
|
.await
|
||||||
|
.context("look up chapter_sources for resync")?;
|
||||||
|
let Some((manga_id, source_url, _title, _number)) = row else {
|
||||||
|
return Err(ResyncError::NoChapterSource.into());
|
||||||
|
};
|
||||||
|
|
||||||
|
let lease = self
|
||||||
|
.browser_manager
|
||||||
|
.acquire()
|
||||||
|
.await
|
||||||
|
.context("acquire browser lease for chapter resync")?;
|
||||||
|
let result = content::sync_chapter_content(
|
||||||
|
&lease,
|
||||||
|
&self.db,
|
||||||
|
self.storage.as_ref(),
|
||||||
|
&self.http,
|
||||||
|
&self.rate,
|
||||||
|
chapter_id,
|
||||||
|
manga_id,
|
||||||
|
&source_url,
|
||||||
|
true,
|
||||||
|
&self.download_allowlist,
|
||||||
|
self.max_image_bytes,
|
||||||
|
self.tor.as_deref(),
|
||||||
|
// Admin resync isn't a daemon worker slot — no live status.
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
drop(lease);
|
||||||
|
|
||||||
|
match result? {
|
||||||
|
SyncOutcome::Fetched { pages } => {
|
||||||
|
Ok(ChapterResyncOutcome::Fetched { chapter_id, pages })
|
||||||
|
}
|
||||||
|
SyncOutcome::Skipped => Ok(ChapterResyncOutcome::Skipped {
|
||||||
|
chapter_id,
|
||||||
|
reason: "chapter already had pages on disk".to_string(),
|
||||||
|
}),
|
||||||
|
SyncOutcome::SessionExpired => {
|
||||||
|
anyhow::bail!("source session expired — operator must refresh PHPSESSID")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
558
backend/src/crawler/safety.rs
Normal file
558
backend/src/crawler/safety.rs
Normal file
@@ -0,0 +1,558 @@
|
|||||||
|
//! Defensive helpers for the image-download paths.
|
||||||
|
//!
|
||||||
|
//! Two threats this module addresses:
|
||||||
|
//!
|
||||||
|
//! - **SSRF**: a scraped chapter or manga page can embed an absolute
|
||||||
|
//! `<img src="http://10.0.0.1/...">`. The crawler runs inside the
|
||||||
|
//! backend container with intra-compose access to `postgres:5432`
|
||||||
|
//! and possibly other internal services; without a host check the
|
||||||
|
//! crawler would happily probe them. [`is_safe_url`] rejects
|
||||||
|
//! anything whose host isn't on the operator-configured allowlist,
|
||||||
|
//! plus any IP literal in RFC1918 / loopback / link-local / unique-
|
||||||
|
//! local space (including IPv4-mapped IPv6 like `::ffff:127.0.0.1`)
|
||||||
|
//! as a second defence for the case where a private IP literal ends up
|
||||||
|
//! on the allowlist itself (resolved DNS answers are never inspected).
|
||||||
|
//!
|
||||||
|
//! **DNS rebinding is not covered.** A hostname like `cdn.allowed.com`
|
||||||
|
//! that *resolves* to `127.0.0.1` via hostile DNS bypasses the IP
|
||||||
|
//! check entirely — `is_safe_url` only inspects URL strings, not
|
||||||
|
//! resolved IPs. Mitigating that requires a custom reqwest resolver
|
||||||
|
//! that filters IPs after DNS, which would mean rebuilding reqwest's
|
||||||
|
//! connector. The allowlist + good operator DNS hygiene is the
|
||||||
|
//! realistic mitigation today.
|
||||||
|
//!
|
||||||
|
//! - **Unbounded download**: `Response::bytes().await` reads the full
|
||||||
|
//! body before returning. A malicious source serving a 10 GiB image
|
||||||
|
//! would fill memory and then disk. [`accumulate_capped`] streams
|
||||||
|
//! the body chunk-by-chunk into a [`bytes::BytesMut`] and bails as
|
||||||
|
//! soon as the running total exceeds the cap.
|
||||||
|
//!
|
||||||
|
//! Both helpers are pure-data: the SSRF check is keyed off a parsed
|
||||||
|
//! URL string, and the byte accumulator is keyed off a generic stream.
|
||||||
|
//! Easy to unit-test without a live network or browser.
|
||||||
|
|
||||||
|
use std::net::IpAddr;
|
||||||
|
|
||||||
|
use anyhow::{bail, Context};
|
||||||
|
use bytes::BytesMut;
|
||||||
|
use futures_util::StreamExt;
|
||||||
|
use reqwest::Url;
|
||||||
|
|
||||||
|
/// Default upper bound, in bytes, for a single downloaded image (32 MiB).
///
/// Page images are generally below 2 MiB; the extra headroom is for
/// high-resolution covers while still cutting off a misbehaving CDN.
/// Override via `CRAWLER_MAX_IMAGE_BYTES`.
pub const DEFAULT_MAX_IMAGE_BYTES: usize = 32 << 20;
|
||||||
|
|
||||||
|
/// The set of hosts the crawler is permitted to download from.
///
/// Matching is exact and ASCII-case-insensitive; sub-domains are never
/// implied. A freshly built (or `Default`) allowlist holds no hosts and
/// therefore admits nothing, so the only way a URL gets through is by
/// matching an explicitly added catalog/CDN entry.
///
/// `allow_any` flips the host check off entirely (the private-IP and
/// scheme checks in [`is_safe_url`] still apply). It exists for operators
/// whose sources shard images across numbered CDN subdomains (`cdn1`,
/// `cdn2`, …) where enumerating each host upfront is impractical. Off by
/// default.
#[derive(Clone, Debug, Default)]
pub struct DownloadAllowlist {
    /// ASCII-lower-cased host names, free of duplicates and empty strings.
    hosts: Vec<String>,
    /// When set, [`DownloadAllowlist::contains`] accepts every host.
    allow_any: bool,
}

impl DownloadAllowlist {
    /// Create an allowlist with no hosts and the bypass switched off.
    pub fn new() -> Self {
        Self::default()
    }

    /// Bypass the host allowlist. Scheme, localhost, and private-IP
    /// checks in [`is_safe_url`] continue to apply — this only opens
    /// up public hosts that weren't pre-enumerated.
    pub fn allow_any() -> Self {
        Self {
            allow_any: true,
            ..Self::default()
        }
    }

    /// Add a host (case-insensitive match). Sub-domains are *not*
    /// implied: pass `cdn.example.com` and `example.com` separately
    /// if both should be reachable. Empty strings and hosts already
    /// present are ignored.
    pub fn allow(mut self, host: impl Into<String>) -> Self {
        let mut candidate: String = host.into();
        candidate.make_ascii_lowercase();
        let redundant = candidate.is_empty() || self.hosts.contains(&candidate);
        if !redundant {
            self.hosts.push(candidate);
        }
        self
    }

    /// True when no explicit host has been added. The `allow_any` switch
    /// is not taken into account.
    pub fn is_empty(&self) -> bool {
        self.hosts.is_empty()
    }

    /// Whether `host` is admitted: always when `allow_any` is set,
    /// otherwise only on an ASCII-case-insensitive exact match.
    pub fn contains(&self, host: &str) -> bool {
        // Stored entries are already lower-cased, so a case-insensitive
        // comparison is equivalent to lower-casing `host` first.
        self.allow_any || self.hosts.iter().any(|known| known.eq_ignore_ascii_case(host))
    }
}
|
||||||
|
|
||||||
|
/// Verify a URL is safe for the crawler to fetch.
|
||||||
|
///
|
||||||
|
/// Rejects:
|
||||||
|
/// - non-http(s) schemes (file://, gopher://, …),
|
||||||
|
/// - any IP literal in private / loopback / link-local / unique-local
|
||||||
|
/// space (defense in depth — a DNS allowlist alone wouldn't cover an
|
||||||
|
/// attacker that places an entry like `cdn.evil` pointing at
|
||||||
|
/// `192.168.1.1`),
|
||||||
|
/// - the literal hostname `localhost`,
|
||||||
|
/// - hosts that aren't on the supplied allowlist.
|
||||||
|
///
|
||||||
|
/// An empty allowlist rejects everything (the conservative default —
|
||||||
|
/// callers must explicitly allow the catalog and CDN hosts).
|
||||||
|
pub fn is_safe_url(raw_url: &str, allow: &DownloadAllowlist) -> Result<(), UrlSafetyError> {
|
||||||
|
let url = Url::parse(raw_url).map_err(|_| UrlSafetyError::Unparseable)?;
|
||||||
|
let scheme = url.scheme();
|
||||||
|
if scheme != "http" && scheme != "https" {
|
||||||
|
return Err(UrlSafetyError::BadScheme(scheme.to_string()));
|
||||||
|
}
|
||||||
|
let host = url.host_str().ok_or(UrlSafetyError::NoHost)?;
|
||||||
|
let lower_host = host.to_ascii_lowercase();
|
||||||
|
if lower_host == "localhost" {
|
||||||
|
return Err(UrlSafetyError::Loopback);
|
||||||
|
}
|
||||||
|
// Reject IP literals in private/loopback ranges regardless of the
|
||||||
|
// allowlist — if someone puts an IP literal on the allowlist they
|
||||||
|
// almost certainly didn't mean a private range.
|
||||||
|
// reqwest::Url normalises IPv6 literals as `[::1]` (brackets
|
||||||
|
// included) in `host_str()`. Strip the brackets before parsing.
|
||||||
|
let ip_candidate = lower_host
|
||||||
|
.strip_prefix('[')
|
||||||
|
.and_then(|s| s.strip_suffix(']'))
|
||||||
|
.unwrap_or(&lower_host);
|
||||||
|
if let Ok(ip) = ip_candidate.parse::<IpAddr>() {
|
||||||
|
if is_private_ip(&ip) {
|
||||||
|
return Err(UrlSafetyError::PrivateIp(ip));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !allow.contains(&lower_host) {
|
||||||
|
return Err(UrlSafetyError::HostNotAllowed(lower_host));
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Report whether `ip` lies in address space the crawler must never fetch
/// from: loopback, RFC1918 private, link-local, unspecified, broadcast,
/// CGNAT, `0.0.0.0/8`, and the IPv6 unique-local / link-local ranges.
///
/// IPv6 literals that merely *wrap* an IPv4 address are judged by the
/// embedded IPv4 address. Three wrappers are recognised: IPv4-mapped
/// (`::ffff:0:0/96`), the deprecated IPv4-compatible form (`::/96`), and
/// the NAT64 well-known prefix (`64:ff9b::/96`). Without this, a literal
/// such as `64:ff9b::10.0.0.1` would be treated as a public address.
fn is_private_ip(ip: &IpAddr) -> bool {
    match ip {
        IpAddr::V4(v4) => {
            let [first, second, ..] = v4.octets();
            v4.is_loopback()
                || v4.is_private()
                || v4.is_link_local()
                || v4.is_unspecified()
                || v4.is_broadcast()
                // CGNAT 100.64.0.0/10
                || (first == 100 && (second & 0xC0) == 64)
                // 0.0.0.0/8 is special-use ("this network")
                || first == 0
        }
        IpAddr::V6(v6) => {
            // `Ipv6Addr::is_loopback()` only matches `::1` exactly, which is
            // why the embedded-IPv4 forms need their own handling below.
            if v6.is_loopback() || v6.is_unspecified() {
                return true;
            }

            // Last 32 bits carry an IPv4 address for these prefixes.
            let embedded = match v6.segments() {
                // IPv4-compatible (`::a.b.c.d`) and IPv4-mapped (`::ffff:a.b.c.d`).
                [0, 0, 0, 0, 0, 0 | 0xffff, hi, lo]
                // NAT64 well-known prefix.
                | [0x64, 0xff9b, 0, 0, 0, 0, hi, lo] => Some((hi, lo)),
                _ => None,
            };
            if let Some((hi, lo)) = embedded {
                let [a, b] = hi.to_be_bytes();
                let [c, d] = lo.to_be_bytes();
                let v4 = std::net::Ipv4Addr::new(a, b, c, d);
                return is_private_ip(&IpAddr::V4(v4));
            }

            let prefix = v6.segments()[0];
            // fc00::/7 unique-local
            (prefix & 0xfe00) == 0xfc00
                // fe80::/10 link-local
                || (prefix & 0xffc0) == 0xfe80
        }
    }
}
|
||||||
|
|
||||||
|
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
|
||||||
|
pub enum UrlSafetyError {
|
||||||
|
#[error("URL is not parseable")]
|
||||||
|
Unparseable,
|
||||||
|
#[error("scheme {0:?} is not http or https")]
|
||||||
|
BadScheme(String),
|
||||||
|
#[error("URL is missing a host")]
|
||||||
|
NoHost,
|
||||||
|
#[error("host points at the loopback interface")]
|
||||||
|
Loopback,
|
||||||
|
#[error("host is a private/internal IP: {0}")]
|
||||||
|
PrivateIp(IpAddr),
|
||||||
|
#[error("host {0:?} is not on the crawler download allowlist")]
|
||||||
|
HostNotAllowed(String),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Drain a byte stream into a single buffer, bailing out as soon as
|
||||||
|
/// the running total exceeds `max_bytes`. Generic over the stream so
|
||||||
|
/// it's testable without a live HTTP response.
|
||||||
|
pub async fn accumulate_capped<S, E>(stream: S, max_bytes: usize) -> anyhow::Result<bytes::Bytes>
|
||||||
|
where
|
||||||
|
S: futures_core::Stream<Item = Result<bytes::Bytes, E>>,
|
||||||
|
E: std::error::Error + Send + Sync + 'static,
|
||||||
|
{
|
||||||
|
let mut buf = BytesMut::new();
|
||||||
|
let mut stream = std::pin::pin!(stream);
|
||||||
|
while let Some(chunk) = stream.next().await {
|
||||||
|
let chunk = chunk.map_err(|e| anyhow::anyhow!("stream chunk: {e}"))?;
|
||||||
|
if buf.len().saturating_add(chunk.len()) > max_bytes {
|
||||||
|
bail!(
|
||||||
|
"response exceeds {max_bytes}-byte cap (received >{}+{})",
|
||||||
|
buf.len(),
|
||||||
|
chunk.len()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
buf.extend_from_slice(&chunk);
|
||||||
|
}
|
||||||
|
Ok(buf.freeze())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Send `req` and stream the response into a length-limited buffer.
|
||||||
|
/// Combines [`is_safe_url`] check + [`accumulate_capped`] so each
|
||||||
|
/// call-site is one line.
|
||||||
|
pub async fn fetch_bytes_capped(
|
||||||
|
http: &reqwest::Client,
|
||||||
|
url: &str,
|
||||||
|
referer: Option<&str>,
|
||||||
|
allow: &DownloadAllowlist,
|
||||||
|
max_bytes: usize,
|
||||||
|
) -> anyhow::Result<bytes::Bytes> {
|
||||||
|
is_safe_url(url, allow).with_context(|| format!("reject unsafe URL {url}"))?;
|
||||||
|
let mut req = http.get(url);
|
||||||
|
if let Some(r) = referer {
|
||||||
|
req = req.header(reqwest::header::REFERER, r);
|
||||||
|
}
|
||||||
|
let resp = req
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("GET {url}"))?
|
||||||
|
.error_for_status()
|
||||||
|
.with_context(|| format!("non-2xx for {url}"))?;
|
||||||
|
accumulate_capped(resp.bytes_stream(), max_bytes)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("download body for {url}"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// True when `bytes` sniffs as one of the *renderable* image formats
|
||||||
|
/// the `/files/*key` endpoint can serve with a correct Content-Type:
|
||||||
|
/// JPEG, PNG, WebP, GIF, AVIF. Matches the upload pipeline's
|
||||||
|
/// whitelist in `upload::parse_image`.
|
||||||
|
///
|
||||||
|
/// `infer::MatcherType::Image` is intentionally NOT used — it also
|
||||||
|
/// matches BMP, TIFF, HEIF, ICO, PSD, and JP2. Those would sniff as
|
||||||
|
/// "image" here but [`api::files::content_type_for`] would fall back
|
||||||
|
/// to `application/octet-stream`, prompting browsers to download
|
||||||
|
/// instead of render. Keep the two layers aligned.
|
||||||
|
pub fn looks_like_image(bytes: &[u8]) -> bool {
|
||||||
|
matches!(
|
||||||
|
infer::get(bytes).map(|k| k.mime_type()),
|
||||||
|
Some("image/jpeg" | "image/png" | "image/webp" | "image/gif" | "image/avif")
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use futures_util::stream;

    /// Allowlist holding exactly one host.
    fn allow_just(host: &str) -> DownloadAllowlist {
        DownloadAllowlist::new().allow(host)
    }

    /// Shorthand for the chunk lists fed to `accumulate_capped`.
    type Chunks = Vec<Result<bytes::Bytes, std::io::Error>>;

    // ---- allow_any mode -------------------------------------------------

    #[test]
    fn allow_any_admits_arbitrary_public_host() {
        // Operators who can't pre-enumerate a numbered-CDN fleet
        // (cdn1, cdn2, …) opt into allow_any. Any public host passes.
        let allow = DownloadAllowlist::allow_any();
        assert_eq!(is_safe_url("https://cdn7.random.tld/x.jpg", &allow), Ok(()));
        assert_eq!(is_safe_url("https://anything-goes.example/", &allow), Ok(()));
    }

    #[test]
    fn allow_any_still_blocks_private_ips() {
        // The bypass covers the host-allowlist check only, not the SSRF
        // defence. Private/loopback IPs stay refused.
        let allow = DownloadAllowlist::allow_any();
        let targets = [
            "http://10.0.0.1/",
            "http://192.168.1.1/",
            "http://169.254.169.254/",
            "http://127.0.0.1/",
            "http://[::1]/",
            "http://[::ffff:127.0.0.1]/",
        ];
        for url in targets {
            let verdict = is_safe_url(url, &allow);
            assert!(
                matches!(verdict, Err(UrlSafetyError::PrivateIp(_))),
                "allow_any must still reject {url}"
            );
        }
    }

    #[test]
    fn allow_any_still_blocks_localhost() {
        let allow = DownloadAllowlist::allow_any();
        assert_eq!(
            is_safe_url("http://localhost:8080/", &allow),
            Err(UrlSafetyError::Loopback)
        );
    }

    #[test]
    fn allow_any_still_blocks_non_http_schemes() {
        let allow = DownloadAllowlist::allow_any();
        let verdict = is_safe_url("file:///etc/passwd", &allow);
        assert!(matches!(verdict, Err(UrlSafetyError::BadScheme(_))));
    }

    // ---- explicit allowlist ---------------------------------------------

    #[test]
    fn safe_url_allows_listed_host() {
        let allow = allow_just("cdn.example.com");
        assert_eq!(is_safe_url("https://cdn.example.com/img.jpg", &allow), Ok(()));
    }

    #[test]
    fn safe_url_blocks_unlisted_host() {
        let allow = allow_just("cdn.example.com");
        assert_eq!(
            is_safe_url("https://evil.example.org/img.jpg", &allow),
            Err(UrlSafetyError::HostNotAllowed("evil.example.org".to_string()))
        );
    }

    #[test]
    fn safe_url_blocks_localhost_even_if_allowlisted() {
        let allow = allow_just("localhost");
        assert_eq!(
            is_safe_url("http://localhost:8080/", &allow),
            Err(UrlSafetyError::Loopback)
        );
    }

    #[test]
    fn safe_url_blocks_loopback_ipv4() {
        let allow = allow_just("127.0.0.1");
        let verdict = is_safe_url("http://127.0.0.1/", &allow);
        assert!(matches!(verdict, Err(UrlSafetyError::PrivateIp(_))));
    }

    #[test]
    fn safe_url_blocks_rfc1918() {
        let allow = allow_just("10.0.0.1");
        let targets = [
            "http://10.0.0.1/",
            "http://192.168.1.1/",
            "http://172.16.0.5/",
            "http://172.31.255.255/",
        ];
        for url in targets {
            let verdict = is_safe_url(url, &allow);
            assert!(
                matches!(verdict, Err(UrlSafetyError::PrivateIp(_))),
                "should reject {url}"
            );
        }
    }

    #[test]
    fn safe_url_blocks_link_local() {
        // 169.254.169.254 is the AWS/GCP metadata service — the most
        // dangerous SSRF target on a default cloud VM.
        let allow = allow_just("169.254.169.254");
        let verdict = is_safe_url("http://169.254.169.254/", &allow);
        assert!(matches!(verdict, Err(UrlSafetyError::PrivateIp(_))));
    }

    #[test]
    fn safe_url_blocks_ipv6_loopback_and_ula() {
        // reqwest::Url reports IPv6 literals as `[::1]` with brackets,
        // which doesn't parse as `IpAddr` directly. The implementation
        // strips them; the allowlist entry keeps them.
        for bracketed in ["[::1]", "[fd00::1]"] {
            let allow = allow_just(bracketed);
            let url = format!("http://{bracketed}/");
            let err = is_safe_url(&url, &allow).unwrap_err();
            assert!(
                matches!(err, UrlSafetyError::PrivateIp(_)),
                "expected PrivateIp, got {err:?}"
            );
        }
    }

    #[test]
    fn safe_url_blocks_ipv4_mapped_ipv6_loopback() {
        // `Ipv6Addr::is_loopback()` only matches `::1` exactly, so
        // `::ffff:127.0.0.1` would slip through without the
        // to_ipv4_mapped() unwrap in is_private_ip.
        let allow = allow_just("[::ffff:127.0.0.1]");
        let err = is_safe_url("http://[::ffff:127.0.0.1]/", &allow).unwrap_err();
        assert!(
            matches!(err, UrlSafetyError::PrivateIp(_)),
            "expected PrivateIp, got {err:?}"
        );
    }

    #[test]
    fn safe_url_blocks_ipv4_mapped_ipv6_rfc1918() {
        let allow = allow_just("[::ffff:10.0.0.1]");
        let verdict = is_safe_url("http://[::ffff:10.0.0.1]/", &allow);
        assert!(matches!(verdict, Err(UrlSafetyError::PrivateIp(_))));
    }

    #[test]
    fn safe_url_blocks_non_http_schemes() {
        let allow = allow_just("anywhere");
        for url in ["file:///etc/passwd", "gopher://anywhere:70/"] {
            let verdict = is_safe_url(url, &allow);
            assert!(matches!(verdict, Err(UrlSafetyError::BadScheme(_))));
        }
    }

    #[test]
    fn safe_url_rejects_unparseable() {
        let allow = allow_just("anywhere");
        assert_eq!(
            is_safe_url("not a url", &allow),
            Err(UrlSafetyError::Unparseable)
        );
    }

    #[test]
    fn safe_url_empty_allowlist_rejects_everything() {
        let allow = DownloadAllowlist::new();
        let verdict = is_safe_url("https://cdn.example.com/img.jpg", &allow);
        assert!(matches!(verdict, Err(UrlSafetyError::HostNotAllowed(_))));
    }

    #[test]
    fn allowlist_matches_case_insensitively() {
        let allow = DownloadAllowlist::new().allow("CDN.Example.COM");
        for url in ["https://cdn.example.com/x.jpg", "https://CDN.EXAMPLE.com/x.jpg"] {
            assert!(is_safe_url(url, &allow).is_ok());
        }
    }

    // ---- accumulate_capped ----------------------------------------------

    #[tokio::test]
    async fn accumulate_capped_returns_full_body_under_cap() {
        let chunks: Chunks = vec![
            Ok(bytes::Bytes::from_static(b"hello ")),
            Ok(bytes::Bytes::from_static(b"world")),
        ];
        let body = accumulate_capped(stream::iter(chunks), 100).await.unwrap();
        assert_eq!(body.as_ref(), b"hello world");
    }

    #[tokio::test]
    async fn accumulate_capped_bails_past_cap() {
        // 50 + 60 bytes against a 100-byte cap.
        let chunks: Chunks = [50usize, 60]
            .into_iter()
            .map(|len| Ok(bytes::Bytes::from(vec![0u8; len])))
            .collect();
        let err = accumulate_capped(stream::iter(chunks), 100).await.unwrap_err();
        assert!(err.to_string().contains("100-byte cap"));
    }

    #[tokio::test]
    async fn accumulate_capped_surfaces_stream_errors() {
        let chunks: Chunks = vec![
            Ok(bytes::Bytes::from_static(b"ok")),
            Err(std::io::Error::other("network blip")),
        ];
        let err = accumulate_capped(stream::iter(chunks), 100).await.unwrap_err();
        assert!(err.to_string().contains("network blip"));
    }

    // ---- looks_like_image -----------------------------------------------

    #[test]
    fn looks_like_image_accepts_jpeg() {
        // JPEG SOI + APP0 segment.
        assert!(looks_like_image(b"\xff\xd8\xff\xe0\x00\x10JFIF"));
    }

    #[test]
    fn looks_like_image_accepts_png() {
        // 8-byte PNG signature followed by four zero bytes.
        assert!(looks_like_image(b"\x89PNG\r\n\x1a\n\x00\x00\x00\x00"));
    }

    #[test]
    fn looks_like_image_rejects_html_disguised_as_image() {
        assert!(!looks_like_image(b"<html><body>not an image</body></html>"));
    }

    #[test]
    fn looks_like_image_rejects_empty() {
        assert!(!looks_like_image(&[]));
    }

    #[test]
    fn looks_like_image_rejects_renderable_but_unsupported_formats() {
        // BMP, TIFF and ICO are `infer::MatcherType::Image`, but the
        // /files/*key handler doesn't have Content-Type mappings for
        // them, so they'd be served as application/octet-stream and
        // download instead of render. Reject at the crawler so we
        // never land them in storage.

        // BMP magic: "BM" + 4-byte size.
        let bmp: [u8; 12] = [b'B', b'M', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
        assert!(!looks_like_image(&bmp), "BMP must be rejected (not renderable by /files)");

        // TIFF little-endian magic: "II" + 42.
        let tiff: [u8; 8] = [0x49, 0x49, 0x2a, 0x00, 0, 0, 0, 0];
        assert!(!looks_like_image(&tiff), "TIFF must be rejected");

        // ICO magic: 0x00,0x00,0x01,0x00.
        let ico: [u8; 22] = [
            0x00, 0x00, 0x01, 0x00, 1, 0, 16, 16, 0, 0, 1, 0, 0x18, 0, 0x40, 0, 0, 0, 0x16, 0, 0, 0,
        ];
        assert!(!looks_like_image(&ico), "ICO must be rejected");
    }

    #[test]
    fn looks_like_image_accepts_webp_gif_avif() {
        // Cover the three remaining whitelisted formats so a future
        // tightening that drops one would fail noisily.
        let webp = b"RIFF\x00\x00\x00\x00WEBPVP8 ";
        assert!(looks_like_image(webp));

        let gif = b"GIF87a\x00\x00\x00\x00";
        assert!(looks_like_image(gif));

        let avif = b"\x00\x00\x00\x18ftypavif\x00\x00\x00\x00mif1avif";
        assert!(looks_like_image(avif));
    }
}
|
||||||
635
backend/src/crawler/session.rs
Normal file
635
backend/src/crawler/session.rs
Normal file
@@ -0,0 +1,635 @@
|
|||||||
|
//! PHPSESSID injection + login probe.
|
||||||
|
//!
|
||||||
|
//! The catalog site we crawl renders chapter pages as a single multi-
|
||||||
|
//! page list only for logged-in users. We don't try to bypass the
|
||||||
|
//! login (CAPTCHA wall) — instead the operator pastes their browser's
|
||||||
|
//! `PHPSESSID` cookie into `CRAWLER_PHPSESSID` and the crawler injects
|
||||||
|
//! it into Chromium *and* reqwest before the first navigation.
|
||||||
|
//!
|
||||||
|
//! Two things the cookie alone doesn't give us:
|
||||||
|
//! 1. The cookie value is only meaningful to the *server* — we have
|
||||||
|
//! no way to predict from the value alone whether it's still valid.
|
||||||
|
//! `verify_session` does a navigation and inspects the probe page
|
||||||
|
//! for three outcomes: broken-page response (transient — retry the
|
||||||
|
//! probe), `#logo` present but `#avatar_menu` absent (genuine logout
|
||||||
|
//! — bail loudly), or both present (authenticated). The earlier
|
||||||
|
//! avatar-only check conflated "site is hiccuping" with "session is
|
||||||
|
//! dead" and refused to start the crawler when the site had a brief
|
||||||
|
//! 503.
|
||||||
|
//! 2. The reqwest client (used for cover and chapter-image downloads)
|
||||||
|
//! has its own cookie store; we seed it for the catalog host only.
|
||||||
|
//! CDN hosts are deliberately *not* given the cookie — they serve
|
||||||
|
//! image bytes by signed URLs and don't need it.
|
||||||
|
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use anyhow::{anyhow, Context};
|
||||||
|
use chromiumoxide::browser::Browser;
|
||||||
|
use chromiumoxide::cdp::browser_protocol::network::CookieParam;
|
||||||
|
|
||||||
|
use crate::crawler::detect::{has_logo_sentinel, is_broken_page_body};
|
||||||
|
|
||||||
|
/// What a probe-page response says about the injected session.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SessionProbe {
    /// Both `#logo` and `#avatar_menu` are on the page: the session is
    /// valid.
    Ok,
    /// `#logo` is there but `#avatar_menu` is not: the site rendered its
    /// normal layout for an unauthenticated visitor, so PHPSESSID needs
    /// refreshing.
    Unauthenticated,
    /// The body matches the broken-page signature, or `#logo` is missing:
    /// the site is hiccuping. Callers retry the probe rather than blaming
    /// the session.
    Transient,
}
|
||||||
|
|
||||||
|
/// Re-export so existing callers keep working after the helper moved
|
||||||
|
/// to `crawler::url_utils`. The body lives there.
|
||||||
|
pub use crate::crawler::url_utils::registrable_domain;
|
||||||
|
|
||||||
|
/// Inject the PHPSESSID cookie into the browser's cookie store for the
|
||||||
|
/// catalog domain. Must be called before any navigation that depends on
|
||||||
|
/// authentication; subsequent navigations include the cookie
|
||||||
|
/// automatically.
|
||||||
|
pub async fn inject_phpsessid(
|
||||||
|
browser: &Browser,
|
||||||
|
sid: &str,
|
||||||
|
cookie_domain: &str,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let cookie = CookieParam {
|
||||||
|
name: "PHPSESSID".to_string(),
|
||||||
|
value: sid.to_string(),
|
||||||
|
url: None,
|
||||||
|
domain: Some(cookie_domain.to_string()),
|
||||||
|
path: Some("/".to_string()),
|
||||||
|
secure: None,
|
||||||
|
http_only: Some(true),
|
||||||
|
same_site: None,
|
||||||
|
expires: None,
|
||||||
|
priority: None,
|
||||||
|
same_party: None,
|
||||||
|
source_scheme: None,
|
||||||
|
source_port: None,
|
||||||
|
partition_key: None,
|
||||||
|
};
|
||||||
|
browser
|
||||||
|
.set_cookies(vec![cookie])
|
||||||
|
.await
|
||||||
|
.context("set PHPSESSID in chromium cookie store")?;
|
||||||
|
tracing::info!(domain = cookie_domain, "injected PHPSESSID into browser");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Three-way classification of a probe-page response. Pure over HTML so
|
||||||
|
/// it's unit-testable without a real browser. Order matters: a body
|
||||||
|
/// matching the broken-page template is `Transient` even if the page
|
||||||
|
/// happens to contain `#avatar_menu` HTML somewhere — trust the universal
|
||||||
|
/// site signal over a stray selector match.
|
||||||
|
pub fn classify_probe(html: &str) -> SessionProbe {
|
||||||
|
if is_broken_page_body(html) {
|
||||||
|
return SessionProbe::Transient;
|
||||||
|
}
|
||||||
|
let doc = scraper::Html::parse_document(html);
|
||||||
|
if !has_logo_sentinel(&doc) {
|
||||||
|
return SessionProbe::Transient;
|
||||||
|
}
|
||||||
|
let avatar_sel = scraper::Selector::parse("#avatar_menu").unwrap();
|
||||||
|
if doc.select(&avatar_sel).next().is_some() {
|
||||||
|
SessionProbe::Ok
|
||||||
|
} else {
|
||||||
|
SessionProbe::Unauthenticated
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// What a chapter (reader) page response says about the session.
///
/// Reader pages don't render `#logo`, so [`SessionProbe`] and
/// [`classify_probe`] can't be reused as-is. The chapter-specific marker
/// is `a#pic_container`; see [`classify_chapter_probe`] for how the
/// variants are chosen.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChapterProbe {
    /// `a#pic_container` is present: the reader rendered. Whether
    /// `#avatar_menu` is there too is only informational — if the reader
    /// loaded, the session is by definition still good.
    Ok,
    /// The site rendered a "logged out" / "please log in" page: no
    /// reader, no broken-page body, and no avatar widget either. This is
    /// the genuine expired-session case, as opposed to a transient site
    /// hiccup.
    Unauthenticated,
    /// Broken-page body, or the reader didn't render although the user is
    /// still logged in (avatar widget present). Callers should retry
    /// rather than blame the session.
    Transient,
}
|
||||||
|
|
||||||
|
pub fn classify_chapter_probe(html: &str) -> ChapterProbe {
|
||||||
|
if is_broken_page_body(html) {
|
||||||
|
return ChapterProbe::Transient;
|
||||||
|
}
|
||||||
|
let doc = scraper::Html::parse_document(html);
|
||||||
|
let container = scraper::Selector::parse("a#pic_container").unwrap();
|
||||||
|
if doc.select(&container).next().is_some() {
|
||||||
|
return ChapterProbe::Ok;
|
||||||
|
}
|
||||||
|
let avatar = scraper::Selector::parse("#avatar_menu").unwrap();
|
||||||
|
if doc.select(&avatar).next().is_some() {
|
||||||
|
// Logged-in user, but the reader didn't render — most likely
|
||||||
|
// the layout shifted or the site is serving an interstitial.
|
||||||
|
ChapterProbe::Transient
|
||||||
|
} else {
|
||||||
|
// No reader, no avatar, no broken-body marker — site rendered
|
||||||
|
// the "please log in" page, which is the genuine session-
|
||||||
|
// expired signal on this route.
|
||||||
|
ChapterProbe::Unauthenticated
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Total number of probe attempts allowed at startup when the probe page
/// keeps coming back transient. Kept small but non-zero: a few seconds of
/// site hiccup during startup shouldn't fail the operator with
/// "PHPSESSID expired" when the session is actually fine.
const PROBE_MAX_ATTEMPTS: u32 = 3_u32;

/// Pause between consecutive probe attempts (two seconds).
const PROBE_RETRY_DELAY: Duration = Duration::from_millis(2_000);
|
||||||
|
|
||||||
|
/// Navigate to `probe_url` and classify the response. Retries the probe
|
||||||
|
/// on `Transient` outcomes (broken-page body, missing `#logo`); fails
|
||||||
|
/// fast on `Unauthenticated`; returns `Ok(())` on success.
|
||||||
|
///
|
||||||
|
/// This burns one navigation per attempt against the catalog's rate
|
||||||
|
/// limiter. The trade is worth it — failing here costs ~1s; failing 30
|
||||||
|
/// minutes into a backfill costs 30 minutes.
|
||||||
|
pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> {
|
||||||
|
verify_session_with_recircuit(browser, probe_url, None, 0).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Like [`verify_session`] but, when `tor` is `Some`, signals
|
||||||
|
/// `SIGNAL NEWNYM` between retries on transient pages AND treats
|
||||||
|
/// `Unauthenticated` as recoverable (up to `tor_max_attempts` total
|
||||||
|
/// probes, calling NEWNYM between each).
|
||||||
|
///
|
||||||
|
/// `verify_session` is `verify_session_with_recircuit(..., None, _)`,
|
||||||
|
/// which collapses the `Unauthenticated` budget to 1 attempt — i.e.
|
||||||
|
/// fail-fast, exactly the pre-TOR behavior.
|
||||||
|
pub async fn verify_session_with_recircuit(
|
||||||
|
browser: &Browser,
|
||||||
|
probe_url: &str,
|
||||||
|
tor: Option<&crate::crawler::tor::TorController>,
|
||||||
|
tor_max_attempts: u32,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let unauth_max_attempts = if tor.is_some() { tor_max_attempts.max(1) } else { 1 };
|
||||||
|
run_session_probe_loop(
|
||||||
|
|| fetch_probe_html(browser, probe_url),
|
||||||
|
|| async {
|
||||||
|
if let Some(t) = tor {
|
||||||
|
if let Err(e) = t.new_identity().await {
|
||||||
|
tracing::warn!(error = %e, "TOR NEWNYM failed; continuing with same circuit");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
PROBE_MAX_ATTEMPTS,
|
||||||
|
unauth_max_attempts,
|
||||||
|
PROBE_RETRY_DELAY,
|
||||||
|
probe_url,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pure-over-IO loop body for the session probe. Generic over the
|
||||||
|
/// fetch and recircuit closures so it can be unit-tested without a
|
||||||
|
/// real browser or TOR daemon.
|
||||||
|
///
|
||||||
|
/// Both budgets count **total attempts**, including the first — so
|
||||||
|
/// `transient_max_attempts = 3` allows 3 fetches and 2 recircuits
|
||||||
|
/// between them, and `unauth_max_attempts = 1` means "fail-fast, no
|
||||||
|
/// retry". This matches [`crate::crawler::detect::retry_on_transient`]
|
||||||
|
/// and the content-path recircuit loop.
|
||||||
|
///
|
||||||
|
/// Outcomes:
|
||||||
|
/// - `SessionProbe::Ok` → return `Ok(())`.
|
||||||
|
/// - `SessionProbe::Unauthenticated` → recircuit + retry while
|
||||||
|
/// under the unauth budget. After the cap, bail with the
|
||||||
|
/// "PHPSESSID expired" diagnostic, mentioning the attempt count so
|
||||||
|
/// a TOR-misconfig diagnosis is easier.
|
||||||
|
/// - `SessionProbe::Transient` → same shape against the transient
|
||||||
|
/// budget; bails with "site down or rate-limiting" after the cap.
|
||||||
|
async fn run_session_probe_loop<F, Fut, R, RFut>(
|
||||||
|
mut fetch_html: F,
|
||||||
|
mut recircuit: R,
|
||||||
|
transient_max_attempts: u32,
|
||||||
|
unauth_max_attempts: u32,
|
||||||
|
retry_delay: Duration,
|
||||||
|
probe_url_for_msg: &str,
|
||||||
|
) -> anyhow::Result<()>
|
||||||
|
where
|
||||||
|
F: FnMut() -> Fut,
|
||||||
|
Fut: std::future::Future<Output = anyhow::Result<String>>,
|
||||||
|
R: FnMut() -> RFut,
|
||||||
|
RFut: std::future::Future<Output = ()>,
|
||||||
|
{
|
||||||
|
debug_assert!(transient_max_attempts >= 1);
|
||||||
|
debug_assert!(unauth_max_attempts >= 1);
|
||||||
|
let mut transient_attempts = 0u32;
|
||||||
|
let mut unauth_attempts = 0u32;
|
||||||
|
loop {
|
||||||
|
let html = fetch_html().await?;
|
||||||
|
match classify_probe(&html) {
|
||||||
|
SessionProbe::Ok => {
|
||||||
|
tracing::info!(
|
||||||
|
transient_attempts,
|
||||||
|
unauth_attempts,
|
||||||
|
"session probe ok — #logo + #avatar_menu present"
|
||||||
|
);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
SessionProbe::Unauthenticated => {
|
||||||
|
unauth_attempts += 1;
|
||||||
|
if unauth_attempts >= unauth_max_attempts {
|
||||||
|
return Err(anyhow!(
|
||||||
|
"session probe failed — #avatar_menu not present at {probe_url_for_msg} \
|
||||||
|
after {unauth_attempts} attempt(s); PHPSESSID is missing, \
|
||||||
|
expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
|
||||||
|
));
|
||||||
|
}
|
||||||
|
tracing::warn!(
|
||||||
|
attempt = unauth_attempts,
|
||||||
|
max_attempts = unauth_max_attempts,
|
||||||
|
"session probe Unauthenticated despite PHPSESSID; signaling TOR \
|
||||||
|
NEWNYM and retrying"
|
||||||
|
);
|
||||||
|
recircuit().await;
|
||||||
|
tokio::time::sleep(retry_delay).await;
|
||||||
|
}
|
||||||
|
SessionProbe::Transient => {
|
||||||
|
transient_attempts += 1;
|
||||||
|
if transient_attempts >= transient_max_attempts {
|
||||||
|
return Err(anyhow!(
|
||||||
|
"session probe failed — probe page at {probe_url_for_msg} returned \
|
||||||
|
a broken-page response after {transient_max_attempts} attempts. \
|
||||||
|
The site appears to be down or rate-limiting us; try again \
|
||||||
|
later before refreshing CRAWLER_PHPSESSID."
|
||||||
|
));
|
||||||
|
}
|
||||||
|
tracing::warn!(
|
||||||
|
attempt = transient_attempts,
|
||||||
|
max_attempts = transient_max_attempts,
|
||||||
|
"session probe got a transient page; recircuit + retry"
|
||||||
|
);
|
||||||
|
recircuit().await;
|
||||||
|
tokio::time::sleep(retry_delay).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<String> {
|
||||||
|
let page = browser
|
||||||
|
.new_page(probe_url)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("open probe page {probe_url}"))?;
|
||||||
|
crate::crawler::nav::wait_for_nav(&page)
|
||||||
|
.await
|
||||||
|
.context("wait for nav on probe")?;
|
||||||
|
// Best-effort wait for the layout marker. Timeout is fine — the
|
||||||
|
// probe classifier handles a missing `#logo` as Transient anyway,
|
||||||
|
// and the verify loop retries on Transient.
|
||||||
|
let _ = crate::crawler::nav::wait_for_selector(
|
||||||
|
&page,
|
||||||
|
"#logo",
|
||||||
|
crate::crawler::nav::SELECTOR_TIMEOUT,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let html = page.content().await.context("read probe html")?;
|
||||||
|
page.close().await.ok();
|
||||||
|
Ok(html)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // registrable_domain tests live in crawler::url_utils now —
    // it's the canonical home for that helper.

    // --- classify_probe -------------------------------------------------

    #[test]
    fn classify_probe_ok_when_logo_and_avatar_present() {
        let page = r#"<html><body>
            <header><div id="logo">Target</div><div id="avatar_menu"></div></header>
            </body></html>"#;
        assert_eq!(classify_probe(page), SessionProbe::Ok);
    }

    #[test]
    fn classify_probe_unauth_when_logo_present_but_avatar_absent() {
        // A real "logged out" response: the site layout renders fine but
        // there is no avatar widget. This is the only state that should
        // blame the session cookie.
        let page = r#"<html><body>
            <header><div id="logo">Target</div></header>
            <main>Please log in.</main>
            </body></html>"#;
        assert_eq!(classify_probe(page), SessionProbe::Unauthenticated);
    }

    #[test]
    fn classify_probe_transient_on_broken_page_body() {
        let page = "<html><body><p>we're sorry, the request file are not found.</p></body></html>";
        assert_eq!(classify_probe(page), SessionProbe::Transient);
    }

    #[test]
    fn classify_probe_transient_when_logo_missing() {
        // No broken-body marker, but no site layout either — treated as
        // transient (could be a Cloudflare interstitial, a 5xx page,
        // etc.) rather than blaming the session.
        let page = "<html><body><h1>Service Unavailable</h1></body></html>";
        assert_eq!(classify_probe(page), SessionProbe::Transient);
    }

    #[test]
    fn classify_probe_transient_on_empty_response() {
        assert_eq!(classify_probe(""), SessionProbe::Transient);
    }

    #[test]
    fn classify_probe_trusts_broken_body_over_stray_avatar_match() {
        // Defensive: if a broken-page body somehow contains an
        // #avatar_menu element (e.g. an unrelated debug page on the same
        // template), the body signature still wins.
        let page = r#"<html><body>
            <p>we're sorry, the request file are not found.</p>
            <div id="logo"></div>
            <div id="avatar_menu"></div>
            </body></html>"#;
        assert_eq!(classify_probe(page), SessionProbe::Transient);
    }

    // --- classify_chapter_probe -----------------------------------------

    #[test]
    fn classify_chapter_probe_ok_when_reader_rendered() {
        let page = r#"
            <html><body>
            <a id="pic_container">
            <img id="page1" src="https://cdn/1.jpg">
            </a>
            </body></html>
            "#;
        assert_eq!(classify_chapter_probe(page), ChapterProbe::Ok);
    }

    #[test]
    fn classify_chapter_probe_unauthenticated_when_no_reader_and_no_avatar() {
        // What a logged-out hit on a chapter URL renders: a normal site
        // layout (header etc.) with a "please log in" body, but no
        // reader and no avatar widget.
        let page = r#"
            <html><body>
            <header><div id="logo">Catalog</div></header>
            <main>Please log in to read this chapter.</main>
            </body></html>
            "#;
        let verdict = classify_chapter_probe(page);
        assert_eq!(verdict, ChapterProbe::Unauthenticated);
    }

    #[test]
    fn classify_chapter_probe_transient_when_logged_in_but_reader_missing() {
        // The avatar shows the session is still valid; the reader didn't
        // render — the site is serving an interstitial or the layout
        // momentarily shifted. Retry, don't blame the session.
        let page = r#"
            <html><body>
            <header><div id="logo">Catalog</div><div id="avatar_menu"></div></header>
            <main>Site maintenance — back in 5 minutes.</main>
            </body></html>
            "#;
        assert_eq!(classify_chapter_probe(page), ChapterProbe::Transient);
    }

    #[test]
    fn classify_chapter_probe_transient_on_broken_page_body() {
        let page = "<html><body><p>we're sorry, the request file are not found.</p></body></html>";
        assert_eq!(classify_chapter_probe(page), ChapterProbe::Transient);
    }

    #[test]
    fn classify_chapter_probe_does_not_misfire_on_avatar_alone_without_reader() {
        // Regression for the original bug: the binary
        // find_element("#avatar_menu") check treated "no avatar" as
        // session-expired even when a transient hiccup was the real
        // cause. classify_chapter_probe must NOT trip on that pattern
        // when pic_container *is* present.
        let page = r#"
            <html><body>
            <a id="pic_container">
            <img id="page1" src="https://cdn/1.jpg">
            </a>
            </body></html>
            "#;
        assert_eq!(classify_chapter_probe(page), ChapterProbe::Ok);
    }

    // --- run_session_probe_loop -----------------------------------------
    //
    // These tests exercise the recircuit-aware loop without a real
    // browser. The fetch and recircuit closures are mocks that bump
    // counters and hand back canned HTML.

    const OK_HTML: &str = r#"<html><body><div id="logo"></div><div id="avatar_menu"></div></body></html>"#;
    const UNAUTH_HTML: &str = r#"<html><body><div id="logo"></div></body></html>"#;
    const TRANSIENT_HTML: &str = "<html><body><p>we're sorry, the request file are not found.</p></body></html>";

    /// Probe URL handed to the loop; it only shows up in error messages.
    const PROBE_URL: &str = "https://example/probe";
    /// Zero retry delay so the tests never actually sleep.
    const NO_DELAY: Duration = Duration::ZERO;

    #[tokio::test]
    async fn probe_loop_ok_on_first_attempt_does_not_recircuit() {
        let mut fetch_count = 0u32;
        let mut recircuit_count = 0u32;
        let outcome = run_session_probe_loop(
            || {
                fetch_count += 1;
                async { Ok(OK_HTML.to_string()) }
            },
            || {
                recircuit_count += 1;
                async {}
            },
            3,
            3,
            NO_DELAY,
            PROBE_URL,
        )
        .await;
        outcome.expect("ok on first attempt");
        assert_eq!(fetch_count, 1);
        assert_eq!(recircuit_count, 0);
    }

    #[tokio::test]
    async fn probe_loop_unauth_then_ok_when_attempt_budget_available() {
        // Budget = 3 total attempts. Unauth on fetch 1, ok on fetch 2.
        let mut fetch_count = 0u32;
        let mut recircuit_count = 0u32;
        let outcome = run_session_probe_loop(
            || {
                fetch_count += 1;
                let canned = if fetch_count == 1 { UNAUTH_HTML } else { OK_HTML };
                async move { Ok(canned.to_string()) }
            },
            || {
                recircuit_count += 1;
                async {}
            },
            3,
            3,
            NO_DELAY,
            PROBE_URL,
        )
        .await;
        outcome.expect("recovers after one recircuit");
        assert_eq!(fetch_count, 2);
        assert_eq!(recircuit_count, 1);
    }

    #[tokio::test]
    async fn probe_loop_unauth_with_single_attempt_budget_fails_fast() {
        // Budget = 1 total attempt = no retry (matches no-TOR behavior).
        let mut fetch_count = 0u32;
        let mut recircuit_count = 0u32;
        let outcome = run_session_probe_loop(
            || {
                fetch_count += 1;
                async { Ok(UNAUTH_HTML.to_string()) }
            },
            || {
                recircuit_count += 1;
                async {}
            },
            3,
            1,
            NO_DELAY,
            PROBE_URL,
        )
        .await;
        let err = outcome.expect_err("budget=1 → fail-fast");
        assert_eq!(fetch_count, 1, "no retry when budget is 1");
        assert_eq!(recircuit_count, 0);
        let msg = format!("{err:#}");
        assert!(msg.contains("Refresh CRAWLER_PHPSESSID"), "msg: {msg}");
        assert!(msg.contains("after 1 attempt"), "expected attempt count in msg: {msg}");
    }

    #[tokio::test]
    async fn probe_loop_unauth_after_exhausting_budget_emits_attempt_count() {
        let mut fetch_count = 0u32;
        let mut recircuit_count = 0u32;
        let outcome = run_session_probe_loop(
            || {
                fetch_count += 1;
                async { Ok(UNAUTH_HTML.to_string()) }
            },
            || {
                recircuit_count += 1;
                async {}
            },
            10, // transient budget irrelevant here
            3,  // 3 attempts total, 2 recircuits between
            NO_DELAY,
            PROBE_URL,
        )
        .await;
        let err = outcome.expect_err("exhausts unauth budget");
        assert_eq!(fetch_count, 3);
        assert_eq!(recircuit_count, 2);
        let msg = format!("{err:#}");
        assert!(msg.contains("after 3 attempt"), "expected attempt count in error, got: {msg}");
    }

    #[tokio::test]
    async fn probe_loop_transient_repeats_until_max_then_errors() {
        let mut fetch_count = 0u32;
        let mut recircuit_count = 0u32;
        let outcome = run_session_probe_loop(
            || {
                fetch_count += 1;
                async { Ok(TRANSIENT_HTML.to_string()) }
            },
            || {
                recircuit_count += 1;
                async {}
            },
            3,
            1,
            NO_DELAY,
            PROBE_URL,
        )
        .await;
        let err = outcome.expect_err("transient until max → fail");
        assert_eq!(fetch_count, 3);
        // Recircuit fires between attempts: 3 attempts → 2 recircuits.
        assert_eq!(recircuit_count, 2);
        let msg = format!("{err:#}");
        assert!(msg.contains("broken-page response after 3 attempts"), "msg: {msg}");
    }

    #[tokio::test]
    async fn probe_loop_transient_then_ok_returns_ok_after_one_recircuit() {
        let mut fetch_count = 0u32;
        let mut recircuit_count = 0u32;
        let outcome = run_session_probe_loop(
            || {
                fetch_count += 1;
                let canned = if fetch_count == 1 { TRANSIENT_HTML } else { OK_HTML };
                async move { Ok(canned.to_string()) }
            },
            || {
                recircuit_count += 1;
                async {}
            },
            3,
            1,
            NO_DELAY,
            PROBE_URL,
        )
        .await;
        outcome.expect("ok on second try");
        assert_eq!(fetch_count, 2);
        assert_eq!(recircuit_count, 1);
    }

    #[tokio::test]
    async fn probe_loop_propagates_fetch_errors_immediately() {
        let mut fetch_count = 0u32;
        let outcome = run_session_probe_loop(
            || {
                fetch_count += 1;
                async { Err(anyhow!("nav timeout")) }
            },
            || async {},
            5,
            5,
            NO_DELAY,
            PROBE_URL,
        )
        .await;
        let err = outcome.expect_err("fetch error bubbles");
        assert_eq!(fetch_count, 1);
        assert!(format!("{err:#}").contains("nav timeout"));
    }
}
|
||||||
180
backend/src/crawler/session_control.rs
Normal file
180
backend/src/crawler/session_control.rs
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
//! Runtime-updatable crawler session (PHPSESSID).
|
||||||
|
//!
|
||||||
|
//! At startup the session comes from `CRAWLER_PHPSESSID`, but it expires
|
||||||
|
//! and previously needed a container restart to refresh. This controller
|
||||||
|
//! lets an admin push a fresh cookie at runtime: it rewrites the reqwest
|
||||||
|
//! cookie jar (CDN image fetches), updates the in-memory value the browser
|
||||||
|
//! `on_launch` hook reads, persists it to `crawler_state` (so it survives
|
||||||
|
//! a restart), and clears the sticky `session_expired` flag. A subsequent
|
||||||
|
//! coordinated browser restart re-runs `on_launch`, re-injecting the new
|
||||||
|
//! cookie into Chromium and re-probing.
|
||||||
|
|
||||||
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use anyhow::Context;
|
||||||
|
use serde_json::json;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tokio::sync::RwLock;
|
||||||
|
|
||||||
|
/// `crawler_state.key` under which the runtime-updated session is stored
/// (written by `persist`, read by `SessionController::load_persisted`).
const STATE_KEY_RUNTIME_SESSION: &str = "runtime_session";
|
||||||
|
|
||||||
|
pub struct SessionController {
|
||||||
|
/// Current PHPSESSID — what `on_launch` injects into a fresh browser.
|
||||||
|
phpsessid: RwLock<Option<String>>,
|
||||||
|
/// The same `Arc<Jar>` handed to the reqwest client; updating it here
|
||||||
|
/// updates the client's cookies (the jar is internally mutable).
|
||||||
|
cookie_jar: Arc<reqwest::cookie::Jar>,
|
||||||
|
cookie_domain: Option<String>,
|
||||||
|
start_url: Option<String>,
|
||||||
|
db: PgPool,
|
||||||
|
session_expired: Arc<AtomicBool>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SessionController {
|
||||||
|
pub fn new(
|
||||||
|
initial: Option<String>,
|
||||||
|
cookie_jar: Arc<reqwest::cookie::Jar>,
|
||||||
|
cookie_domain: Option<String>,
|
||||||
|
start_url: Option<String>,
|
||||||
|
db: PgPool,
|
||||||
|
session_expired: Arc<AtomicBool>,
|
||||||
|
) -> Arc<Self> {
|
||||||
|
Arc::new(Self {
|
||||||
|
phpsessid: RwLock::new(initial),
|
||||||
|
cookie_jar,
|
||||||
|
cookie_domain,
|
||||||
|
start_url,
|
||||||
|
db,
|
||||||
|
session_expired,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The PHPSESSID a fresh browser should inject (None when unset).
|
||||||
|
pub async fn current(&self) -> Option<String> {
|
||||||
|
self.phpsessid.read().await.clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether the sticky session-expired flag is set (chapter workers
|
||||||
|
/// idle while true).
|
||||||
|
pub fn is_expired(&self) -> bool {
|
||||||
|
self.session_expired.load(Ordering::Acquire)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clear the session-expired flag without changing the cookie — used
|
||||||
|
/// when the operator knows the session is fine and wants workers to
|
||||||
|
/// resume immediately.
|
||||||
|
pub fn clear_expired(&self) {
|
||||||
|
self.session_expired.store(false, Ordering::Release);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Update the session everywhere: reqwest jar, in-memory value, and
|
||||||
|
/// persisted `crawler_state`. Clears the session-expired flag. Does
|
||||||
|
/// NOT relaunch the browser — the caller triggers a coordinated
|
||||||
|
/// restart so `on_launch` re-injects + re-probes.
|
||||||
|
pub async fn update(&self, sid: &str) -> anyhow::Result<()> {
|
||||||
|
let sid = sid.trim().to_string();
|
||||||
|
anyhow::ensure!(!sid.is_empty(), "PHPSESSID must not be empty");
|
||||||
|
// The value is spliced into a cookie string and a CDP CookieParam.
|
||||||
|
// Reject control chars and cookie delimiters so a pasted value
|
||||||
|
// can't smuggle extra attributes / break out of the cookie.
|
||||||
|
anyhow::ensure!(
|
||||||
|
sid.chars().all(|c| !c.is_control() && c != ';' && c != ','),
|
||||||
|
"PHPSESSID contains invalid characters"
|
||||||
|
);
|
||||||
|
|
||||||
|
if let (Some(domain), Some(start_url)) = (&self.cookie_domain, &self.start_url) {
|
||||||
|
let cookie_str = format!("PHPSESSID={sid}; Domain={domain}; Path=/");
|
||||||
|
let seed_url =
|
||||||
|
reqwest::Url::parse(start_url).context("parse start_url for cookie seed")?;
|
||||||
|
self.cookie_jar.add_cookie_str(&cookie_str, &seed_url);
|
||||||
|
}
|
||||||
|
*self.phpsessid.write().await = Some(sid.clone());
|
||||||
|
persist(&self.db, &sid).await.context("persist runtime session")?;
|
||||||
|
self.session_expired.store(false, Ordering::Release);
|
||||||
|
tracing::info!("crawler session updated at runtime");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read a persisted runtime session (if any) from `crawler_state`.
|
||||||
|
/// Called at startup so a mid-day refresh survives a restart.
|
||||||
|
pub async fn load_persisted(db: &PgPool) -> Option<String> {
|
||||||
|
let row: Option<serde_json::Value> =
|
||||||
|
sqlx::query_scalar("SELECT value FROM crawler_state WHERE key = $1")
|
||||||
|
.bind(STATE_KEY_RUNTIME_SESSION)
|
||||||
|
.fetch_optional(db)
|
||||||
|
.await
|
||||||
|
.ok()
|
||||||
|
.flatten();
|
||||||
|
row.and_then(|v| {
|
||||||
|
v.get("phpsessid")
|
||||||
|
.and_then(|s| s.as_str())
|
||||||
|
.map(|s| s.to_string())
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn persist(db: &PgPool, sid: &str) -> sqlx::Result<()> {
|
||||||
|
sqlx::query(
|
||||||
|
"INSERT INTO crawler_state (key, value, updated_at) \
|
||||||
|
VALUES ($1, $2, now()) \
|
||||||
|
ON CONFLICT (key) DO UPDATE \
|
||||||
|
SET value = EXCLUDED.value, updated_at = now()",
|
||||||
|
)
|
||||||
|
.bind(STATE_KEY_RUNTIME_SESSION)
|
||||||
|
.bind(json!({ "phpsessid": sid }))
|
||||||
|
.execute(db)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Test fixture: no initial session, an empty cookie jar, the
    /// `example.com` domain / start URL, and the expired flag pre-set to
    /// `true`.
    fn controller(db: PgPool) -> Arc<SessionController> {
        let jar = Arc::new(reqwest::cookie::Jar::default());
        let expired = Arc::new(AtomicBool::new(true));
        SessionController::new(
            None,
            jar,
            Some("example.com".into()),
            Some("https://example.com/".into()),
            db,
            expired,
        )
    }

    #[sqlx::test(migrations = "./migrations")]
    async fn update_rejects_empty_and_control_chars(pool: PgPool) {
        let ctl = controller(pool);
        let rejected = [
            (" ", "empty rejected"),
            ("abc\r\ndef", "CRLF rejected"),
            ("ab;Domain=evil", "semicolon rejected"),
            ("x,y", "comma rejected"),
        ];
        for (input, why) in rejected {
            assert!(ctl.update(input).await.is_err(), "{why}");
        }
    }

    #[sqlx::test(migrations = "./migrations")]
    async fn update_persists_and_clears_expired_then_round_trips(pool: PgPool) {
        let ctl = controller(pool.clone());
        ctl.update("good-sid-123").await.unwrap();

        let in_memory = ctl.current().await;
        assert_eq!(in_memory.as_deref(), Some("good-sid-123"));
        assert!(!ctl.is_expired(), "update clears the expired flag");

        // Persisted to crawler_state and readable by a fresh load.
        let reloaded = SessionController::load_persisted(&pool).await;
        assert_eq!(reloaded.as_deref(), Some("good-sid-123"));
    }

    #[sqlx::test(migrations = "./migrations")]
    async fn clear_expired_flips_sticky_flag_without_touching_session(pool: PgPool) {
        // The flag starts `true` per the `controller` fixture's wiring.
        let ctl = controller(pool);
        assert!(ctl.is_expired(), "test fixture starts with the flag set");

        ctl.clear_expired();

        assert!(!ctl.is_expired(), "clear_expired flips the sticky flag to false");
        let session = ctl.current().await;
        assert!(session.is_none(), "clear_expired does not invent a session");
    }
}
|
||||||
128
backend/src/crawler/source.rs
Normal file
128
backend/src/crawler/source.rs
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
//! `Source` trait — the per-site abstraction.
|
||||||
|
//!
|
||||||
|
//! Job handlers depend on this trait, not on a concrete site. Adding a
|
||||||
|
//! new site is: implement `Source`, register it in a `sources` table
|
||||||
|
//! row, and the existing job pipeline picks it up unchanged.
|
||||||
|
|
||||||
|
pub mod target;
|
||||||
|
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use chromiumoxide::browser::Browser;
|
||||||
|
|
||||||
|
/// Reference to a manga as listed in a source's index, before its detail
/// page has been fetched.
#[derive(Debug, Clone)]
pub struct SourceMangaRef {
    /// Whatever stable id the source uses for the manga (slug, numeric
    /// id, etc).
    pub source_manga_key: String,
    /// Title as listed in the index.
    pub title: String,
    /// URL of the manga's page on the source.
    pub url: String,
}
|
||||||
|
|
||||||
|
/// Full metadata returned by `fetch_manga`. The hash is computed by the
|
||||||
|
/// source impl over the metadata-only field set (title through
|
||||||
|
/// cover_url) — chapter changes are tracked separately via
|
||||||
|
/// `chapter_sources`, so they intentionally do not affect
|
||||||
|
/// `metadata_hash`.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct SourceManga {
|
||||||
|
pub source_manga_key: String,
|
||||||
|
pub title: String,
|
||||||
|
pub alternative_titles: Vec<String>,
|
||||||
|
pub authors: Vec<String>,
|
||||||
|
pub genres: Vec<String>,
|
||||||
|
pub tags: Vec<String>,
|
||||||
|
pub status: Option<String>,
|
||||||
|
pub summary: Option<String>,
|
||||||
|
pub cover_url: Option<String>,
|
||||||
|
/// Chapters surfaced on the same page as the metadata. Sources
|
||||||
|
/// where the chapter list lives elsewhere can leave this empty
|
||||||
|
/// and supply it via `fetch_chapter_list` instead.
|
||||||
|
pub chapters: Vec<SourceChapterRef>,
|
||||||
|
pub metadata_hash: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reference to a single chapter of a manga. It is what
/// `Source::fetch_chapter_list` returns and what `Source::fetch_chapter`
/// takes as input.
#[derive(Debug, Clone)]
pub struct SourceChapterRef {
    /// Id of the chapter on the source — presumably stable, mirroring
    /// `source_manga_key`; confirm against the source impl.
    pub source_chapter_key: String,
    /// Chapter number.
    pub number: i32,
    /// Chapter title, when there is one.
    pub title: Option<String>,
    /// URL of the chapter on the source.
    pub url: String,
}
|
||||||
|
|
||||||
|
/// A fetched chapter: its identity plus the page images that make it up.
/// Returned by `Source::fetch_chapter`.
#[derive(Debug, Clone)]
pub struct SourceChapter {
    /// Id of the chapter on the source.
    pub source_chapter_key: String,
    /// Chapter number.
    pub number: i32,
    /// Chapter title, when there is one.
    pub title: Option<String>,
    /// Ordered list of page image URLs, ready to be fetched and put into
    /// `Storage`.
    pub page_urls: Vec<String>,
}
|
||||||
|
|
||||||
|
/// Context passed to every `Source` call. Carries the browser handle
|
||||||
|
/// plus the per-host rate-limiter map so impls that issue multiple
|
||||||
|
/// requests in one call (pagination walks, multi-page chapter image
|
||||||
|
/// fetches) honor the right budget for each origin.
|
||||||
|
pub struct FetchContext<'a> {
|
||||||
|
pub browser: &'a Browser,
|
||||||
|
pub rate: &'a crate::crawler::rate_limit::HostRateLimiters,
|
||||||
|
/// Optional TOR control-port client. When `Some`, retry helpers
|
||||||
|
/// signal `NEWNYM` between transient-page attempts so the next try
|
||||||
|
/// draws a fresh exit. `None` keeps pre-TOR behavior.
|
||||||
|
pub tor: Option<&'a crate::crawler::tor::TorController>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Lazy iterator over discovered manga refs. The caller drives the
|
||||||
|
/// walk one batch at a time, so it can break out as soon as the
|
||||||
|
/// downstream stop condition is met (the first manga where metadata is
|
||||||
|
/// `Unchanged` and chapter sync reports zero new chapters) without
|
||||||
|
/// paying for pages it won't use.
|
||||||
|
///
|
||||||
|
/// Batches are typically one source-index page each. Within a batch
|
||||||
|
/// refs are in the source's natural newest-first ordering — the same
|
||||||
|
/// `update_date DESC` sort that makes the stop condition meaningful.
|
||||||
|
#[async_trait]
|
||||||
|
pub trait DiscoverWalk: Send {
|
||||||
|
/// Return the next batch of refs, or `Ok(None)` when the source has
|
||||||
|
/// no more pages. The walker is single-use; calling `next_batch`
|
||||||
|
/// after `None` is allowed and continues to return `None`.
|
||||||
|
async fn next_batch(
|
||||||
|
&mut self,
|
||||||
|
ctx: &FetchContext<'_>,
|
||||||
|
) -> anyhow::Result<Option<Vec<SourceMangaRef>>>;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
pub trait Source: Send + Sync {
|
||||||
|
/// Stable identifier — also the row key in the `sources` table.
|
||||||
|
fn id(&self) -> &'static str;
|
||||||
|
|
||||||
|
/// Begin discovery. Returns a walker the caller drives page-by-page
|
||||||
|
/// via `next_batch`. The initial page-1 probe (used to determine
|
||||||
|
/// `last_page` and warm the cache for sites that can't be paged
|
||||||
|
/// without knowing the bound) happens inside this call, so a fresh
|
||||||
|
/// walker is ready to yield its first batch without further setup.
|
||||||
|
async fn discover(
|
||||||
|
&self,
|
||||||
|
ctx: &FetchContext<'_>,
|
||||||
|
) -> anyhow::Result<Box<dyn DiscoverWalk + Send>>;
|
||||||
|
|
||||||
|
async fn fetch_manga(
|
||||||
|
&self,
|
||||||
|
ctx: &FetchContext<'_>,
|
||||||
|
r: &SourceMangaRef,
|
||||||
|
) -> anyhow::Result<SourceManga>;
|
||||||
|
|
||||||
|
async fn fetch_chapter_list(
|
||||||
|
&self,
|
||||||
|
ctx: &FetchContext<'_>,
|
||||||
|
manga: &SourceManga,
|
||||||
|
) -> anyhow::Result<Vec<SourceChapterRef>>;
|
||||||
|
|
||||||
|
async fn fetch_chapter(
|
||||||
|
&self,
|
||||||
|
ctx: &FetchContext<'_>,
|
||||||
|
r: &SourceChapterRef,
|
||||||
|
) -> anyhow::Result<SourceChapter>;
|
||||||
|
}
|
||||||
1052
backend/src/crawler/source/target.rs
Normal file
1052
backend/src/crawler/source/target.rs
Normal file
File diff suppressed because it is too large
Load Diff
355
backend/src/crawler/status.rs
Normal file
355
backend/src/crawler/status.rs
Normal file
@@ -0,0 +1,355 @@
|
|||||||
|
//! Live, in-process crawler status.
|
||||||
|
//!
|
||||||
|
//! The metadata pass runs inline in the cron tick (it is not a
|
||||||
|
//! `crawler_jobs` row), so without this surface "what is the crawler doing
|
||||||
|
//! right now" is unanswerable from the dashboard. The daemon publishes its
|
||||||
|
//! current [`Phase`], the chapters being crawled right now (with a live
|
||||||
|
//! page count), and the cover being fetched into a shared [`StatusHandle`];
|
||||||
|
//! the admin endpoint reads a [`CrawlerStatus`] snapshot and composes it
|
||||||
|
//! with DB-derived counts + the session/browser flags.
|
||||||
|
//!
|
||||||
|
//! NOTE: this is per-process state. The deployment is a single server
|
||||||
|
//! (see CLAUDE.md), so an in-memory handle is sufficient; durable signals
|
||||||
|
//! (last-pass summary, runtime session) are persisted in `crawler_state`.
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use serde::Serialize;
|
||||||
|
use tokio::sync::{watch, RwLock};
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::crawler::pipeline::MetadataStats;
|
||||||
|
|
||||||
|
/// What the daemon's metadata pass is doing right now. Serialised with an
|
||||||
|
/// internal `state` tag so the frontend can switch on it.
|
||||||
|
#[derive(Clone, Debug, Serialize)]
|
||||||
|
#[serde(tag = "state", rename_all = "snake_case")]
|
||||||
|
pub enum Phase {
|
||||||
|
/// Sleeping until the next scheduled metadata pass.
|
||||||
|
Idle { next_fire: Option<DateTime<Utc>> },
|
||||||
|
/// Walking the source catalog list pages.
|
||||||
|
WalkingList,
|
||||||
|
/// Fetching one manga's metadata. `index`/`total` drive a progress bar
|
||||||
|
/// (`total` is `None` when the source size is unknown / uncapped).
|
||||||
|
FetchingMetadata {
|
||||||
|
index: usize,
|
||||||
|
total: Option<usize>,
|
||||||
|
title: String,
|
||||||
|
},
|
||||||
|
/// Backfilling covers that failed on first attempt. `index`/`total`
|
||||||
|
/// track progress through this tick's batch.
|
||||||
|
CoverBackfill { index: usize, total: usize },
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A chapter being downloaded right now, with a live page count. Keyed in
|
||||||
|
/// the status by `chapter_id`; inserted by the dispatcher when a job starts
|
||||||
|
/// and removed (via an RAII guard) when it finishes, panics, or times out.
|
||||||
|
#[derive(Clone, Debug, Serialize)]
|
||||||
|
pub struct ActiveChapter {
|
||||||
|
pub manga_id: Uuid,
|
||||||
|
pub manga_title: String,
|
||||||
|
pub chapter_id: Uuid,
|
||||||
|
pub chapter_number: i32,
|
||||||
|
pub pages_done: usize,
|
||||||
|
/// `None` until the chapter page list has been parsed.
|
||||||
|
pub pages_total: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The manga whose cover is being downloaded right now.
|
||||||
|
#[derive(Clone, Debug, Serialize)]
|
||||||
|
pub struct CoverTarget {
|
||||||
|
pub manga_id: Uuid,
|
||||||
|
pub manga_title: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Summary of the most recent metadata pass (persisted across restarts in
|
||||||
|
/// `crawler_state` by the cron; mirrored here for the live read).
|
||||||
|
#[derive(Clone, Debug, Serialize, Default)]
|
||||||
|
pub struct LastPass {
|
||||||
|
pub at: Option<DateTime<Utc>>,
|
||||||
|
pub discovered: usize,
|
||||||
|
pub upserted: usize,
|
||||||
|
pub covers_fetched: usize,
|
||||||
|
pub mangas_failed: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A point-in-time snapshot returned by [`StatusHandle::snapshot`]. The
|
||||||
|
/// session/browser/queue fields are composed at read time by the endpoint
|
||||||
|
/// (they live elsewhere), so they are not stored here.
|
||||||
|
#[derive(Clone, Debug, Serialize)]
|
||||||
|
pub struct CrawlerStatus {
|
||||||
|
pub phase: Phase,
|
||||||
|
/// Number of configured chapter workers (for "N busy / M workers").
|
||||||
|
pub worker_count: usize,
|
||||||
|
/// Chapters being downloaded right now, with live page counts.
|
||||||
|
pub active_chapters: Vec<ActiveChapter>,
|
||||||
|
pub last_pass: LastPass,
|
||||||
|
/// The cover being downloaded right now, if any.
|
||||||
|
pub current_cover: Option<CoverTarget>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Scalar status state held under the async `RwLock`. Active chapters live
|
||||||
|
/// in a separate sync map so per-page updates and RAII removal don't need
|
||||||
|
/// to `.await` (removal happens in `Drop`).
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
struct Scalar {
|
||||||
|
phase: Phase,
|
||||||
|
worker_count: usize,
|
||||||
|
last_pass: LastPass,
|
||||||
|
current_cover: Option<CoverTarget>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cloneable handle the daemon tasks use to publish status. Cheap to clone
|
||||||
|
/// (`Arc`). All writers funnel through the helper methods so locking stays
|
||||||
|
/// localised. Every mutation bumps a `watch` version so SSE subscribers
|
||||||
|
/// get pushed an update instead of polling.
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct StatusHandle {
|
||||||
|
scalar: Arc<RwLock<Scalar>>,
|
||||||
|
/// Currently-downloading chapters keyed by `chapter_id`. A sync mutex so
|
||||||
|
/// the RAII [`ChapterGuard`]'s `Drop` can remove without `.await`.
|
||||||
|
active: Arc<Mutex<HashMap<Uuid, ActiveChapter>>>,
|
||||||
|
/// Monotonic version bumped on every change. SSE handlers `subscribe()`
|
||||||
|
/// and `await .changed()` for instant pushes; `watch` has no
|
||||||
|
/// lost-wakeup so a change between snapshots is never missed.
|
||||||
|
version: Arc<watch::Sender<u64>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Lock the active map, recovering from a poisoned mutex (we never hold the
|
||||||
|
/// lock across a panic-prone section, so the data is still consistent).
|
||||||
|
fn lock_active(
|
||||||
|
m: &Mutex<HashMap<Uuid, ActiveChapter>>,
|
||||||
|
) -> std::sync::MutexGuard<'_, HashMap<Uuid, ActiveChapter>> {
|
||||||
|
m.lock().unwrap_or_else(|e| e.into_inner())
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StatusHandle {
|
||||||
|
pub fn new(num_workers: usize) -> Self {
|
||||||
|
let (version, _rx) = watch::channel(0u64);
|
||||||
|
Self {
|
||||||
|
scalar: Arc::new(RwLock::new(Scalar {
|
||||||
|
phase: Phase::Idle { next_fire: None },
|
||||||
|
worker_count: num_workers.max(1),
|
||||||
|
last_pass: LastPass::default(),
|
||||||
|
current_cover: None,
|
||||||
|
})),
|
||||||
|
active: Arc::new(Mutex::new(HashMap::new())),
|
||||||
|
version: Arc::new(version),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn bump(&self) {
|
||||||
|
self.version.send_modify(|v| *v = v.wrapping_add(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A receiver whose `.changed()` resolves on the next status change.
|
||||||
|
pub fn subscribe(&self) -> watch::Receiver<u64> {
|
||||||
|
self.version.subscribe()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Signal a change without mutating in-memory state — used when an
|
||||||
|
/// *external* signal the live snapshot reflects (browser phase,
|
||||||
|
/// session-expired flag, queue counts) has changed, so subscribers
|
||||||
|
/// recompose promptly.
|
||||||
|
pub fn poke(&self) {
|
||||||
|
self.bump();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn set_phase(&self, phase: Phase) {
|
||||||
|
self.scalar.write().await.phase = phase;
|
||||||
|
self.bump();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set (or clear) the cover being downloaded right now.
|
||||||
|
pub async fn set_current_cover(&self, cover: Option<CoverTarget>) {
|
||||||
|
self.scalar.write().await.current_cover = cover;
|
||||||
|
self.bump();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Register a chapter as crawling now; returns a guard that removes it
|
||||||
|
/// when dropped (on completion, panic-unwind, or timeout-drop).
|
||||||
|
pub fn begin_chapter(&self, chapter: ActiveChapter) -> ChapterGuard {
|
||||||
|
let id = chapter.chapter_id;
|
||||||
|
lock_active(&self.active).insert(id, chapter);
|
||||||
|
self.bump();
|
||||||
|
ChapterGuard {
|
||||||
|
active: Arc::clone(&self.active),
|
||||||
|
version: Arc::clone(&self.version),
|
||||||
|
chapter_id: id,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Update the live page count of an in-flight chapter. Sync (no
|
||||||
|
/// `.await`) so it's cheap to call once per stored page.
|
||||||
|
pub fn set_chapter_pages(&self, chapter_id: Uuid, done: usize, total: Option<usize>) {
|
||||||
|
{
|
||||||
|
let mut map = lock_active(&self.active);
|
||||||
|
if let Some(c) = map.get_mut(&chapter_id) {
|
||||||
|
c.pages_done = done;
|
||||||
|
c.pages_total = total;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.bump();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Record a finished metadata pass. Stamps `at` with `now`.
|
||||||
|
pub async fn record_pass(&self, stats: &MetadataStats, at: DateTime<Utc>) {
|
||||||
|
self.scalar.write().await.last_pass = LastPass {
|
||||||
|
at: Some(at),
|
||||||
|
discovered: stats.discovered,
|
||||||
|
upserted: stats.upserted,
|
||||||
|
covers_fetched: stats.covers_fetched,
|
||||||
|
mangas_failed: stats.mangas_failed,
|
||||||
|
};
|
||||||
|
self.bump();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Seed the last-pass summary from a persisted `crawler_state` value on
|
||||||
|
/// startup so the dashboard isn't blank until the first tick.
|
||||||
|
pub async fn set_last_pass(&self, last: LastPass) {
|
||||||
|
self.scalar.write().await.last_pass = last;
|
||||||
|
self.bump();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn snapshot(&self) -> CrawlerStatus {
|
||||||
|
let scalar = self.scalar.read().await.clone();
|
||||||
|
let mut active_chapters: Vec<ActiveChapter> =
|
||||||
|
lock_active(&self.active).values().cloned().collect();
|
||||||
|
// Stable, readable order: by chapter number then id.
|
||||||
|
active_chapters.sort_by(|a, b| {
|
||||||
|
a.chapter_number
|
||||||
|
.cmp(&b.chapter_number)
|
||||||
|
.then(a.chapter_id.cmp(&b.chapter_id))
|
||||||
|
});
|
||||||
|
CrawlerStatus {
|
||||||
|
phase: scalar.phase,
|
||||||
|
worker_count: scalar.worker_count,
|
||||||
|
active_chapters,
|
||||||
|
last_pass: scalar.last_pass,
|
||||||
|
current_cover: scalar.current_cover,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// RAII handle removing an [`ActiveChapter`] from the live status when the
|
||||||
|
/// chapter dispatch finishes, panics, or is dropped on timeout.
|
||||||
|
pub struct ChapterGuard {
|
||||||
|
active: Arc<Mutex<HashMap<Uuid, ActiveChapter>>>,
|
||||||
|
version: Arc<watch::Sender<u64>>,
|
||||||
|
chapter_id: Uuid,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for ChapterGuard {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
lock_active(&self.active).remove(&self.chapter_id);
|
||||||
|
self.version.send_modify(|v| *v = v.wrapping_add(1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an in-flight chapter with fresh random ids and the given
    /// chapter number; no pages done and no known total.
    fn sample_chapter(n: i32) -> ActiveChapter {
        ActiveChapter {
            manga_id: Uuid::new_v4(),
            manga_title: "M".into(),
            chapter_id: Uuid::new_v4(),
            chapter_number: n,
            pages_done: 0,
            pages_total: None,
        }
    }

    #[tokio::test]
    async fn begin_chapter_shows_in_snapshot_and_guard_removes_on_drop() {
        let h = StatusHandle::new(2);
        let chap = sample_chapter(7);
        let cid = chap.chapter_id;
        {
            let _guard = h.begin_chapter(chap);
            let snap = h.snapshot().await;
            assert_eq!(snap.active_chapters.len(), 1);
            assert_eq!(snap.active_chapters[0].chapter_id, cid);
            assert_eq!(snap.worker_count, 2);
        }
        // Guard dropped → entry removed.
        let snap = h.snapshot().await;
        assert!(snap.active_chapters.is_empty());
    }

    #[tokio::test]
    async fn set_chapter_pages_updates_live_count() {
        let h = StatusHandle::new(1);
        let chap = sample_chapter(1);
        let cid = chap.chapter_id;
        let _guard = h.begin_chapter(chap);
        h.set_chapter_pages(cid, 3, Some(20));
        let snap = h.snapshot().await;
        assert_eq!(snap.active_chapters[0].pages_done, 3);
        assert_eq!(snap.active_chapters[0].pages_total, Some(20));
        // Updating an unknown chapter is a no-op, not a panic — and it must
        // neither add an entry nor disturb the existing one.
        h.set_chapter_pages(Uuid::new_v4(), 9, Some(9));
        let snap = h.snapshot().await;
        assert_eq!(snap.active_chapters.len(), 1);
        assert_eq!(snap.active_chapters[0].chapter_id, cid);
        assert_eq!(snap.active_chapters[0].pages_done, 3);
        assert_eq!(snap.active_chapters[0].pages_total, Some(20));
    }

    #[tokio::test]
    async fn snapshot_sorts_active_chapters_by_number() {
        let h = StatusHandle::new(2);
        let _g1 = h.begin_chapter(sample_chapter(5));
        let _g2 = h.begin_chapter(sample_chapter(2));
        let snap = h.snapshot().await;
        assert_eq!(snap.active_chapters[0].chapter_number, 2);
        assert_eq!(snap.active_chapters[1].chapter_number, 5);
    }

    #[tokio::test]
    async fn set_current_cover_round_trips() {
        let h = StatusHandle::new(1);
        let mid = Uuid::new_v4();
        h.set_current_cover(Some(CoverTarget {
            manga_id: mid,
            manga_title: "One Piece".into(),
        }))
        .await;
        assert_eq!(
            h.snapshot().await.current_cover.map(|c| c.manga_id),
            Some(mid)
        );
        h.set_current_cover(None).await;
        assert!(h.snapshot().await.current_cover.is_none());
    }

    #[tokio::test]
    async fn record_pass_captures_stats_and_timestamp() {
        let h = StatusHandle::new(1);
        let stats = MetadataStats {
            discovered: 5,
            upserted: 3,
            covers_fetched: 2,
            mangas_failed: 1,
        };
        let at = Utc::now();
        h.record_pass(&stats, at).await;
        let snap = h.snapshot().await;
        // Every counter is checked so a mis-wired field cannot slip through.
        assert_eq!(snap.last_pass.discovered, 5);
        assert_eq!(snap.last_pass.upserted, 3);
        assert_eq!(snap.last_pass.covers_fetched, 2);
        assert_eq!(snap.last_pass.mangas_failed, 1);
        assert_eq!(snap.last_pass.at, Some(at));
    }

    #[tokio::test]
    async fn subscribe_resolves_on_mutation_poke_and_chapter_change() {
        let h = StatusHandle::new(1);
        let mut rx = h.subscribe();
        h.set_phase(Phase::WalkingList).await;
        rx.changed().await.unwrap();
        h.poke();
        rx.changed().await.unwrap();
        // begin_chapter + guard drop each bump the version.
        let g = h.begin_chapter(sample_chapter(1));
        rx.changed().await.unwrap();
        drop(g);
        rx.changed().await.unwrap();
    }
}
|
||||||
446
backend/src/crawler/tor.rs
Normal file
446
backend/src/crawler/tor.rs
Normal file
@@ -0,0 +1,446 @@
|
|||||||
|
//! TOR control-port client for `SIGNAL NEWNYM` ("recircuit").
|
||||||
|
//!
|
||||||
|
//! The crawler can be proxied through TOR (`CRAWLER_PROXY=socks5h://tor:9050`)
|
||||||
|
//! to randomize the exit IP seen by the target site. When the target
|
||||||
|
//! returns a "bad page" (its broken-template body, missing layout
|
||||||
|
//! sentinel, or unauthenticated probe despite a valid PHPSESSID), it
|
||||||
|
//! is often the current exit being rate-limited or fingerprinted rather
|
||||||
|
//! than a real failure. Asking the local TOR daemon for a new identity
|
||||||
|
//! over its control port (port 9051 by default) makes subsequent
|
||||||
|
//! connections draw a fresh circuit; combined with `IsolateDestAddr`
|
||||||
|
//! in torrc this is usually enough to clear the failure.
|
||||||
|
//!
|
||||||
|
//! Scope is deliberately tiny — `AUTHENTICATE` + `SIGNAL NEWNYM` over
|
||||||
|
//! a one-shot TCP connection. No `torut` dep, no hidden-service
|
||||||
|
//! plumbing, no event streaming.
|
||||||
|
//!
|
||||||
|
//! **Caveat for in-flight connections:** Chromium reuses sockets, so a
|
||||||
|
//! `NEWNYM` only affects *new* connections (in TOR terms, new circuits).
|
||||||
|
//! That's fine for our retry path — the next navigation opens a fresh
|
||||||
|
//! connection. We do not try to forcibly close existing streams.
|
||||||
|
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use anyhow::{anyhow, bail, Context};
|
||||||
|
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
|
||||||
|
use tokio::net::TcpStream;
|
||||||
|
use tokio::time::timeout;
|
||||||
|
|
||||||
|
/// Control port assumed when the configured address names no port
/// (`tor --defaults-torrc` ships 9051).
const DEFAULT_CONTROL_PORT: u16 = 9051;

/// Upper bound on establishing the control connection: generous enough
/// for a slow compose start, short enough that a misconfigured controller
/// doesn't stall a crawl.
const CONNECT_TIMEOUT: Duration = Duration::from_secs(5);

/// Upper bound on waiting for the reply to a single command.
/// `SIGNAL NEWNYM` returns instantly on the happy path; the bound exists so
/// a half-broken control port can't hang us.
const READ_TIMEOUT: Duration = Duration::from_secs(5);
|
||||||
|
|
||||||
|
/// Authentication method the controller uses on the control port.
///
/// * `Cookie` — preferred for compose deploys, where the auth cookie file
///   is shared between the `tor` and `backend` containers via a named
///   volume.
/// * `Password` — fallback for when the cookie file isn't reachable
///   (different gid, no shared volume, etc.).
/// * `None` — matches a torrc with no `CookieAuthentication 1` and no
///   `HashedControlPassword`; useful for local experimentation, not for
///   production.
///
/// `Debug` is written by hand so the password is redacted (the cookie
/// path is hidden too: non-sensitive, but uninteresting in logs). Do not
/// replace it with `#[derive(Debug)]` — the controller is `?`-logged at
/// startup and a derive would expand the password into the trace.
#[derive(Clone)]
pub enum TorAuth {
    /// No credentials.
    None,
    /// Control-port password.
    Password(String),
    /// Path of the auth cookie file.
    Cookie(PathBuf),
}

impl std::fmt::Debug for TorAuth {
    /// Print only the variant, never its payload.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            TorAuth::None => "None",
            TorAuth::Password(_) => "Password(<redacted>)",
            TorAuth::Cookie(_) => "Cookie(<path>)",
        };
        f.write_str(label)
    }
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct TorController {
|
||||||
|
/// `host:port` string. Kept as a string (not a `SocketAddr`) so
|
||||||
|
/// docker-compose hostnames like `tor:9051` resolve at connect time.
|
||||||
|
addr: String,
|
||||||
|
auth: TorAuth,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TorController {
|
||||||
|
pub fn new(addr: impl Into<String>, auth: TorAuth) -> Self {
|
||||||
|
Self { addr: addr.into(), auth }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build a controller from the env-config shape:
|
||||||
|
/// `url` (e.g. `tcp://tor:9051`, `127.0.0.1:9051`, or `tor`),
|
||||||
|
/// optional password, optional cookie path. Returns `Ok(None)` when
|
||||||
|
/// `url` is absent — that's the "TOR feature disabled" signal.
|
||||||
|
/// Cookie wins over password when both are set (rotates with TOR;
|
||||||
|
/// no secret to manage).
|
||||||
|
pub fn from_parts(
|
||||||
|
url: Option<&str>,
|
||||||
|
password: Option<&str>,
|
||||||
|
cookie_path: Option<&Path>,
|
||||||
|
) -> anyhow::Result<Option<Self>> {
|
||||||
|
let Some(url) = url else { return Ok(None) };
|
||||||
|
let addr = parse_control_url(url)?;
|
||||||
|
let auth = match (cookie_path, password) {
|
||||||
|
(Some(p), _) => TorAuth::Cookie(p.to_path_buf()),
|
||||||
|
(None, Some(p)) => TorAuth::Password(p.to_string()),
|
||||||
|
(None, None) => TorAuth::None,
|
||||||
|
};
|
||||||
|
Ok(Some(Self { addr, auth }))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Open the control port, `AUTHENTICATE`, `SIGNAL NEWNYM`, `QUIT`.
|
||||||
|
/// Each invocation is a fresh connection; the controller is cheap
|
||||||
|
/// to clone and stateless across calls.
|
||||||
|
pub async fn new_identity(&self) -> anyhow::Result<()> {
|
||||||
|
let stream = timeout(CONNECT_TIMEOUT, TcpStream::connect(&self.addr))
|
||||||
|
.await
|
||||||
|
.with_context(|| {
|
||||||
|
format!("timed out connecting to TOR control port {}", self.addr)
|
||||||
|
})?
|
||||||
|
.with_context(|| format!("connect to TOR control port {}", self.addr))?;
|
||||||
|
let (read, mut write) = stream.into_split();
|
||||||
|
let mut read = BufReader::new(read);
|
||||||
|
|
||||||
|
let auth_line = self.build_auth_line().await?;
|
||||||
|
write_line(&mut write, &auth_line).await?;
|
||||||
|
timeout(READ_TIMEOUT, expect_250(&mut read))
|
||||||
|
.await
|
||||||
|
.map_err(|_| anyhow!("TOR control AUTHENTICATE timed out"))?
|
||||||
|
.context("AUTHENTICATE")?;
|
||||||
|
|
||||||
|
write_line(&mut write, "SIGNAL NEWNYM").await?;
|
||||||
|
timeout(READ_TIMEOUT, expect_250(&mut read))
|
||||||
|
.await
|
||||||
|
.map_err(|_| anyhow!("TOR control SIGNAL NEWNYM timed out"))?
|
||||||
|
.context("SIGNAL NEWNYM")?;
|
||||||
|
|
||||||
|
// QUIT is courtesy; ignore errors — the daemon may close the
|
||||||
|
// socket before our QUIT lands and that's perfectly fine.
|
||||||
|
let _ = write_line(&mut write, "QUIT").await;
|
||||||
|
// Debug-level: a busy crawl can rotate circuits many times per
|
||||||
|
// minute, INFO is too chatty. Failures still log at WARN.
|
||||||
|
tracing::debug!(addr = %self.addr, "TOR NEWNYM signaled");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn build_auth_line(&self) -> anyhow::Result<String> {
|
||||||
|
match &self.auth {
|
||||||
|
TorAuth::None => Ok("AUTHENTICATE".to_string()),
|
||||||
|
TorAuth::Password(p) => Ok(format!("AUTHENTICATE \"{}\"", escape_quoted(p))),
|
||||||
|
TorAuth::Cookie(path) => {
|
||||||
|
let bytes = tokio::fs::read(path)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("read TOR cookie file {}", path.display()))?;
|
||||||
|
Ok(format!("AUTHENTICATE {}", hex_encode(&bytes)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse `tcp://host:port`, `host:port`, or bare `host` into a
|
||||||
|
/// connect-time string. Default port is [`DEFAULT_CONTROL_PORT`].
|
||||||
|
fn parse_control_url(url: &str) -> anyhow::Result<String> {
|
||||||
|
let stripped = url.strip_prefix("tcp://").unwrap_or(url);
|
||||||
|
if stripped.is_empty() {
|
||||||
|
bail!("TOR control url is empty");
|
||||||
|
}
|
||||||
|
if stripped.contains(':') {
|
||||||
|
Ok(stripped.to_string())
|
||||||
|
} else {
|
||||||
|
Ok(format!("{stripped}:{DEFAULT_CONTROL_PORT}"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Escape `s` for use inside a double-quoted control-port string: every
/// backslash and every double quote is prefixed with a backslash. All other
/// characters are copied through unchanged.
fn escape_quoted(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        if matches!(ch, '\\' | '"') {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
|
||||||
|
|
||||||
|
/// Encode `bytes` as lowercase hexadecimal, two digits per byte.
///
/// Digits come from a lookup table and are pushed straight into the
/// pre-sized output, so no temporary `String` is allocated per byte.
fn hex_encode(bytes: &[u8]) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";
    let mut s = String::with_capacity(bytes.len() * 2);
    for &b in bytes {
        // High nibble first, then low nibble.
        s.push(char::from(HEX_DIGITS[usize::from(b >> 4)]));
        s.push(char::from(HEX_DIGITS[usize::from(b & 0x0f)]));
    }
    s
}
|
||||||
|
|
||||||
|
async fn write_line<W: tokio::io::AsyncWrite + Unpin>(
|
||||||
|
w: &mut W,
|
||||||
|
line: &str,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
w.write_all(line.as_bytes()).await?;
|
||||||
|
w.write_all(b"\r\n").await?;
|
||||||
|
w.flush().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Drain a TOR control reply, accepting only status `250`. Handles
|
||||||
|
/// the protocol's three line forms: `XYZ ...` (single/end), `XYZ-...`
|
||||||
|
/// (continuation), `XYZ+...` (data block ended by a lone `.`). Our
|
||||||
|
/// commands only ever produce single-line `250 OK`, but we honor the
|
||||||
|
/// continuation forms so a future torrc that adds events / banners
|
||||||
|
/// doesn't confuse the parser.
|
||||||
|
async fn expect_250<R: AsyncBufReadExt + Unpin>(r: &mut R) -> anyhow::Result<()> {
|
||||||
|
loop {
|
||||||
|
let mut line = String::new();
|
||||||
|
let n = r.read_line(&mut line).await?;
|
||||||
|
if n == 0 {
|
||||||
|
bail!("TOR control port closed connection mid-reply");
|
||||||
|
}
|
||||||
|
let trimmed = line.trim_end_matches(['\r', '\n']);
|
||||||
|
if trimmed.len() < 4 {
|
||||||
|
bail!("malformed TOR control reply: {trimmed:?}");
|
||||||
|
}
|
||||||
|
let (code, rest) = trimmed.split_at(3);
|
||||||
|
if code != "250" {
|
||||||
|
bail!("TOR control replied {trimmed:?}");
|
||||||
|
}
|
||||||
|
let sep = rest.as_bytes()[0];
|
||||||
|
match sep {
|
||||||
|
b' ' => return Ok(()),
|
||||||
|
b'-' => continue,
|
||||||
|
b'+' => {
|
||||||
|
// Data block — read until a line consisting of only ".".
|
||||||
|
loop {
|
||||||
|
let mut data = String::new();
|
||||||
|
let n = r.read_line(&mut data).await?;
|
||||||
|
if n == 0 {
|
||||||
|
bail!("TOR control port closed mid-data-block");
|
||||||
|
}
|
||||||
|
if data.trim_end_matches(['\r', '\n']) == "." {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => bail!("malformed TOR control reply separator: {trimmed:?}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
use tokio::io::AsyncWriteExt;
|
||||||
|
use tokio::net::TcpListener;
|
||||||
|
|
||||||
|
/// Spawn a mock control port that responds to each \r\n-terminated
|
||||||
|
/// inbound line with the next entry from `replies`. Each reply has
|
||||||
|
/// its own `\r\n` appended. Records received lines into `recorder`.
|
||||||
|
/// After `replies.len()` exchanges the task drops the socket — this
|
||||||
|
/// matches the real TOR behavior for QUIT (close after acking).
|
||||||
|
async fn spawn_mock(
|
||||||
|
replies: Vec<&'static str>,
|
||||||
|
recorder: Arc<Mutex<Vec<String>>>,
|
||||||
|
) -> String {
|
||||||
|
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||||
|
let addr = listener.local_addr().unwrap().to_string();
|
||||||
|
tokio::spawn(async move {
|
||||||
|
let (sock, _) = listener.accept().await.unwrap();
|
||||||
|
let (r, mut w) = sock.into_split();
|
||||||
|
let mut r = BufReader::new(r);
|
||||||
|
for reply in replies {
|
||||||
|
let mut line = String::new();
|
||||||
|
let n = r.read_line(&mut line).await.unwrap_or(0);
|
||||||
|
if n == 0 {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
recorder
|
||||||
|
.lock()
|
||||||
|
.unwrap()
|
||||||
|
.push(line.trim_end_matches(['\r', '\n']).to_string());
|
||||||
|
w.write_all(reply.as_bytes()).await.unwrap();
|
||||||
|
w.write_all(b"\r\n").await.unwrap();
|
||||||
|
w.flush().await.unwrap();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
addr
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn password_auth_then_newnym_writes_expected_sequence() {
|
||||||
|
let recorder = Arc::new(Mutex::new(Vec::new()));
|
||||||
|
// Two replies: AUTHENTICATE then SIGNAL NEWNYM. QUIT is
|
||||||
|
// fire-and-forget; the mock dropping the socket is the
|
||||||
|
// expected real-world behavior.
|
||||||
|
let addr =
|
||||||
|
spawn_mock(vec!["250 OK", "250 OK"], Arc::clone(&recorder)).await;
|
||||||
|
let controller = TorController::new(addr, TorAuth::Password("secret".into()));
|
||||||
|
controller.new_identity().await.expect("new_identity ok");
|
||||||
|
let recorded = recorder.lock().unwrap().clone();
|
||||||
|
assert_eq!(recorded.first().map(String::as_str), Some("AUTHENTICATE \"secret\""));
|
||||||
|
assert_eq!(recorded.get(1).map(String::as_str), Some("SIGNAL NEWNYM"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn cookie_auth_hex_encodes_file_bytes() {
|
||||||
|
let tmp = tempfile::NamedTempFile::new().unwrap();
|
||||||
|
let cookie: Vec<u8> = (0u8..32).collect();
|
||||||
|
std::fs::write(tmp.path(), &cookie).unwrap();
|
||||||
|
let recorder = Arc::new(Mutex::new(Vec::new()));
|
||||||
|
let addr =
|
||||||
|
spawn_mock(vec!["250 OK", "250 OK"], Arc::clone(&recorder)).await;
|
||||||
|
let controller =
|
||||||
|
TorController::new(addr, TorAuth::Cookie(tmp.path().to_path_buf()));
|
||||||
|
controller.new_identity().await.expect("new_identity ok");
|
||||||
|
let recorded = recorder.lock().unwrap().clone();
|
||||||
|
let expected_hex: String = cookie.iter().map(|b| format!("{b:02x}")).collect();
|
||||||
|
assert_eq!(
|
||||||
|
recorded.first().map(String::as_str),
|
||||||
|
Some(format!("AUTHENTICATE {expected_hex}").as_str())
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn no_auth_sends_bare_authenticate() {
|
||||||
|
let recorder = Arc::new(Mutex::new(Vec::new()));
|
||||||
|
let addr =
|
||||||
|
spawn_mock(vec!["250 OK", "250 OK"], Arc::clone(&recorder)).await;
|
||||||
|
let controller = TorController::new(addr, TorAuth::None);
|
||||||
|
controller.new_identity().await.expect("new_identity ok");
|
||||||
|
let recorded = recorder.lock().unwrap().clone();
|
||||||
|
assert_eq!(recorded.first().map(String::as_str), Some("AUTHENTICATE"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn non_250_reply_returns_err_with_reply_text() {
|
||||||
|
let recorder = Arc::new(Mutex::new(Vec::new()));
|
||||||
|
let addr = spawn_mock(
|
||||||
|
vec!["515 Bad authentication"],
|
||||||
|
Arc::clone(&recorder),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let controller =
|
||||||
|
TorController::new(addr, TorAuth::Password("wrong".into()));
|
||||||
|
let err = controller.new_identity().await.expect_err("should fail");
|
||||||
|
let msg = format!("{err:#}");
|
||||||
|
assert!(msg.contains("515"), "expected 515 in error, got: {msg}");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn closed_connection_mid_reply_is_an_error() {
|
||||||
|
// Listener accepts the AUTH line then drops without replying —
|
||||||
|
// this exercises the EOF-mid-reply path in expect_250 (rather
|
||||||
|
// than tor's own error replies which are covered elsewhere).
|
||||||
|
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||||
|
let addr = listener.local_addr().unwrap().to_string();
|
||||||
|
tokio::spawn(async move {
|
||||||
|
if let Ok((sock, _)) = listener.accept().await {
|
||||||
|
let (r, _w) = sock.into_split();
|
||||||
|
let mut r = BufReader::new(r);
|
||||||
|
let mut line = String::new();
|
||||||
|
let _ = r.read_line(&mut line).await; // read AUTH, ignore
|
||||||
|
// Drop _w (and the read half via scope exit) so the
|
||||||
|
// peer sees an immediate EOF on the next read.
|
||||||
|
}
|
||||||
|
});
|
||||||
|
let controller = TorController::new(addr, TorAuth::None);
|
||||||
|
let err = controller.new_identity().await.expect_err("should fail");
|
||||||
|
let msg = format!("{err:#}");
|
||||||
|
assert!(
|
||||||
|
msg.contains("closed connection"),
|
||||||
|
"expected EOF-mid-reply error, got: {msg}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn multi_line_250_continuation_is_accepted() {
|
||||||
|
let recorder = Arc::new(Mutex::new(Vec::new()));
|
||||||
|
// AUTHENTICATE reply uses the `250-...\r\n250 OK\r\n` form.
|
||||||
|
// Single reply string contains the whole multi-line response.
|
||||||
|
let addr = spawn_mock(
|
||||||
|
vec!["250-banner=foo\r\n250 OK", "250 OK"],
|
||||||
|
Arc::clone(&recorder),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let controller = TorController::new(addr, TorAuth::None);
|
||||||
|
controller.new_identity().await.expect("new_identity ok");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
fn from_parts_returns_none_when_url_unset() {
    // With every part absent the call succeeds but builds no controller.
    let built = TorController::from_parts(None, None, None).unwrap();
    assert!(built.is_none());
}
|
||||||
|
|
||||||
|
#[test]
fn from_parts_prefers_cookie_over_password() {
    // Both a password and a cookie path are supplied; the cookie must win.
    let cookie_path = Path::new("/var/lib/tor/control_auth_cookie");
    let built = TorController::from_parts(Some("tor:9051"), Some("pw"), Some(cookie_path));
    let controller = built.unwrap().expect("controller built");
    assert!(matches!(controller.auth, TorAuth::Cookie(_)));
}
|
||||||
|
|
||||||
|
#[test]
fn from_parts_falls_back_to_password_without_cookie() {
    // No cookie path is given, so the supplied password is used as-is.
    let built = TorController::from_parts(Some("tor:9051"), Some("pw"), None);
    let controller = built.unwrap().expect("controller built");
    assert!(matches!(controller.auth, TorAuth::Password(p) if p == "pw"));
}
|
||||||
|
|
||||||
|
#[test]
fn parse_control_url_accepts_tcp_scheme() {
    // A `tcp://` prefix is stripped, leaving `host:port`.
    let addr = parse_control_url("tcp://127.0.0.1:9051").unwrap();
    assert_eq!(addr, "127.0.0.1:9051");
}
|
||||||
|
|
||||||
|
#[test]
fn parse_control_url_defaults_port_when_omitted() {
    // A bare host gets port 9051 appended.
    let addr = parse_control_url("tor").unwrap();
    assert_eq!(addr, "tor:9051");
}
|
||||||
|
|
||||||
|
#[test]
fn parse_control_url_passes_through_host_port() {
    // An explicit port is kept untouched.
    let addr = parse_control_url("tor:9999").unwrap();
    assert_eq!(addr, "tor:9999");
}
|
||||||
|
|
||||||
|
#[test]
fn parse_control_url_rejects_empty() {
    // Both the empty string and a scheme with nothing after it are errors.
    for input in ["", "tcp://"] {
        assert!(parse_control_url(input).is_err());
    }
}
|
||||||
|
|
||||||
|
#[test]
fn escape_quoted_handles_quotes_and_backslashes() {
    // `"` becomes `\"` and `\` becomes `\\`.
    let escaped = escape_quoted(r#"a"b\c"#);
    assert_eq!(escaped, r#"a\"b\\c"#);
}
|
||||||
|
|
||||||
|
#[test]
fn debug_format_redacts_password_and_cookie_path() {
    // Regression: app.rs / bin/crawler.rs log the controller at startup
    // via `tracing::info!(?t, ...)`. A derived Debug on TorAuth would
    // expand TorAuth::Password(p) and leak the plaintext into logs.
    {
        let auth = TorAuth::Password("super-secret".into());
        let c = TorController::new("tor:9051", auth);
        let dbg = format!("{c:?}");
        assert!(!dbg.contains("super-secret"), "password leaked: {dbg}");
        assert!(dbg.contains("<redacted>"), "expected <redacted>, got: {dbg}");
    }

    // The cookie file location must not appear in the output either.
    {
        let auth = TorAuth::Cookie("/var/lib/tor/control_auth_cookie".into());
        let c = TorController::new("tor:9051", auth);
        let dbg = format!("{c:?}");
        assert!(!dbg.contains("control_auth_cookie"), "cookie path leaked: {dbg}");
    }
}
|
||||||
|
|
||||||
|
#[test]
fn hex_encode_zero_pads_low_bytes() {
    // Bytes below 0x10 must still produce two hex digits each.
    let encoded = hex_encode(&[0x00, 0x0f, 0xff]);
    assert_eq!(encoded, "000fff");
}
|
||||||
|
}
|
||||||
244
backend/src/crawler/url_utils.rs
Normal file
244
backend/src/crawler/url_utils.rs
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
//! Centralised URL helpers for the crawler subsystem.
|
||||||
|
//!
|
||||||
|
//! Three near-identical hand-rolled URL parsers used to live in
|
||||||
|
//! `crawler::session`, `crawler::rate_limit`, and `crawler::pipeline`
|
||||||
|
//! respectively, each with subtly different edge-case behaviour
|
||||||
|
//! around port handling and IPv6 literals. They're consolidated here
|
||||||
|
//! so the divergence can't drift again.
|
||||||
|
//!
|
||||||
|
//! The hand-rolled implementations are kept intentionally — they
|
||||||
|
//! preserve the exact semantics every existing test pins. A future
|
||||||
|
//! refactor can switch to `reqwest::Url` if it can be done without
|
||||||
|
//! changing those semantics.
|
||||||
|
|
||||||
|
/// Lowercased host (no port, no userinfo). Returns `None` for inputs
/// without a `scheme://host` shape or with an empty host. Used by the
/// per-host rate limiter as its bucket key.
///
/// The authority component ends at the first `/`, `?` or `#` after the
/// scheme separator, so a path-less URL such as
/// `https://example.com?q=a:b` yields `example.com` rather than having
/// the query string bleed into the host. A leading `user:pass@`
/// userinfo section is dropped for the same reason: without that,
/// `https://user:pw@example.com/` would be keyed as `user`.
///
/// IPv6 literals are kept in their `[::1]` bracketed form so the
/// `rsplit_once(':')` port-stripping logic doesn't split inside the
/// address (e.g. `https://[::1]/foo` used to return `"[:"` because the
/// rightmost `:` is inside the literal). Buckets keyed by `[::1]` vs
/// `::1` are still uniquely-per-host; the brackets are cosmetic.
pub fn host_of(url: &str) -> Option<String> {
    let after_scheme = url.split_once("://")?.1;

    // Authority = everything up to the first path, query or fragment
    // delimiter (or the whole remainder when none is present).
    let authority_end = after_scheme
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(after_scheme.len());
    let authority = &after_scheme[..authority_end];

    // Userinfo, if any, ends at the last `@` of the authority.
    let host_with_port = authority.rsplit_once('@').map_or(authority, |(_, h)| h);

    let host = if host_with_port.starts_with('[') {
        // IPv6 literal: keep through the closing bracket. There may be
        // a trailing `:port` after `]`; strip only that.
        match host_with_port.rfind(']') {
            Some(end) => &host_with_port[..=end],
            None => host_with_port,
        }
    } else {
        // Hostnames and IPv4 literals: a trailing `:port` (if any)
        // follows the last `:`.
        host_with_port
            .rsplit_once(':')
            .map_or(host_with_port, |(h, _)| h)
    };

    (!host.is_empty()).then(|| host.to_ascii_lowercase())
}
|
||||||
|
|
||||||
|
/// `scheme://authority` of `url`: the path, query string and fragment
/// are dropped, while the port (if any) is kept as written. Used by the
/// metadata pass to seed `sources.base_url` from `CRAWLER_START_URL`.
///
/// Returns `None` when `url` contains no `://` separator.
pub fn origin_of(url: &str) -> Option<String> {
    let (scheme, rest) = url.split_once("://")?;
    // The authority ends at the first `/`, `?` or `#`. Cutting only at
    // `/` would let the query of a path-less URL such as
    // `https://example.com?q=1` leak into the origin.
    let authority_end = rest
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(rest.len());
    let authority = &rest[..authority_end];
    Some(format!("{scheme}://{authority}"))
}
|
||||||
|
|
||||||
|
/// Approximate registrable-domain calculation: take the last two
/// dot-labels of the host, prefix with `.`. Used to set a parent-domain
/// cookie so the catalog's `www.` / `m.` redirects don't drop the
/// cookie mid-crawl.
///
/// Caveat: wrong for multi-part TLDs (`.co.uk`, `.com.br`). The
/// operator can override via `CRAWLER_COOKIE_DOMAIN`; pulling in the
/// Public Suffix List for one knob isn't worth it yet.
///
/// Special cases:
///
/// * Bare hostnames (e.g. `localhost`) are returned as-is, with no
///   leading dot — setting `.localhost` as a cookie domain is invalid.
/// * IP literals are returned unchanged (lowercased): bracketed IPv6
///   (`[::1]`, `[::ffff:192.0.2.1]`) and dotted-quad IPv4
///   (`192.168.1.10`). An IP address has no parent domain, so applying
///   the last-two-labels rule to it would fabricate a meaningless value
///   such as `.1.10`.
///
/// The host is taken from the authority component, which ends at the
/// first `/`, `?` or `#`; a leading `user:pass@` userinfo section and a
/// trailing `:port` are ignored. Returns `None` when `url` has no
/// `://` separator or the host is empty.
pub fn registrable_domain(url: &str) -> Option<String> {
    let after_scheme = url.split_once("://")?.1;

    // Authority = everything up to the first path, query or fragment
    // delimiter (or the whole remainder when none is present).
    let authority_end = after_scheme
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(after_scheme.len());
    let authority = &after_scheme[..authority_end];

    // Userinfo, if any, ends at the last `@` of the authority.
    let host_with_port = authority.rsplit_once('@').map_or(authority, |(_, h)| h);

    if host_with_port.starts_with('[') {
        // IPv6 literal: keep through the closing bracket; an optional
        // `:port` follows `]`. Returned whole, never label-split.
        let literal = match host_with_port.rfind(']') {
            Some(end) => &host_with_port[..=end],
            None => host_with_port,
        };
        return Some(literal.to_ascii_lowercase());
    }

    let host = host_with_port
        .rsplit_once(':')
        .map_or(host_with_port, |(h, _)| h)
        .to_ascii_lowercase();
    if host.is_empty() {
        return None;
    }

    // IPv4 literal: dots separate octets, not DNS labels.
    if host.parse::<std::net::Ipv4Addr>().is_ok() {
        return Some(host);
    }

    let labels: Vec<&str> = host.split('.').filter(|l| !l.is_empty()).collect();
    if labels.len() < 2 {
        return Some(host);
    }
    let registrable = &labels[labels.len() - 2..];
    Some(format!(".{}", registrable.join(".")))
}
|
||||||
|
|
||||||
|
/// Rewrites a proxy URL into a form Chromium's `--proxy-server=` flag
/// accepts.
///
/// reqwest understands both `socks5://` (resolve locally) and
/// `socks5h://` (resolve via the SOCKS server — important when the
/// proxy is TOR and the host's resolver must not see the target
/// hostname). Chromium does **not** know the `socks5h` scheme and
/// refuses navigations with `ERR_NO_SUPPORTED_PROXIES`. It already
/// sends destination hostnames over SOCKS5 by default, so dropping the
/// `h` is a pure scheme rename — the remote-DNS behaviour is preserved.
///
/// Any input that does not start with `socks5h://` is returned
/// unchanged.
pub fn chromium_proxy_arg(proxy: &str) -> String {
    match proxy.strip_prefix("socks5h://") {
        Some(host_and_port) => ["socks5://", host_and_port].concat(),
        None => proxy.to_owned(),
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn host_of_strips_port_and_lowercases() {
        let mixed_case_with_port = host_of("https://CDN.Example.com:443/x");
        assert_eq!(mixed_case_with_port.as_deref(), Some("cdn.example.com"));

        let no_port = host_of("http://localhost/");
        assert_eq!(no_port.as_deref(), Some("localhost"));

        assert!(host_of("not a url").is_none());
    }

    #[test]
    fn host_of_keeps_bracketed_ipv6_literal_intact() {
        // Regression: the old impl rsplit_once(':')'d the IPv6 address,
        // returning "[:" instead of "[::1]". A real IPv6 source would
        // silently get a wrong rate-limit bucket key.
        let cases = [
            ("https://[::1]/", "[::1]"),
            ("https://[::1]:8080/", "[::1]"),
            ("https://[2001:db8::1]/foo", "[2001:db8::1]"),
            ("https://[2001:db8::1]:443/foo", "[2001:db8::1]"),
        ];
        for (url, want) in cases {
            assert_eq!(host_of(url).as_deref(), Some(want), "host_of({url:?})");
        }
    }

    #[test]
    fn origin_of_returns_scheme_and_host() {
        let origin = origin_of("https://example.com/some/path?q=1");
        assert_eq!(origin.as_deref(), Some("https://example.com"));
        assert!(origin_of("garbage").is_none());
    }

    #[test]
    fn registrable_domain_strips_subdomain() {
        let cases = [
            ("https://www.target-site.com/manga/foo/", ".target-site.com"),
            ("https://m.example.org", ".example.org"),
        ];
        for (url, want) in cases {
            assert_eq!(
                registrable_domain(url).as_deref(),
                Some(want),
                "registrable_domain({url:?})"
            );
        }
    }

    #[test]
    fn registrable_domain_keeps_two_label_host() {
        let domain = registrable_domain("https://example.com/");
        assert_eq!(domain.as_deref(), Some(".example.com"));
    }

    #[test]
    fn registrable_domain_handles_port() {
        let domain = registrable_domain("http://www.foo.bar:8080/x");
        assert_eq!(domain.as_deref(), Some(".foo.bar"));
    }

    #[test]
    fn registrable_domain_bare_hostname_no_leading_dot() {
        let domain = registrable_domain("http://localhost:5173");
        assert_eq!(domain.as_deref(), Some("localhost"));
    }

    #[test]
    fn registrable_domain_returns_none_for_garbage() {
        assert_eq!(registrable_domain("not a url"), None);
    }

    #[test]
    fn registrable_domain_keeps_bracketed_ipv6_literal_intact() {
        // Symmetric with host_of's IPv6 fix. The cookie-domain code
        // won't accept an IP as a `Domain` value, but the function
        // should at least return a sensible representation rather than
        // the truncated `"[:"` the old port-stripper produced.
        let cases = [
            ("https://[::1]/", "[::1]"),
            ("https://[::1]:8080/", "[::1]"),
            ("https://[2001:db8::1]/foo", "[2001:db8::1]"),
        ];
        for (url, want) in cases {
            assert_eq!(
                registrable_domain(url).as_deref(),
                Some(want),
                "registrable_domain({url:?})"
            );
        }
    }

    #[test]
    fn chromium_proxy_arg_strips_socks5h_to_socks5() {
        // Regression: passing socks5h:// to Chromium yields
        // ERR_NO_SUPPORTED_PROXIES at navigation time.
        let cases = [
            ("socks5h://127.0.0.1:9050", "socks5://127.0.0.1:9050"),
            ("socks5h://tor:9050", "socks5://tor:9050"),
        ];
        for (proxy, want) in cases {
            assert_eq!(chromium_proxy_arg(proxy), want, "proxy: {proxy}");
        }
    }

    #[test]
    fn chromium_proxy_arg_passes_socks5_unchanged() {
        let proxy = "socks5://127.0.0.1:9050";
        assert_eq!(chromium_proxy_arg(proxy), proxy);
    }

    #[test]
    fn chromium_proxy_arg_passes_non_socks_unchanged() {
        let proxy = "http://proxy.example:8080";
        assert_eq!(chromium_proxy_arg(proxy), proxy);
    }
}
|
||||||
15
backend/src/domain/admin_audit.rs
Normal file
15
backend/src/domain/admin_audit.rs
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use serde::Serialize;
|
||||||
|
use sqlx::FromRow;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, FromRow)]
|
||||||
|
pub struct AdminAuditEntry {
|
||||||
|
pub id: Uuid,
|
||||||
|
pub actor_user_id: Option<Uuid>,
|
||||||
|
pub action: String,
|
||||||
|
pub target_kind: String,
|
||||||
|
pub target_id: Option<Uuid>,
|
||||||
|
pub payload: serde_json::Value,
|
||||||
|
pub at: DateTime<Utc>,
|
||||||
|
}
|
||||||
50
backend/src/domain/collection.rs
Normal file
50
backend/src/domain/collection.rs
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use sqlx::FromRow;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use super::patch::Patch;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
|
||||||
|
pub struct Collection {
|
||||||
|
pub id: Uuid,
|
||||||
|
pub user_id: Uuid,
|
||||||
|
pub name: String,
|
||||||
|
pub description: Option<String>,
|
||||||
|
pub created_at: DateTime<Utc>,
|
||||||
|
pub updated_at: DateTime<Utc>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Shape returned by `GET /me/collections`. Enriched with the manga
|
||||||
|
/// count and up to three sample cover paths so a collection card can
|
||||||
|
/// render without extra round-trips.
|
||||||
|
#[derive(Debug, Clone, Serialize, FromRow)]
|
||||||
|
pub struct CollectionSummary {
|
||||||
|
pub id: Uuid,
|
||||||
|
pub user_id: Uuid,
|
||||||
|
pub name: String,
|
||||||
|
pub description: Option<String>,
|
||||||
|
pub created_at: DateTime<Utc>,
|
||||||
|
pub updated_at: DateTime<Utc>,
|
||||||
|
pub manga_count: i64,
|
||||||
|
/// Cover image keys of up to three sample mangas (newest-added
|
||||||
|
/// first). `Vec<String>` rather than `Option<...>` so an empty
|
||||||
|
/// collection renders as `[]` rather than `null`.
|
||||||
|
pub sample_covers: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Deserialize)]
|
||||||
|
pub struct NewCollection {
|
||||||
|
pub name: String,
|
||||||
|
#[serde(default)]
|
||||||
|
pub description: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Deserialize, Default)]
|
||||||
|
pub struct CollectionPatch {
|
||||||
|
pub name: Option<String>,
|
||||||
|
/// Three-state: missing key leaves description alone; explicit
|
||||||
|
/// `null` clears it; a string sets it. See `Patch`.
|
||||||
|
#[serde(default)]
|
||||||
|
pub description: Patch<String>,
|
||||||
|
}
|
||||||
@@ -5,6 +5,7 @@ use uuid::Uuid;
|
|||||||
|
|
||||||
use super::author::AuthorRef;
|
use super::author::AuthorRef;
|
||||||
use super::genre::GenreRef;
|
use super::genre::GenreRef;
|
||||||
|
use super::patch::Patch;
|
||||||
use super::tag::TagRef;
|
use super::tag::TagRef;
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
|
#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
|
||||||
@@ -73,82 +74,6 @@ pub struct MangaPatch {
|
|||||||
pub genre_ids: Option<Vec<Uuid>>,
|
pub genre_ids: Option<Vec<Uuid>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Three-state container for nullable PATCH fields.
|
// `Patch<T>` lives in `super::patch` so other resources (collections,
|
||||||
///
|
// future PATCH endpoints) can reuse the same three-state semantics
|
||||||
/// `serde`'s default behaviour collapses both "field missing" and
|
// without re-importing through `manga::`.
|
||||||
/// "field is `null`" to `Option::None`, which means an `Option<T>`
|
|
||||||
/// patch field can't distinguish "leave alone" from "set to NULL".
|
|
||||||
/// `Patch<T>` carries that distinction by deserializing JSON `null`
|
|
||||||
/// into `Clear` and any value into `Set`; with `#[serde(default)]` on
|
|
||||||
/// the field, a missing key falls through to `Unchanged`.
|
|
||||||
#[derive(Debug, Clone, Default, PartialEq, Eq)]
|
|
||||||
pub enum Patch<T> {
|
|
||||||
/// Field absent from the request — leave the column untouched.
|
|
||||||
#[default]
|
|
||||||
Unchanged,
|
|
||||||
/// Field present and explicitly `null` — set the column to NULL.
|
|
||||||
Clear,
|
|
||||||
/// Field present with a value — set the column to that value.
|
|
||||||
Set(T),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T> Patch<T> {
|
|
||||||
/// Whether the request indicated this field should be written
|
|
||||||
/// (either to a new value or to NULL).
|
|
||||||
pub fn is_provided(&self) -> bool {
|
|
||||||
!matches!(self, Patch::Unchanged)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The value to bind when writing, or `None` for `Unchanged`/`Clear`.
|
|
||||||
pub fn set_value(&self) -> Option<&T> {
|
|
||||||
match self {
|
|
||||||
Patch::Set(v) => Some(v),
|
|
||||||
_ => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'de, T> serde::Deserialize<'de> for Patch<T>
|
|
||||||
where
|
|
||||||
T: serde::Deserialize<'de>,
|
|
||||||
{
|
|
||||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
|
||||||
where
|
|
||||||
D: serde::Deserializer<'de>,
|
|
||||||
{
|
|
||||||
Option::<T>::deserialize(deserializer).map(|opt| match opt {
|
|
||||||
Some(v) => Patch::Set(v),
|
|
||||||
None => Patch::Clear,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::*;
|
|
||||||
use serde_json::json;
|
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
struct Holder {
|
|
||||||
#[serde(default)]
|
|
||||||
desc: Patch<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn missing_key_is_unchanged() {
|
|
||||||
let h: Holder = serde_json::from_value(json!({})).unwrap();
|
|
||||||
assert_eq!(h.desc, Patch::Unchanged);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn explicit_null_is_clear() {
|
|
||||||
let h: Holder = serde_json::from_value(json!({ "desc": null })).unwrap();
|
|
||||||
assert_eq!(h.desc, Patch::Clear);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn value_is_set() {
|
|
||||||
let h: Holder = serde_json::from_value(json!({ "desc": "x" })).unwrap();
|
|
||||||
assert_eq!(h.desc, Patch::Set("x".into()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,23 +1,35 @@
|
|||||||
|
pub mod admin_audit;
|
||||||
pub mod api_token;
|
pub mod api_token;
|
||||||
pub mod author;
|
pub mod author;
|
||||||
pub mod bookmark;
|
pub mod bookmark;
|
||||||
pub mod chapter;
|
pub mod chapter;
|
||||||
|
pub mod collection;
|
||||||
pub mod genre;
|
pub mod genre;
|
||||||
pub mod manga;
|
pub mod manga;
|
||||||
pub mod page;
|
pub mod page;
|
||||||
|
pub mod patch;
|
||||||
|
pub mod read_progress;
|
||||||
pub mod session;
|
pub mod session;
|
||||||
|
pub mod sync_state;
|
||||||
pub mod tag;
|
pub mod tag;
|
||||||
|
pub mod upload_entry;
|
||||||
pub mod user;
|
pub mod user;
|
||||||
pub mod user_preferences;
|
pub mod user_preferences;
|
||||||
|
|
||||||
|
pub use admin_audit::AdminAuditEntry;
|
||||||
pub use api_token::ApiToken;
|
pub use api_token::ApiToken;
|
||||||
pub use author::{Author, AuthorRef, AuthorWithCount};
|
pub use author::{Author, AuthorRef, AuthorWithCount};
|
||||||
pub use bookmark::{Bookmark, BookmarkSummary};
|
pub use bookmark::{Bookmark, BookmarkSummary};
|
||||||
pub use chapter::Chapter;
|
pub use chapter::Chapter;
|
||||||
|
pub use collection::{Collection, CollectionSummary};
|
||||||
pub use genre::{Genre, GenreRef};
|
pub use genre::{Genre, GenreRef};
|
||||||
pub use manga::{Manga, MangaCard, MangaDetail};
|
pub use manga::{Manga, MangaCard, MangaDetail};
|
||||||
pub use page::Page;
|
pub use page::Page;
|
||||||
|
pub use patch::Patch;
|
||||||
|
pub use read_progress::{ReadProgress, ReadProgressForManga, ReadProgressSummary};
|
||||||
pub use session::Session;
|
pub use session::Session;
|
||||||
|
pub use sync_state::{ChapterSyncState, MangaSyncState};
|
||||||
pub use tag::{Tag, TagRef};
|
pub use tag::{Tag, TagRef};
|
||||||
|
pub use upload_entry::UploadEntry;
|
||||||
pub use user::User;
|
pub use user::User;
|
||||||
pub use user_preferences::UserPreferences;
|
pub use user_preferences::UserPreferences;
|
||||||
|
|||||||
81
backend/src/domain/patch.rs
Normal file
81
backend/src/domain/patch.rs
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
//! Three-state container for PATCH fields.
|
||||||
|
//!
|
||||||
|
//! `serde`'s default behaviour collapses both "field missing" and
|
||||||
|
//! "field is `null`" to `Option::None`, which means an `Option<T>`
|
||||||
|
//! patch field can't distinguish "leave alone" from "set to NULL".
|
||||||
|
//! `Patch<T>` carries that distinction by deserializing JSON `null`
|
||||||
|
//! into `Clear` and any value into `Set`; with `#[serde(default)]`
|
||||||
|
//! on the field, a missing key falls through to `Unchanged`.
|
||||||
|
|
||||||
|
/// Three-state value of a PATCH field: left out, explicitly nulled, or
/// given a new value.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub enum Patch<T> {
    /// The request did not mention the field — keep the column as it is.
    #[default]
    Unchanged,
    /// The request carried an explicit `null` — write NULL to the column.
    Clear,
    /// The request carried a value — write that value to the column.
    Set(T),
}

impl<T> Patch<T> {
    /// Returns `true` when the request asked for this field to be
    /// written, whether to a new value (`Set`) or to NULL (`Clear`).
    pub fn is_provided(&self) -> bool {
        match self {
            Patch::Unchanged => false,
            Patch::Clear | Patch::Set(_) => true,
        }
    }

    /// Borrows the value to bind when writing. Only `Set` carries one;
    /// `Unchanged` and `Clear` both yield `None`.
    pub fn set_value(&self) -> Option<&T> {
        if let Patch::Set(value) = self {
            Some(value)
        } else {
            None
        }
    }
}
|
||||||
|
|
||||||
|
impl<'de, T> serde::Deserialize<'de> for Patch<T>
|
||||||
|
where
|
||||||
|
T: serde::Deserialize<'de>,
|
||||||
|
{
|
||||||
|
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||||
|
where
|
||||||
|
D: serde::Deserializer<'de>,
|
||||||
|
{
|
||||||
|
Option::<T>::deserialize(deserializer).map(|opt| match opt {
|
||||||
|
Some(v) => Patch::Set(v),
|
||||||
|
None => Patch::Clear,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use serde::Deserialize;
    use serde_json::json;

    /// Minimal wrapper with one defaulted `Patch` field, mirroring how
    /// patch structs declare theirs.
    #[derive(Deserialize)]
    struct Holder {
        #[serde(default)]
        desc: Patch<String>,
    }

    /// Deserialises `body` as a `Holder` and hands back its `desc`.
    fn parse_desc(body: serde_json::Value) -> Patch<String> {
        let holder: Holder = serde_json::from_value(body).unwrap();
        holder.desc
    }

    #[test]
    fn missing_key_is_unchanged() {
        assert_eq!(parse_desc(json!({})), Patch::Unchanged);
    }

    #[test]
    fn explicit_null_is_clear() {
        assert_eq!(parse_desc(json!({ "desc": null })), Patch::Clear);
    }

    #[test]
    fn value_is_set() {
        assert_eq!(parse_desc(json!({ "desc": "x" })), Patch::Set("x".into()));
    }
}
|
||||||
50
backend/src/domain/read_progress.rs
Normal file
50
backend/src/domain/read_progress.rs
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use sqlx::FromRow;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
|
||||||
|
pub struct ReadProgress {
|
||||||
|
pub user_id: Uuid,
|
||||||
|
pub manga_id: Uuid,
|
||||||
|
pub chapter_id: Option<Uuid>,
|
||||||
|
pub page: i32,
|
||||||
|
pub updated_at: DateTime<Utc>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enriched row for the history view — joins in the manga's title and
|
||||||
|
/// cover plus the chapter number (when the chapter still exists) so a
|
||||||
|
/// card can render without extra round-trips.
|
||||||
|
#[derive(Debug, Clone, Serialize, FromRow)]
|
||||||
|
pub struct ReadProgressSummary {
|
||||||
|
pub manga_id: Uuid,
|
||||||
|
pub manga_title: String,
|
||||||
|
pub manga_cover_image_path: Option<String>,
|
||||||
|
pub chapter_id: Option<Uuid>,
|
||||||
|
/// `None` when the chapter was deleted after this row was written
|
||||||
|
/// (FK ON DELETE SET NULL on `chapter_id`).
|
||||||
|
pub chapter_number: Option<i32>,
|
||||||
|
pub page: i32,
|
||||||
|
pub updated_at: DateTime<Utc>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returned by `GET /me/read-progress/:manga_id`. Same shape as
|
||||||
|
/// `ReadProgressSummary` minus the manga title/cover (the caller
|
||||||
|
/// already knows them — they're on the manga detail page). Crucially
|
||||||
|
/// includes `chapter_number` so the "Continue reading" CTA can render
|
||||||
|
/// without resolving the chapter id against a paged chapters list.
|
||||||
|
#[derive(Debug, Clone, Serialize, FromRow)]
|
||||||
|
pub struct ReadProgressForManga {
|
||||||
|
pub manga_id: Uuid,
|
||||||
|
pub chapter_id: Option<Uuid>,
|
||||||
|
pub chapter_number: Option<i32>,
|
||||||
|
pub page: i32,
|
||||||
|
pub updated_at: DateTime<Utc>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Deserialize)]
|
||||||
|
pub struct UpsertReadProgress {
|
||||||
|
pub manga_id: Uuid,
|
||||||
|
pub chapter_id: Option<Uuid>,
|
||||||
|
pub page: Option<i32>,
|
||||||
|
}
|
||||||
48
backend/src/domain/sync_state.rs
Normal file
48
backend/src/domain/sync_state.rs
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
//! Sync-state enums derived per-manga / per-chapter from `manga_sources`,
|
||||||
|
//! `chapter_sources`, and `crawler_jobs` at query time. No state column
|
||||||
|
//! is persisted on `mangas` / `chapters` — see `repo::admin_view` for the
|
||||||
|
//! derivation rules and priority order.
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, sqlx::Type)]
|
||||||
|
#[sqlx(type_name = "text", rename_all = "snake_case")]
|
||||||
|
#[serde(rename_all = "snake_case")]
|
||||||
|
pub enum MangaSyncState {
|
||||||
|
/// A `sync_manga` or `sync_chapter_list` job is currently
|
||||||
|
/// pending or running for this manga.
|
||||||
|
InProgress,
|
||||||
|
/// At least one `manga_sources` row exists for this manga and ALL of
|
||||||
|
/// them have `dropped_at IS NOT NULL` — every source we know about
|
||||||
|
/// has stopped surfacing it.
|
||||||
|
Dropped,
|
||||||
|
/// Default healthy state: at least one live source row OR the manga
|
||||||
|
/// was user-uploaded (no `manga_sources` rows at all).
|
||||||
|
Synced,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, sqlx::Type)]
|
||||||
|
#[sqlx(type_name = "text", rename_all = "snake_case")]
|
||||||
|
#[serde(rename_all = "snake_case")]
|
||||||
|
pub enum ChapterSyncState {
|
||||||
|
/// A `sync_chapter_content` job is currently pending or running for
|
||||||
|
/// this chapter (the 0014 dedup index guarantees at most one).
|
||||||
|
Downloading,
|
||||||
|
/// At least one `chapter_sources` row exists AND all of them are
|
||||||
|
/// `dropped_at IS NOT NULL`.
|
||||||
|
Dropped,
|
||||||
|
/// `page_count = 0` AND a `dead` `sync_chapter_content` job exists
|
||||||
|
/// for this chapter. Checked BEFORE `NotDownloaded` so the more
|
||||||
|
/// informative "we tried and it died" state wins over "we never
|
||||||
|
/// got around to it". Does NOT fire when `page_count > 0`, because
|
||||||
|
/// pages on disk mean the chapter IS synced regardless of historical
|
||||||
|
/// job failures — see the priority comment in `repo::admin_view`.
|
||||||
|
Failed,
|
||||||
|
/// `page_count = 0` and no in-flight or failed job — the chapter
|
||||||
|
/// row exists but content has never been downloaded.
|
||||||
|
NotDownloaded,
|
||||||
|
/// `page_count > 0` — content has been downloaded at some point.
|
||||||
|
/// Reaped `done` jobs in `crawler_jobs` mean we can't read this from
|
||||||
|
/// the job table, so `page_count` is the durable truth.
|
||||||
|
Synced,
|
||||||
|
}
|
||||||
40
backend/src/domain/upload_entry.rs
Normal file
40
backend/src/domain/upload_entry.rs
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use serde::Serialize;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use super::chapter::Chapter;
|
||||||
|
use super::manga::Manga;
|
||||||
|
|
||||||
|
/// Tagged union used by `GET /me/uploads` to interleave manga + chapter
|
||||||
|
/// rows chronologically. Serialised as `{ "kind": "...", ... }` so a
|
||||||
|
/// TypeScript discriminated union can pattern-match on `kind`.
|
||||||
|
#[derive(Debug, Clone, Serialize)]
|
||||||
|
#[serde(tag = "kind", rename_all = "snake_case")]
|
||||||
|
pub enum UploadEntry {
|
||||||
|
Manga {
|
||||||
|
manga: Manga,
|
||||||
|
/// Mirrored from `manga.created_at` for ordering convenience;
|
||||||
|
/// the frontend reads this to display the timestamp in a
|
||||||
|
/// kind-agnostic column.
|
||||||
|
created_at: DateTime<Utc>,
|
||||||
|
},
|
||||||
|
Chapter {
|
||||||
|
manga_id: Uuid,
|
||||||
|
manga_title: String,
|
||||||
|
manga_cover_image_path: Option<String>,
|
||||||
|
chapter: Chapter,
|
||||||
|
created_at: DateTime<Utc>,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
impl UploadEntry {
|
||||||
|
/// Timestamp used for chronological ordering. The repo sorts on
|
||||||
|
/// the underlying column server-side; this is here for callers
|
||||||
|
/// that need to merge or page in Rust.
|
||||||
|
pub fn created_at(&self) -> DateTime<Utc> {
|
||||||
|
match self {
|
||||||
|
UploadEntry::Manga { created_at, .. } => *created_at,
|
||||||
|
UploadEntry::Chapter { created_at, .. } => *created_at,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -10,4 +10,5 @@ pub struct User {
|
|||||||
#[serde(skip)]
|
#[serde(skip)]
|
||||||
pub password_hash: String,
|
pub password_hash: String,
|
||||||
pub created_at: DateTime<Utc>,
|
pub created_at: DateTime<Utc>,
|
||||||
|
pub is_admin: bool,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -21,6 +21,16 @@ pub enum AppError {
|
|||||||
PayloadTooLarge(String),
|
PayloadTooLarge(String),
|
||||||
#[error("unsupported media type: {0}")]
|
#[error("unsupported media type: {0}")]
|
||||||
UnsupportedMediaType(String),
|
UnsupportedMediaType(String),
|
||||||
|
/// 503 — a feature is currently unavailable, distinct from a 5xx
|
||||||
|
/// internal error. Used when admin actions require the crawler
|
||||||
|
/// daemon but it's been disabled (`CRAWLER_DAEMON=false`).
|
||||||
|
#[error("service unavailable: {0}")]
|
||||||
|
ServiceUnavailable(String),
|
||||||
|
/// 429 with an optional `Retry-After` header value (in seconds).
|
||||||
|
#[error("too many requests")]
|
||||||
|
TooManyRequests {
|
||||||
|
retry_after_secs: Option<u64>,
|
||||||
|
},
|
||||||
/// Semantic per-field validation failure. `details` is rendered into the
|
/// Semantic per-field validation failure. `details` is rendered into the
|
||||||
/// envelope so the client can highlight the bad field(s).
|
/// envelope so the client can highlight the bad field(s).
|
||||||
#[error("validation failed")]
|
#[error("validation failed")]
|
||||||
@@ -51,6 +61,8 @@ impl AppError {
|
|||||||
AppError::Conflict(_) => "conflict",
|
AppError::Conflict(_) => "conflict",
|
||||||
AppError::PayloadTooLarge(_) => "payload_too_large",
|
AppError::PayloadTooLarge(_) => "payload_too_large",
|
||||||
AppError::UnsupportedMediaType(_) => "unsupported_media_type",
|
AppError::UnsupportedMediaType(_) => "unsupported_media_type",
|
||||||
|
AppError::ServiceUnavailable(_) => "service_unavailable",
|
||||||
|
AppError::TooManyRequests { .. } => "too_many_requests",
|
||||||
AppError::ValidationFailed { .. } => "validation_failed",
|
AppError::ValidationFailed { .. } => "validation_failed",
|
||||||
AppError::Database(sqlx::Error::RowNotFound) => "not_found",
|
AppError::Database(sqlx::Error::RowNotFound) => "not_found",
|
||||||
AppError::Database(_) => "internal_error",
|
AppError::Database(_) => "internal_error",
|
||||||
@@ -79,6 +91,34 @@ impl IntoResponse for AppError {
|
|||||||
AppError::UnsupportedMediaType(msg) => {
|
AppError::UnsupportedMediaType(msg) => {
|
||||||
(StatusCode::UNSUPPORTED_MEDIA_TYPE, msg.clone(), None)
|
(StatusCode::UNSUPPORTED_MEDIA_TYPE, msg.clone(), None)
|
||||||
}
|
}
|
||||||
|
AppError::ServiceUnavailable(msg) => {
|
||||||
|
(StatusCode::SERVICE_UNAVAILABLE, msg.clone(), None)
|
||||||
|
}
|
||||||
|
AppError::TooManyRequests { retry_after_secs } => {
|
||||||
|
// Emit `Retry-After: N` (RFC 6585 §4) so a well-behaved
|
||||||
|
// client can back off correctly. Done by building the
|
||||||
|
// response by hand below — the `(status, headers,
|
||||||
|
// body)` tuple shape doesn't fit the standard
|
||||||
|
// `(status, body)` IntoResponse path for the other
|
||||||
|
// variants.
|
||||||
|
let body = json!({
|
||||||
|
"error": {
|
||||||
|
"code": code,
|
||||||
|
"message": "too many requests; slow down",
|
||||||
|
}
|
||||||
|
});
|
||||||
|
let mut resp = (StatusCode::TOO_MANY_REQUESTS, Json(body)).into_response();
|
||||||
|
if let Some(secs) = retry_after_secs {
|
||||||
|
// `HeaderValue: From<u64>` skips both the
|
||||||
|
// intermediate `String` allocation and the
|
||||||
|
// fallible-by-shape `from_str` path.
|
||||||
|
resp.headers_mut().insert(
|
||||||
|
axum::http::header::RETRY_AFTER,
|
||||||
|
axum::http::HeaderValue::from(*secs),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
return resp;
|
||||||
|
}
|
||||||
AppError::ValidationFailed { message, details } => (
|
AppError::ValidationFailed { message, details } => (
|
||||||
StatusCode::UNPROCESSABLE_ENTITY,
|
StatusCode::UNPROCESSABLE_ENTITY,
|
||||||
message.clone(),
|
message.clone(),
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ pub mod api;
|
|||||||
pub mod app;
|
pub mod app;
|
||||||
pub mod auth;
|
pub mod auth;
|
||||||
pub mod config;
|
pub mod config;
|
||||||
|
pub mod crawler;
|
||||||
pub mod domain;
|
pub mod domain;
|
||||||
pub mod error;
|
pub mod error;
|
||||||
pub mod repo;
|
pub mod repo;
|
||||||
|
|||||||
@@ -1,21 +1,77 @@
|
|||||||
use std::net::SocketAddr;
|
use std::net::SocketAddr;
|
||||||
|
use std::time::Duration;
|
||||||
use tracing_subscriber::EnvFilter;
|
use tracing_subscriber::EnvFilter;
|
||||||
|
|
||||||
|
/// Upper bound on how long we're willing to wait for the crawler daemon
|
||||||
|
/// to drain before letting `main` return. Without it a wedged background
|
||||||
|
/// task (e.g. a chromiumoxide handler stuck on a dead WS) blocks the
|
||||||
|
/// process from exiting after Ctrl-C / SIGTERM.
|
||||||
|
const CRAWLER_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(5);
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> anyhow::Result<()> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
dotenvy::dotenv().ok();
|
dotenvy::dotenv().ok();
|
||||||
tracing_subscriber::fmt()
|
tracing_subscriber::fmt()
|
||||||
.with_env_filter(
|
.with_env_filter(
|
||||||
EnvFilter::try_from_default_env().unwrap_or_else(|_| "info,mangalord=debug".into()),
|
EnvFilter::try_from_default_env().unwrap_or_else(|_| {
|
||||||
|
"info,mangalord=debug,chromiumoxide::conn=off,chromiumoxide::handler=off".into()
|
||||||
|
}),
|
||||||
)
|
)
|
||||||
.init();
|
.init();
|
||||||
|
|
||||||
let config = mangalord::config::Config::from_env()?;
|
let config = mangalord::config::Config::from_env()?;
|
||||||
let addr: SocketAddr = config.bind_address.parse()?;
|
let addr: SocketAddr = config.bind_address.parse()?;
|
||||||
let app = mangalord::app::build(config).await?;
|
let mangalord::app::AppHandle { router, daemon } = mangalord::app::build(config).await?;
|
||||||
|
|
||||||
tracing::info!(%addr, "mangalord listening");
|
tracing::info!(%addr, "mangalord listening");
|
||||||
let listener = tokio::net::TcpListener::bind(addr).await?;
|
let listener = tokio::net::TcpListener::bind(addr).await?;
|
||||||
axum::serve(listener, app).await?;
|
axum::serve(listener, router)
|
||||||
|
.with_graceful_shutdown(shutdown_signal())
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
// Drain background tasks (crawler daemon) before exiting so Chromium
|
||||||
|
// gets a clean shutdown rather than relying on kill-on-drop. Bounded
|
||||||
|
// by a timeout so a wedged shutdown path can't trap the process.
|
||||||
|
if let Some(d) = daemon {
|
||||||
|
if tokio::time::timeout(CRAWLER_SHUTDOWN_TIMEOUT, d.shutdown())
|
||||||
|
.await
|
||||||
|
.is_err()
|
||||||
|
{
|
||||||
|
tracing::warn!(
|
||||||
|
timeout_s = CRAWLER_SHUTDOWN_TIMEOUT.as_secs(),
|
||||||
|
"crawler daemon shutdown exceeded timeout; abandoning"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Wait for either Ctrl-C (interactive shell) or SIGTERM (Docker /
|
||||||
|
/// Kubernetes / Podman / systemd stop) and log which arrived. Without
|
||||||
|
/// the SIGTERM branch, `docker compose stop` runs out its grace period
|
||||||
|
/// and skips straight to SIGKILL — the daemon never gets the
|
||||||
|
/// `daemon.shutdown().await` path, leaking Chromium.
|
||||||
|
async fn shutdown_signal() {
|
||||||
|
use tokio::signal::unix::{signal, SignalKind};
|
||||||
|
let mut sigterm = match signal(SignalKind::terminate()) {
|
||||||
|
Ok(s) => s,
|
||||||
|
Err(e) => {
|
||||||
|
// SignalKind::terminate() is supported on every Unix the
|
||||||
|
// tokio runtime runs on; if registration fails we still
|
||||||
|
// honour Ctrl-C so the process is at least
|
||||||
|
// interactive-shutdownable.
|
||||||
|
tracing::warn!(error = %e, "could not install SIGTERM handler; falling back to ctrl_c only");
|
||||||
|
let _ = tokio::signal::ctrl_c().await;
|
||||||
|
tracing::info!("ctrl-c received; shutting down");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
tokio::select! {
|
||||||
|
_ = tokio::signal::ctrl_c() => {
|
||||||
|
tracing::info!("ctrl-c received; shutting down");
|
||||||
|
}
|
||||||
|
_ = sigterm.recv() => {
|
||||||
|
tracing::info!("SIGTERM received; shutting down");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
32
backend/src/repo/admin_audit.rs
Normal file
32
backend/src/repo/admin_audit.rs
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
//! Admin-action audit log writes.
|
||||||
|
//!
|
||||||
|
//! Insert is always called from inside the same transaction as the
|
||||||
|
//! action it audits — the executor parameter is `PgExecutor` so the
|
||||||
|
//! caller passes `&mut *tx` directly.
|
||||||
|
|
||||||
|
use sqlx::PgExecutor;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::error::AppResult;
|
||||||
|
|
||||||
|
pub async fn insert<'e, E: PgExecutor<'e>>(
|
||||||
|
executor: E,
|
||||||
|
actor_user_id: Uuid,
|
||||||
|
action: &str,
|
||||||
|
target_kind: &str,
|
||||||
|
target_id: Option<Uuid>,
|
||||||
|
payload: serde_json::Value,
|
||||||
|
) -> AppResult<()> {
|
||||||
|
sqlx::query(
|
||||||
|
"INSERT INTO admin_audit (actor_user_id, action, target_kind, target_id, payload) \
|
||||||
|
VALUES ($1, $2, $3, $4, $5)",
|
||||||
|
)
|
||||||
|
.bind(actor_user_id)
|
||||||
|
.bind(action)
|
||||||
|
.bind(target_kind)
|
||||||
|
.bind(target_id)
|
||||||
|
.bind(payload)
|
||||||
|
.execute(executor)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
232
backend/src/repo/admin_view.rs
Normal file
232
backend/src/repo/admin_view.rs
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
//! Admin-facing read queries that join manga/chapter with the crawler
|
||||||
|
//! signals (`manga_sources`, `chapter_sources`, `crawler_jobs`) to
|
||||||
|
//! derive a sync state per row at query time.
|
||||||
|
//!
|
||||||
|
//! Priority order for `MangaSyncState`:
|
||||||
|
//! 1. `InProgress` — any pending/running `sync_manga` or
|
||||||
|
//! `sync_chapter_list` job matches this manga.
|
||||||
|
//! 2. `Dropped` — manga has source rows AND every one of them is
|
||||||
|
//! `dropped_at IS NOT NULL`.
|
||||||
|
//! 3. `Synced` — default (includes user-uploaded mangas with no
|
||||||
|
//! `manga_sources` rows at all).
|
||||||
|
//!
|
||||||
|
//! Priority order for `ChapterSyncState`:
|
||||||
|
//! 1. `Downloading` — pending/running `sync_chapter_content` for this id
|
||||||
|
//! 2. `Dropped` — chapter has source rows AND all are dropped
|
||||||
|
//! 3. `Failed` — `page_count = 0` AND a `dead` `sync_chapter_content`
|
||||||
|
//! row exists for this chapter. Constrained to `page_count = 0`
|
||||||
|
//! because once pages are on disk the chapter IS synced — a
|
||||||
|
//! historical dead job (likely from a re-download attempt that
|
||||||
|
//! crashed) is noise that gets reaped after retention. Surfacing
|
||||||
|
//! "Failed" when content is present would contradict
|
||||||
|
//! `ChapterSyncState::Synced`'s "downloaded at some point" contract.
|
||||||
|
//! 4. `NotDownloaded` — `page_count = 0`, no in-flight, no dead job
|
||||||
|
//! 5. `Synced` — `page_count > 0`
|
||||||
|
//!
|
||||||
|
//! Reminder: `done` jobs are reaped after `CRAWLER_JOB_RETENTION_DAYS`,
|
||||||
|
//! so `chapters.page_count > 0` is the durable "this is synced" signal,
|
||||||
|
//! not the job table.
|
||||||
|
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use serde::Serialize;
|
||||||
|
use sqlx::{FromRow, PgPool};
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::domain::{ChapterSyncState, MangaSyncState};
|
||||||
|
use crate::error::AppResult;
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, FromRow)]
|
||||||
|
pub struct AdminMangaRow {
|
||||||
|
pub id: Uuid,
|
||||||
|
pub title: String,
|
||||||
|
pub status: String,
|
||||||
|
pub cover_image_path: Option<String>,
|
||||||
|
pub created_at: DateTime<Utc>,
|
||||||
|
pub updated_at: DateTime<Utc>,
|
||||||
|
pub sync_state: MangaSyncState,
|
||||||
|
pub chapter_count: i64,
|
||||||
|
pub latest_seen_at: Option<DateTime<Utc>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Default)]
|
||||||
|
pub struct ListAdminMangasQuery {
|
||||||
|
pub search: Option<String>,
|
||||||
|
pub sync_state: Option<MangaSyncState>,
|
||||||
|
pub limit: i64,
|
||||||
|
pub offset: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// SQL `CASE` expression that derives a manga's sync state as text
/// (`'in_progress'`, `'dropped'` or `'synced'`). It expects the manga
/// row to be in scope under the alias `m`. Branches are ordered by the
/// priority documented at the top of this module.
const MANGA_SYNC_STATE_CASE: &str = r#"
    CASE
        WHEN EXISTS (
            SELECT 1 FROM crawler_jobs cj
            WHERE cj.state IN ('pending','running')
              AND (
                (cj.payload->>'kind' = 'sync_chapter_list'
                  AND (cj.payload->>'manga_id')::uuid = m.id)
                OR (cj.payload->>'kind' = 'sync_manga'
                    AND EXISTS (
                        SELECT 1 FROM manga_sources ms
                        WHERE ms.manga_id = m.id
                          AND ms.source_id = cj.payload->>'source_id'
                          AND ms.source_manga_key = cj.payload->>'source_manga_key'
                    ))
              )
        ) THEN 'in_progress'
        WHEN EXISTS (SELECT 1 FROM manga_sources ms WHERE ms.manga_id = m.id)
         AND NOT EXISTS (
            SELECT 1 FROM manga_sources ms
            WHERE ms.manga_id = m.id AND ms.dropped_at IS NULL
         )
        THEN 'dropped'
        ELSE 'synced'
    END
"#;
|
||||||
|
|
||||||
|
/// Paginated admin manga list with derived sync state and total count.
|
||||||
|
/// Filters by `search` (substring on title, case-insensitive) and
|
||||||
|
/// `sync_state` (post-derivation). The CTE keeps the case expression
|
||||||
|
/// in one place — the same projection feeds both the page rows and the
|
||||||
|
/// totals count under the same filter.
|
||||||
|
pub async fn list_mangas_with_sync_state(
|
||||||
|
pool: &PgPool,
|
||||||
|
q: &ListAdminMangasQuery,
|
||||||
|
) -> AppResult<(Vec<AdminMangaRow>, i64)> {
|
||||||
|
let search_pat = q
|
||||||
|
.search
|
||||||
|
.as_ref()
|
||||||
|
.map(|s| format!("%{}%", s.trim()))
|
||||||
|
.filter(|p| p.len() > 2);
|
||||||
|
// sqlx::Type → text: bind the snake_case representation manually so
|
||||||
|
// the SQL can compare it as text without an explicit cast.
|
||||||
|
let sync_filter = q.sync_state.map(|s| match s {
|
||||||
|
MangaSyncState::InProgress => "in_progress",
|
||||||
|
MangaSyncState::Dropped => "dropped",
|
||||||
|
MangaSyncState::Synced => "synced",
|
||||||
|
});
|
||||||
|
|
||||||
|
let sql = format!(
|
||||||
|
r#"
|
||||||
|
WITH classified AS (
|
||||||
|
SELECT
|
||||||
|
m.id, m.title, m.status, m.cover_image_path,
|
||||||
|
m.created_at, m.updated_at,
|
||||||
|
{case} AS sync_state,
|
||||||
|
(SELECT COUNT(*) FROM chapters c WHERE c.manga_id = m.id) AS chapter_count,
|
||||||
|
(SELECT MAX(last_seen_at) FROM manga_sources ms
|
||||||
|
WHERE ms.manga_id = m.id AND ms.dropped_at IS NULL) AS latest_seen_at
|
||||||
|
FROM mangas m
|
||||||
|
WHERE ($1::text IS NULL OR m.title ILIKE $1)
|
||||||
|
)
|
||||||
|
SELECT * FROM classified
|
||||||
|
WHERE ($2::text IS NULL OR sync_state = $2)
|
||||||
|
ORDER BY updated_at DESC
|
||||||
|
LIMIT $3 OFFSET $4
|
||||||
|
"#,
|
||||||
|
case = MANGA_SYNC_STATE_CASE
|
||||||
|
);
|
||||||
|
let items: Vec<AdminMangaRow> = sqlx::query_as(&sql)
|
||||||
|
.bind(&search_pat)
|
||||||
|
.bind(sync_filter)
|
||||||
|
.bind(q.limit)
|
||||||
|
.bind(q.offset)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let total_sql = format!(
|
||||||
|
r#"
|
||||||
|
WITH classified AS (
|
||||||
|
SELECT {case} AS sync_state
|
||||||
|
FROM mangas m
|
||||||
|
WHERE ($1::text IS NULL OR m.title ILIKE $1)
|
||||||
|
)
|
||||||
|
SELECT COUNT(*) FROM classified
|
||||||
|
WHERE ($2::text IS NULL OR sync_state = $2)
|
||||||
|
"#,
|
||||||
|
case = MANGA_SYNC_STATE_CASE
|
||||||
|
);
|
||||||
|
let total: i64 = sqlx::query_scalar(&total_sql)
|
||||||
|
.bind(&search_pat)
|
||||||
|
.bind(sync_filter)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok((items, total))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, FromRow)]
|
||||||
|
pub struct AdminChapterRow {
|
||||||
|
pub id: Uuid,
|
||||||
|
pub manga_id: Uuid,
|
||||||
|
pub number: i32,
|
||||||
|
pub title: Option<String>,
|
||||||
|
pub page_count: i32,
|
||||||
|
pub created_at: DateTime<Utc>,
|
||||||
|
pub sync_state: ChapterSyncState,
|
||||||
|
pub latest_seen_at: Option<DateTime<Utc>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Default)]
|
||||||
|
pub struct ListAdminChaptersQuery {
|
||||||
|
pub manga_id: Uuid,
|
||||||
|
pub limit: i64,
|
||||||
|
pub offset: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Paginated chapter list with derived sync state. Pagination is non-
|
||||||
|
/// optional — long-runners can have thousands of chapters and the
|
||||||
|
/// per-row scalar subqueries make the unbounded variant a real
|
||||||
|
/// stall risk even behind an admin guard. Returns the page slice plus
|
||||||
|
/// the unfiltered total so the UI can render "showing N of M".
|
||||||
|
pub async fn list_chapters_with_sync_state(
|
||||||
|
pool: &PgPool,
|
||||||
|
q: &ListAdminChaptersQuery,
|
||||||
|
) -> AppResult<(Vec<AdminChapterRow>, i64)> {
|
||||||
|
let items: Vec<AdminChapterRow> = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
SELECT
|
||||||
|
c.id, c.manga_id, c.number, c.title, c.page_count, c.created_at,
|
||||||
|
CASE
|
||||||
|
WHEN EXISTS (
|
||||||
|
SELECT 1 FROM crawler_jobs cj
|
||||||
|
WHERE cj.state IN ('pending','running')
|
||||||
|
AND cj.payload->>'kind' = 'sync_chapter_content'
|
||||||
|
AND (cj.payload->>'chapter_id')::uuid = c.id
|
||||||
|
) THEN 'downloading'
|
||||||
|
WHEN EXISTS (SELECT 1 FROM chapter_sources cs WHERE cs.chapter_id = c.id)
|
||||||
|
AND NOT EXISTS (
|
||||||
|
SELECT 1 FROM chapter_sources cs
|
||||||
|
WHERE cs.chapter_id = c.id AND cs.dropped_at IS NULL
|
||||||
|
)
|
||||||
|
THEN 'dropped'
|
||||||
|
WHEN c.page_count = 0
|
||||||
|
AND EXISTS (
|
||||||
|
SELECT 1 FROM crawler_jobs cj
|
||||||
|
WHERE cj.state = 'dead'
|
||||||
|
AND cj.payload->>'kind' = 'sync_chapter_content'
|
||||||
|
AND (cj.payload->>'chapter_id')::uuid = c.id
|
||||||
|
) THEN 'failed'
|
||||||
|
WHEN c.page_count = 0 THEN 'not_downloaded'
|
||||||
|
ELSE 'synced'
|
||||||
|
END AS sync_state,
|
||||||
|
(SELECT MAX(last_seen_at) FROM chapter_sources cs
|
||||||
|
WHERE cs.chapter_id = c.id AND cs.dropped_at IS NULL) AS latest_seen_at
|
||||||
|
FROM chapters c
|
||||||
|
WHERE c.manga_id = $1
|
||||||
|
ORDER BY c.number ASC
|
||||||
|
LIMIT $2 OFFSET $3
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(q.manga_id)
|
||||||
|
.bind(q.limit)
|
||||||
|
.bind(q.offset)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let total: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM chapters WHERE manga_id = $1")
|
||||||
|
.bind(q.manga_id)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok((items, total))
|
||||||
|
}
|
||||||
@@ -99,6 +99,11 @@ pub async fn list(
|
|||||||
/// Atomically replace the set of authors on a manga. Caller passes a
|
/// Atomically replace the set of authors on a manga. Caller passes a
|
||||||
/// `&mut PgConnection` (`&mut *tx` works) so the delete+upserts run in
|
/// `&mut PgConnection` (`&mut *tx` works) so the delete+upserts run in
|
||||||
/// one transaction with whatever called us.
|
/// one transaction with whatever called us.
|
||||||
|
///
|
||||||
|
/// Note: `crawler::repo::sync_authors` does a similar replace with the
|
||||||
|
/// same semantics on names. The duplication is intentional — handler
|
||||||
|
/// callers want the `Vec<AuthorRef>` for the API response; the
|
||||||
|
/// crawler doesn't need it and stays inside its own transaction.
|
||||||
pub async fn set_for_manga(
|
pub async fn set_for_manga(
|
||||||
conn: &mut PgConnection,
|
conn: &mut PgConnection,
|
||||||
manga_id: Uuid,
|
manga_id: Uuid,
|
||||||
|
|||||||
@@ -29,9 +29,9 @@ pub async fn create(
|
|||||||
|
|
||||||
match result {
|
match result {
|
||||||
Ok(b) => Ok(b),
|
Ok(b) => Ok(b),
|
||||||
Err(e) if is_unique_violation(&e) => Err(AppError::Conflict(
|
Err(sqlx::Error::Database(ref db_err)) if db_err.is_unique_violation() => Err(
|
||||||
"bookmark already exists for this manga/chapter".into(),
|
AppError::Conflict("bookmark already exists for this manga/chapter".into()),
|
||||||
)),
|
),
|
||||||
Err(e) => Err(AppError::Database(e)),
|
Err(e) => Err(AppError::Database(e)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -46,7 +46,7 @@ pub async fn list_for_user(
|
|||||||
user_id: Uuid,
|
user_id: Uuid,
|
||||||
limit: i64,
|
limit: i64,
|
||||||
offset: i64,
|
offset: i64,
|
||||||
) -> AppResult<Vec<BookmarkSummary>> {
|
) -> AppResult<(Vec<BookmarkSummary>, i64)> {
|
||||||
let rows = sqlx::query_as::<_, BookmarkSummary>(
|
let rows = sqlx::query_as::<_, BookmarkSummary>(
|
||||||
r#"
|
r#"
|
||||||
SELECT
|
SELECT
|
||||||
@@ -72,7 +72,12 @@ pub async fn list_for_user(
|
|||||||
.bind(offset)
|
.bind(offset)
|
||||||
.fetch_all(pool)
|
.fetch_all(pool)
|
||||||
.await?;
|
.await?;
|
||||||
Ok(rows)
|
let (total,): (i64,) =
|
||||||
|
sqlx::query_as("SELECT count(*) FROM bookmarks WHERE user_id = $1")
|
||||||
|
.bind(user_id)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
Ok((rows, total))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn find_owner(pool: &PgPool, id: Uuid) -> AppResult<Option<Uuid>> {
|
pub async fn find_owner(pool: &PgPool, id: Uuid) -> AppResult<Option<Uuid>> {
|
||||||
@@ -92,10 +97,3 @@ pub async fn delete(pool: &PgPool, id: Uuid) -> AppResult<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_unique_violation(err: &sqlx::Error) -> bool {
|
|
||||||
if let sqlx::Error::Database(db_err) = err {
|
|
||||||
db_err.code().as_deref() == Some("23505")
|
|
||||||
} else {
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ use sqlx::{PgExecutor, PgPool};
|
|||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
|
|
||||||
use crate::domain::Chapter;
|
use crate::domain::Chapter;
|
||||||
use crate::error::{AppError, AppResult};
|
use crate::error::AppResult;
|
||||||
|
|
||||||
pub async fn list_for_manga(
|
pub async fn list_for_manga(
|
||||||
pool: &PgPool,
|
pool: &PgPool,
|
||||||
@@ -12,12 +12,20 @@ pub async fn list_for_manga(
|
|||||||
limit: i64,
|
limit: i64,
|
||||||
offset: i64,
|
offset: i64,
|
||||||
) -> AppResult<Vec<Chapter>> {
|
) -> AppResult<Vec<Chapter>> {
|
||||||
|
// Display order = source-site order reversed. The crawler stamps
|
||||||
|
// `source_index` = position in the source DOM (0 = first = newest
|
||||||
|
// on this site, see migration 0021), so DESC puts the oldest
|
||||||
|
// chapter first and keeps the site's variant grouping and the
|
||||||
|
// placement of non-numeric entries (e.g. "notice. : Officials")
|
||||||
|
// intact. NULLS LAST keeps user-uploaded chapters (no source row)
|
||||||
|
// and rows that pre-date the migration below crawled rows; the
|
||||||
|
// (number, created_at) tail then orders them deterministically.
|
||||||
let rows = sqlx::query_as::<_, Chapter>(
|
let rows = sqlx::query_as::<_, Chapter>(
|
||||||
r#"
|
r#"
|
||||||
SELECT id, manga_id, number, title, page_count, created_at
|
SELECT id, manga_id, number, title, page_count, created_at
|
||||||
FROM chapters
|
FROM chapters
|
||||||
WHERE manga_id = $1
|
WHERE manga_id = $1
|
||||||
ORDER BY number ASC
|
ORDER BY source_index DESC NULLS LAST, number ASC, created_at ASC
|
||||||
LIMIT $2 OFFSET $3
|
LIMIT $2 OFFSET $3
|
||||||
"#,
|
"#,
|
||||||
)
|
)
|
||||||
@@ -29,55 +37,127 @@ pub async fn list_for_manga(
|
|||||||
Ok(rows)
|
Ok(rows)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn find_by_manga_and_number(
|
/// Look up a chapter by its UUID, scoped to its manga so a UUID guessed
|
||||||
|
/// from a different manga's URL doesn't accidentally resolve.
|
||||||
|
pub async fn find_by_id_in_manga(
|
||||||
pool: &PgPool,
|
pool: &PgPool,
|
||||||
manga_id: Uuid,
|
manga_id: Uuid,
|
||||||
number: i32,
|
chapter_id: Uuid,
|
||||||
) -> AppResult<Option<Chapter>> {
|
) -> AppResult<Option<Chapter>> {
|
||||||
let row = sqlx::query_as::<_, Chapter>(
|
let row = sqlx::query_as::<_, Chapter>(
|
||||||
r#"
|
r#"
|
||||||
SELECT id, manga_id, number, title, page_count, created_at
|
SELECT id, manga_id, number, title, page_count, created_at
|
||||||
FROM chapters
|
FROM chapters
|
||||||
WHERE manga_id = $1 AND number = $2
|
WHERE manga_id = $1 AND id = $2
|
||||||
"#,
|
"#,
|
||||||
)
|
)
|
||||||
.bind(manga_id)
|
.bind(manga_id)
|
||||||
.bind(number)
|
.bind(chapter_id)
|
||||||
.fetch_optional(pool)
|
.fetch_optional(pool)
|
||||||
.await?;
|
.await?;
|
||||||
Ok(row)
|
Ok(row)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Accepts any `PgExecutor` so the upload handler can run this inside a
|
/// Accepts any `PgExecutor` so the upload handler can run this inside a
|
||||||
/// transaction with the per-page inserts. Returns `AppError::Conflict`
|
/// transaction with the per-page inserts.
|
||||||
/// on the (manga_id, number) unique violation so handlers can surface a
|
///
|
||||||
/// clean 409.
|
/// `uploaded_by` records who uploaded the chapter and feeds the
|
||||||
|
/// per-user upload history. `None` means "historical / API token with
|
||||||
|
/// no associated user" — kept nullable to support that case.
|
||||||
|
///
|
||||||
|
/// Chapter identity is the row UUID; the same (manga_id, number)
|
||||||
|
/// combination can repeat (multiple translations, re-uploads). The
|
||||||
|
/// 0013 migration dropped the (manga_id, number) UNIQUE, so duplicate
|
||||||
|
/// inserts succeed by design. If a future migration re-adds any
|
||||||
|
/// uniqueness, surface a 409 by adding a unique-violation arm here.
|
||||||
pub async fn create<'e, E: PgExecutor<'e>>(
|
pub async fn create<'e, E: PgExecutor<'e>>(
|
||||||
executor: E,
|
executor: E,
|
||||||
manga_id: Uuid,
|
manga_id: Uuid,
|
||||||
number: i32,
|
number: i32,
|
||||||
title: Option<&str>,
|
title: Option<&str>,
|
||||||
|
uploaded_by: Option<Uuid>,
|
||||||
) -> AppResult<Chapter> {
|
) -> AppResult<Chapter> {
|
||||||
let result = sqlx::query_as::<_, Chapter>(
|
let row = sqlx::query_as::<_, Chapter>(
|
||||||
r#"
|
r#"
|
||||||
INSERT INTO chapters (manga_id, number, title)
|
INSERT INTO chapters (manga_id, number, title, uploaded_by)
|
||||||
VALUES ($1, $2, $3)
|
VALUES ($1, $2, $3, $4)
|
||||||
RETURNING id, manga_id, number, title, page_count, created_at
|
RETURNING id, manga_id, number, title, page_count, created_at
|
||||||
"#,
|
"#,
|
||||||
)
|
)
|
||||||
.bind(manga_id)
|
.bind(manga_id)
|
||||||
.bind(number)
|
.bind(number)
|
||||||
.bind(title)
|
.bind(title)
|
||||||
|
.bind(uploaded_by)
|
||||||
.fetch_one(executor)
|
.fetch_one(executor)
|
||||||
.await;
|
.await?;
|
||||||
|
Ok(row)
|
||||||
|
}
|
||||||
|
|
||||||
match result {
|
/// Cross-link guard for `POST /bookmarks`: the bookmarks FK accepts
|
||||||
Ok(c) => Ok(c),
|
/// any valid chapter id, but a chapter must belong to the bookmark's
|
||||||
Err(e) if is_unique_violation(&e) => Err(AppError::Conflict(format!(
|
/// manga or the bookmark would dangle on a foreign manga. Handlers
|
||||||
"chapter {number} already exists for this manga"
|
/// call this before the insert and surface `NotFound` when it
|
||||||
))),
|
/// returns `false`.
|
||||||
Err(e) => Err(AppError::Database(e)),
|
pub async fn belongs_to_manga(
|
||||||
}
|
pool: &PgPool,
|
||||||
|
chapter_id: Uuid,
|
||||||
|
manga_id: Uuid,
|
||||||
|
) -> AppResult<bool> {
|
||||||
|
let (exists,): (bool,) = sqlx::query_as(
|
||||||
|
"SELECT EXISTS(SELECT 1 FROM chapters WHERE id = $1 AND manga_id = $2)",
|
||||||
|
)
|
||||||
|
.bind(chapter_id)
|
||||||
|
.bind(manga_id)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(exists)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read just the page_count for a chapter. Used by the crawler
|
||||||
|
/// daemon's consumer-side dedup safety net so it can ack-done a job
|
||||||
|
/// whose chapter has already been fetched by a racing worker.
|
||||||
|
pub async fn page_count(pool: &PgPool, id: Uuid) -> sqlx::Result<Option<i32>> {
|
||||||
|
sqlx::query_scalar("SELECT page_count FROM chapters WHERE id = $1")
|
||||||
|
.bind(id)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Look up the manga_id + most recent live source_url for a chapter.
|
||||||
|
/// Used by the daemon's chapter dispatcher to resolve the URL it needs
|
||||||
|
/// to hand to `content::sync_chapter_content`.
|
||||||
|
///
|
||||||
|
/// Skips soft-dropped sources (`cs.dropped_at IS NOT NULL`) and breaks
|
||||||
|
/// ties between multiple live sources by `last_seen_at DESC`, so the
|
||||||
|
/// freshest still-attached URL wins. Returns `None` when the chapter
|
||||||
|
/// is gone or all its source rows are dropped — callers in the
|
||||||
|
/// dispatcher treat `None` as "ack the job, skip the work."
|
||||||
|
///
|
||||||
|
/// The enqueue queries (`pipeline::enqueue_bookmarked_pending` and
|
||||||
|
/// `enqueue_pending_for_manga`) apply the same `dropped_at IS NULL`
|
||||||
|
/// filter — this resolver stays in lockstep so a chapter that was
|
||||||
|
/// dropped between enqueue and lease isn't dispatched against a stale
|
||||||
|
/// URL.
|
||||||
|
/// Returns `(manga_id, source_url, manga_title, chapter_number)`. The
|
||||||
|
/// title + number feed the live "currently crawling" status; the rest is
|
||||||
|
/// what the dispatcher needs to do the work.
|
||||||
|
pub async fn dispatch_target(
|
||||||
|
pool: &PgPool,
|
||||||
|
chapter_id: Uuid,
|
||||||
|
) -> sqlx::Result<Option<(Uuid, String, String, i32)>> {
|
||||||
|
sqlx::query_as(
|
||||||
|
"SELECT c.manga_id, cs.source_url, m.title, c.number \
|
||||||
|
FROM chapters c \
|
||||||
|
JOIN chapter_sources cs ON cs.chapter_id = c.id \
|
||||||
|
JOIN mangas m ON m.id = c.manga_id \
|
||||||
|
WHERE c.id = $1 \
|
||||||
|
AND cs.dropped_at IS NULL \
|
||||||
|
ORDER BY cs.last_seen_at DESC \
|
||||||
|
LIMIT 1",
|
||||||
|
)
|
||||||
|
.bind(chapter_id)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn set_page_count<'e, E: PgExecutor<'e>>(
|
pub async fn set_page_count<'e, E: PgExecutor<'e>>(
|
||||||
@@ -93,10 +173,3 @@ pub async fn set_page_count<'e, E: PgExecutor<'e>>(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_unique_violation(err: &sqlx::Error) -> bool {
|
|
||||||
if let sqlx::Error::Database(db_err) = err {
|
|
||||||
db_err.code().as_deref() == Some("23505")
|
|
||||||
} else {
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
280
backend/src/repo/collection.rs
Normal file
280
backend/src/repo/collection.rs
Normal file
@@ -0,0 +1,280 @@
|
|||||||
|
//! Collection persistence.
|
||||||
|
//!
|
||||||
|
//! Same plain-function pattern as `repo::bookmark`. Ownership is
|
||||||
|
//! tracked via `collections.user_id`; handlers call `find_owner`
|
||||||
|
//! before mutations to keep 403/404 honest.
|
||||||
|
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::domain::collection::{Collection, CollectionSummary};
|
||||||
|
use crate::domain::manga::Manga;
|
||||||
|
use crate::error::{AppError, AppResult};
|
||||||
|
|
||||||
|
pub async fn create(
|
||||||
|
pool: &PgPool,
|
||||||
|
user_id: Uuid,
|
||||||
|
name: &str,
|
||||||
|
description: Option<&str>,
|
||||||
|
) -> AppResult<Collection> {
|
||||||
|
let row = sqlx::query_as::<_, Collection>(
|
||||||
|
r#"
|
||||||
|
INSERT INTO collections (user_id, name, description)
|
||||||
|
VALUES ($1, $2, $3)
|
||||||
|
RETURNING id, user_id, name, description, created_at, updated_at
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(user_id)
|
||||||
|
.bind(name.trim())
|
||||||
|
.bind(description)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await
|
||||||
|
.map_err(|e| match e {
|
||||||
|
sqlx::Error::Database(ref db_err) if db_err.is_unique_violation() => {
|
||||||
|
AppError::Conflict("a collection with this name already exists".into())
|
||||||
|
}
|
||||||
|
other => AppError::Database(other),
|
||||||
|
})?;
|
||||||
|
Ok(row)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get(pool: &PgPool, id: Uuid) -> AppResult<Collection> {
|
||||||
|
sqlx::query_as::<_, Collection>(
|
||||||
|
r#"
|
||||||
|
SELECT id, user_id, name, description, created_at, updated_at
|
||||||
|
FROM collections
|
||||||
|
WHERE id = $1
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(id)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await?
|
||||||
|
.ok_or(AppError::NotFound)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn find_owner(pool: &PgPool, id: Uuid) -> AppResult<Option<Uuid>> {
|
||||||
|
let row: Option<(Uuid,)> =
|
||||||
|
sqlx::query_as("SELECT user_id FROM collections WHERE id = $1")
|
||||||
|
.bind(id)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(row.map(|(u,)| u))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Paged list of one user's collections. Includes `manga_count` and up
|
||||||
|
/// to three sample cover image keys (newest-added first) so a card can
|
||||||
|
/// render without a follow-up fetch.
|
||||||
|
pub async fn list_for_user(
|
||||||
|
pool: &PgPool,
|
||||||
|
user_id: Uuid,
|
||||||
|
limit: i64,
|
||||||
|
offset: i64,
|
||||||
|
) -> AppResult<(Vec<CollectionSummary>, i64)> {
|
||||||
|
let rows = sqlx::query_as::<_, CollectionSummary>(
|
||||||
|
r#"
|
||||||
|
SELECT
|
||||||
|
c.id, c.user_id, c.name, c.description, c.created_at, c.updated_at,
|
||||||
|
(SELECT count(*) FROM collection_mangas cm WHERE cm.collection_id = c.id)
|
||||||
|
AS manga_count,
|
||||||
|
COALESCE(
|
||||||
|
(
|
||||||
|
-- `array_agg(... ORDER BY ...)` is the only
|
||||||
|
-- spec-guaranteed way to preserve element order;
|
||||||
|
-- a subquery's ORDER BY isn't a contract the
|
||||||
|
-- outer aggregate has to honour. Adding manga_id
|
||||||
|
-- as a tiebreaker keeps the order stable when
|
||||||
|
-- multiple rows share `added_at` (bulk imports).
|
||||||
|
SELECT array_agg(cover_image_path ORDER BY added_at DESC, manga_id)
|
||||||
|
FROM (
|
||||||
|
SELECT m.cover_image_path, cm2.added_at, cm2.manga_id
|
||||||
|
FROM collection_mangas cm2
|
||||||
|
JOIN mangas m ON m.id = cm2.manga_id
|
||||||
|
WHERE cm2.collection_id = c.id
|
||||||
|
AND m.cover_image_path IS NOT NULL
|
||||||
|
ORDER BY cm2.added_at DESC, cm2.manga_id
|
||||||
|
LIMIT 3
|
||||||
|
) p
|
||||||
|
),
|
||||||
|
ARRAY[]::text[]
|
||||||
|
) AS sample_covers
|
||||||
|
FROM collections c
|
||||||
|
WHERE c.user_id = $1
|
||||||
|
ORDER BY c.updated_at DESC, c.id
|
||||||
|
LIMIT $2 OFFSET $3
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(user_id)
|
||||||
|
.bind(limit)
|
||||||
|
.bind(offset)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let (total,): (i64,) =
|
||||||
|
sqlx::query_as("SELECT count(*) FROM collections WHERE user_id = $1")
|
||||||
|
.bind(user_id)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
Ok((rows, total))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn update(
|
||||||
|
pool: &PgPool,
|
||||||
|
id: Uuid,
|
||||||
|
name: Option<&str>,
|
||||||
|
description_provided: bool,
|
||||||
|
description: Option<&str>,
|
||||||
|
) -> AppResult<Collection> {
|
||||||
|
let row = sqlx::query_as::<_, Collection>(
|
||||||
|
r#"
|
||||||
|
UPDATE collections
|
||||||
|
SET name = COALESCE($2, name),
|
||||||
|
description = CASE WHEN $3::boolean THEN $4 ELSE description END,
|
||||||
|
updated_at = now()
|
||||||
|
WHERE id = $1
|
||||||
|
RETURNING id, user_id, name, description, created_at, updated_at
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(id)
|
||||||
|
.bind(name.map(str::trim))
|
||||||
|
.bind(description_provided)
|
||||||
|
.bind(description)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await
|
||||||
|
.map_err(|e| match e {
|
||||||
|
sqlx::Error::Database(ref db_err) if db_err.is_unique_violation() => {
|
||||||
|
AppError::Conflict("a collection with this name already exists".into())
|
||||||
|
}
|
||||||
|
other => AppError::Database(other),
|
||||||
|
})?
|
||||||
|
.ok_or(AppError::NotFound)?;
|
||||||
|
Ok(row)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn delete(pool: &PgPool, id: Uuid) -> AppResult<()> {
|
||||||
|
sqlx::query("DELETE FROM collections WHERE id = $1")
|
||||||
|
.bind(id)
|
||||||
|
.execute(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a manga to a collection. Returns `true` if a new attachment was
|
||||||
|
/// created (handler picks 201), `false` if the manga was already in
|
||||||
|
/// the collection (handler picks 200). Touches `updated_at` so the
|
||||||
|
/// "recent collections" sort reflects activity.
|
||||||
|
///
|
||||||
|
/// FK violations (manga deleted between the handler's `exists` check
|
||||||
|
/// and this insert — a race the API can't fully close from the
|
||||||
|
/// outside) are remapped to `NotFound` so the handler returns 404
|
||||||
|
/// rather than 500.
|
||||||
|
pub async fn add_manga(
|
||||||
|
pool: &PgPool,
|
||||||
|
collection_id: Uuid,
|
||||||
|
manga_id: Uuid,
|
||||||
|
) -> AppResult<bool> {
|
||||||
|
let mut tx = pool.begin().await?;
|
||||||
|
let inserted = sqlx::query(
|
||||||
|
r#"
|
||||||
|
INSERT INTO collection_mangas (collection_id, manga_id)
|
||||||
|
VALUES ($1, $2)
|
||||||
|
ON CONFLICT DO NOTHING
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(collection_id)
|
||||||
|
.bind(manga_id)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await
|
||||||
|
.map_err(|e| match e {
|
||||||
|
sqlx::Error::Database(ref db_err) if db_err.is_foreign_key_violation() => {
|
||||||
|
AppError::NotFound
|
||||||
|
}
|
||||||
|
other => AppError::Database(other),
|
||||||
|
})?;
|
||||||
|
let rows_affected = inserted.rows_affected();
|
||||||
|
if rows_affected > 0 {
|
||||||
|
sqlx::query("UPDATE collections SET updated_at = now() WHERE id = $1")
|
||||||
|
.bind(collection_id)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
tx.commit().await?;
|
||||||
|
Ok(rows_affected > 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn remove_manga(
|
||||||
|
pool: &PgPool,
|
||||||
|
collection_id: Uuid,
|
||||||
|
manga_id: Uuid,
|
||||||
|
) -> AppResult<()> {
|
||||||
|
let mut tx = pool.begin().await?;
|
||||||
|
let rows_affected = sqlx::query(
|
||||||
|
"DELETE FROM collection_mangas WHERE collection_id = $1 AND manga_id = $2",
|
||||||
|
)
|
||||||
|
.bind(collection_id)
|
||||||
|
.bind(manga_id)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?
|
||||||
|
.rows_affected();
|
||||||
|
if rows_affected > 0 {
|
||||||
|
sqlx::query("UPDATE collections SET updated_at = now() WHERE id = $1")
|
||||||
|
.bind(collection_id)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
tx.commit().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn list_mangas(
|
||||||
|
pool: &PgPool,
|
||||||
|
collection_id: Uuid,
|
||||||
|
limit: i64,
|
||||||
|
offset: i64,
|
||||||
|
) -> AppResult<(Vec<Manga>, i64)> {
|
||||||
|
let rows = sqlx::query_as::<_, Manga>(
|
||||||
|
r#"
|
||||||
|
SELECT m.id, m.title, m.status, m.alt_titles, m.description,
|
||||||
|
m.cover_image_path, m.created_at, m.updated_at
|
||||||
|
FROM collection_mangas cm
|
||||||
|
JOIN mangas m ON m.id = cm.manga_id
|
||||||
|
WHERE cm.collection_id = $1
|
||||||
|
ORDER BY cm.added_at DESC, m.id
|
||||||
|
LIMIT $2 OFFSET $3
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(collection_id)
|
||||||
|
.bind(limit)
|
||||||
|
.bind(offset)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
let (total,): (i64,) =
|
||||||
|
sqlx::query_as("SELECT count(*) FROM collection_mangas WHERE collection_id = $1")
|
||||||
|
.bind(collection_id)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
Ok((rows, total))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Which of `user_id`'s collections currently contain `manga_id`?
|
||||||
|
/// Used by the "Add to collection" modal to pre-check the boxes.
|
||||||
|
pub async fn list_collections_containing(
|
||||||
|
pool: &PgPool,
|
||||||
|
user_id: Uuid,
|
||||||
|
manga_id: Uuid,
|
||||||
|
) -> AppResult<Vec<Uuid>> {
|
||||||
|
let rows: Vec<(Uuid,)> = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
SELECT c.id
|
||||||
|
FROM collections c
|
||||||
|
JOIN collection_mangas cm ON cm.collection_id = c.id
|
||||||
|
WHERE c.user_id = $1
|
||||||
|
AND cm.manga_id = $2
|
||||||
|
ORDER BY c.updated_at DESC
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(user_id)
|
||||||
|
.bind(manga_id)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(rows.into_iter().map(|(id,)| id).collect())
|
||||||
|
}
|
||||||
945
backend/src/repo/crawler.rs
Normal file
945
backend/src/repo/crawler.rs
Normal file
@@ -0,0 +1,945 @@
|
|||||||
|
//! Persistence for crawled mangas.
|
||||||
|
//!
|
||||||
|
//! High-level operations:
|
||||||
|
//! - [`ensure_source`]: idempotent registration of a source row.
|
||||||
|
//! - [`upsert_manga_from_source`]: end-to-end "I saw this manga" —
|
||||||
|
//! creates or updates the `mangas` row, threads `manga_sources`, and
|
||||||
|
//! refreshes authors/genres/tags. Returns whether the manga is new,
|
||||||
|
//! updated (metadata_hash changed), or unchanged.
|
||||||
|
//! - [`sync_manga_chapters`]: per-manga chapter reconciliation. Adds
|
||||||
|
//! new ones, refreshes URLs on existing ones, soft-drops vanished.
|
||||||
|
//! - [`mark_run_started`] / [`mark_run_completed`] /
|
||||||
|
//! [`last_run_completed_cleanly`]: per-source recovery flag in
|
||||||
|
//! `crawler_state`. A `false` flag on tick start means the previous
|
||||||
|
//! run did not exit cleanly and the next walk should ignore the
|
||||||
|
//! early-stop condition.
|
||||||
|
//!
|
||||||
|
//! Each public function is a transaction boundary so a partial failure
|
||||||
|
//! mid-call leaves the DB in its pre-call state.
|
||||||
|
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use serde::Serialize;
|
||||||
|
use sqlx::{FromRow, PgPool, Postgres, Transaction};
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::crawler::source::{SourceChapterRef, SourceManga};
|
||||||
|
|
||||||
|
/// Outcome of `upsert_manga_from_source` for a single manga.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UpsertStatus {
    /// No `manga_sources` row existed yet; a fresh `mangas` row was inserted.
    New,
    /// The manga was already known and its stored `metadata_hash` differed
    /// from the incoming one, so the `mangas` row was rewritten.
    Updated,
    /// The manga was already known and its `metadata_hash` matched.
    Unchanged,
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct UpsertedManga {
|
||||||
|
pub manga_id: Uuid,
|
||||||
|
pub status: UpsertStatus,
|
||||||
|
/// Current value of `mangas.cover_image_path` after the upsert.
|
||||||
|
/// `None` means the cover hasn't been downloaded yet — the caller
|
||||||
|
/// uses this to backfill covers for mangas that were synced before
|
||||||
|
/// cover-download support existed.
|
||||||
|
pub cover_image_path: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Per-call tally produced by `sync_manga_chapters`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct ChapterDiff {
    /// Chapters that were not known before and were inserted.
    pub new: usize,
    /// Already-known chapters whose title / index / source URL were rewritten.
    pub refreshed: usize,
    /// Source rows soft-dropped because the source no longer lists them.
    pub dropped: usize,
}
|
||||||
|
|
||||||
|
pub async fn ensure_source(
|
||||||
|
pool: &PgPool,
|
||||||
|
id: &str,
|
||||||
|
name: &str,
|
||||||
|
base_url: &str,
|
||||||
|
) -> sqlx::Result<()> {
|
||||||
|
sqlx::query(
|
||||||
|
r#"
|
||||||
|
INSERT INTO sources (id, name, base_url, enabled)
|
||||||
|
VALUES ($1, $2, $3, true)
|
||||||
|
ON CONFLICT (id) DO UPDATE
|
||||||
|
SET name = EXCLUDED.name,
|
||||||
|
base_url = EXCLUDED.base_url
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(id)
|
||||||
|
.bind(name)
|
||||||
|
.bind(base_url)
|
||||||
|
.execute(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn upsert_manga_from_source(
|
||||||
|
pool: &PgPool,
|
||||||
|
source_id: &str,
|
||||||
|
source_url: &str,
|
||||||
|
sm: &SourceManga,
|
||||||
|
) -> sqlx::Result<UpsertedManga> {
|
||||||
|
let mut tx = pool.begin().await?;
|
||||||
|
|
||||||
|
let existing: Option<(Uuid, Option<String>)> = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
SELECT manga_id, metadata_hash
|
||||||
|
FROM manga_sources
|
||||||
|
WHERE source_id = $1 AND source_manga_key = $2
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(source_id)
|
||||||
|
.bind(&sm.source_manga_key)
|
||||||
|
.fetch_optional(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let status_db = sm.status.as_deref().unwrap_or("ongoing");
|
||||||
|
|
||||||
|
// Note: `cover_image_path` is intentionally not written here.
|
||||||
|
// The repo layer doesn't know about the storage backend, so the
|
||||||
|
// caller (crawler binary) downloads the cover via the `Storage`
|
||||||
|
// trait and sets the path with `repo::manga::set_cover_image_path`
|
||||||
|
// once the bytes have landed.
|
||||||
|
let (manga_id, status) = match existing {
|
||||||
|
None => {
|
||||||
|
let (id,): (Uuid,) = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
INSERT INTO mangas (title, description, status, alt_titles)
|
||||||
|
VALUES ($1, $2, $3, $4)
|
||||||
|
RETURNING id
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(&sm.title)
|
||||||
|
.bind(sm.summary.as_deref())
|
||||||
|
.bind(status_db)
|
||||||
|
.bind(&sm.alternative_titles)
|
||||||
|
.fetch_one(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
(id, UpsertStatus::New)
|
||||||
|
}
|
||||||
|
Some((id, prev_hash)) if prev_hash.as_deref() == Some(&sm.metadata_hash) => {
|
||||||
|
(id, UpsertStatus::Unchanged)
|
||||||
|
}
|
||||||
|
Some((id, _)) => {
|
||||||
|
sqlx::query(
|
||||||
|
r#"
|
||||||
|
UPDATE mangas
|
||||||
|
SET title = $1,
|
||||||
|
description = $2,
|
||||||
|
status = $3,
|
||||||
|
alt_titles = $4,
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE id = $5
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(&sm.title)
|
||||||
|
.bind(sm.summary.as_deref())
|
||||||
|
.bind(status_db)
|
||||||
|
.bind(&sm.alternative_titles)
|
||||||
|
.bind(id)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
(id, UpsertStatus::Updated)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
sqlx::query(
|
||||||
|
r#"
|
||||||
|
INSERT INTO manga_sources
|
||||||
|
(source_id, source_manga_key, manga_id, source_url, metadata_hash, last_seen_at, dropped_at)
|
||||||
|
VALUES ($1, $2, $3, $4, $5, NOW(), NULL)
|
||||||
|
ON CONFLICT (source_id, source_manga_key) DO UPDATE
|
||||||
|
SET source_url = EXCLUDED.source_url,
|
||||||
|
metadata_hash = EXCLUDED.metadata_hash,
|
||||||
|
last_seen_at = NOW(),
|
||||||
|
dropped_at = NULL
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(source_id)
|
||||||
|
.bind(&sm.source_manga_key)
|
||||||
|
.bind(manga_id)
|
||||||
|
.bind(source_url)
|
||||||
|
.bind(&sm.metadata_hash)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
sync_authors(&mut tx, manga_id, &sm.authors).await?;
|
||||||
|
sync_genres(&mut tx, manga_id, &sm.genres).await?;
|
||||||
|
sync_tags(&mut tx, manga_id, &sm.tags).await?;
|
||||||
|
|
||||||
|
let cover_image_path: Option<String> =
|
||||||
|
sqlx::query_scalar("SELECT cover_image_path FROM mangas WHERE id = $1")
|
||||||
|
.bind(manga_id)
|
||||||
|
.fetch_one(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
tx.commit().await?;
|
||||||
|
Ok(UpsertedManga {
|
||||||
|
manga_id,
|
||||||
|
status,
|
||||||
|
cover_image_path,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn sync_authors(
|
||||||
|
tx: &mut Transaction<'_, Postgres>,
|
||||||
|
manga_id: Uuid,
|
||||||
|
authors: &[String],
|
||||||
|
) -> sqlx::Result<()> {
|
||||||
|
sqlx::query("DELETE FROM manga_authors WHERE manga_id = $1")
|
||||||
|
.bind(manga_id)
|
||||||
|
.execute(&mut **tx)
|
||||||
|
.await?;
|
||||||
|
for (i, name) in authors.iter().enumerate() {
|
||||||
|
let trimmed = name.trim();
|
||||||
|
if trimmed.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Self-update on conflict so the row id is always returned —
|
||||||
|
// can't use DO NOTHING because that suppresses RETURNING.
|
||||||
|
let (author_id,): (Uuid,) = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
INSERT INTO authors (name) VALUES ($1)
|
||||||
|
ON CONFLICT (lower(name)) DO UPDATE SET name = authors.name
|
||||||
|
RETURNING id
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(trimmed)
|
||||||
|
.fetch_one(&mut **tx)
|
||||||
|
.await?;
|
||||||
|
sqlx::query(
|
||||||
|
r#"
|
||||||
|
INSERT INTO manga_authors (manga_id, author_id, position)
|
||||||
|
VALUES ($1, $2, $3)
|
||||||
|
ON CONFLICT DO NOTHING
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(manga_id)
|
||||||
|
.bind(author_id)
|
||||||
|
.bind(i as i32)
|
||||||
|
.execute(&mut **tx)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn sync_genres(
|
||||||
|
tx: &mut Transaction<'_, Postgres>,
|
||||||
|
manga_id: Uuid,
|
||||||
|
genres: &[String],
|
||||||
|
) -> sqlx::Result<()> {
|
||||||
|
sqlx::query("DELETE FROM manga_genres WHERE manga_id = $1")
|
||||||
|
.bind(manga_id)
|
||||||
|
.execute(&mut **tx)
|
||||||
|
.await?;
|
||||||
|
for name in genres {
|
||||||
|
let trimmed = name.trim();
|
||||||
|
if trimmed.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Case-insensitive lookup so a source-supplied "action"
|
||||||
|
// attaches to the seeded "Action" rather than creating a
|
||||||
|
// second row.
|
||||||
|
let existing: Option<(Uuid,)> =
|
||||||
|
sqlx::query_as("SELECT id FROM genres WHERE lower(name) = lower($1)")
|
||||||
|
.bind(trimmed)
|
||||||
|
.fetch_optional(&mut **tx)
|
||||||
|
.await?;
|
||||||
|
let genre_id = match existing {
|
||||||
|
Some((id,)) => id,
|
||||||
|
None => {
|
||||||
|
let (id,): (Uuid,) = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
INSERT INTO genres (name) VALUES ($1)
|
||||||
|
ON CONFLICT (name) DO UPDATE SET name = genres.name
|
||||||
|
RETURNING id
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(trimmed)
|
||||||
|
.fetch_one(&mut **tx)
|
||||||
|
.await?;
|
||||||
|
tracing::info!(genre = trimmed, "added new genre from source");
|
||||||
|
id
|
||||||
|
}
|
||||||
|
};
|
||||||
|
sqlx::query(
|
||||||
|
"INSERT INTO manga_genres (manga_id, genre_id) VALUES ($1, $2) ON CONFLICT DO NOTHING",
|
||||||
|
)
|
||||||
|
.bind(manga_id)
|
||||||
|
.bind(genre_id)
|
||||||
|
.execute(&mut **tx)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn sync_tags(
|
||||||
|
tx: &mut Transaction<'_, Postgres>,
|
||||||
|
manga_id: Uuid,
|
||||||
|
tags: &[String],
|
||||||
|
) -> sqlx::Result<()> {
|
||||||
|
// Only clear crawler-owned attachments (added_by IS NULL). User-
|
||||||
|
// attached tags are owned by the attaching user and must survive
|
||||||
|
// the recurring metadata pass — see manga_tags.added_by in
|
||||||
|
// migration 0009.
|
||||||
|
//
|
||||||
|
// Note on orphans: `manga_tags.added_by` is `ON DELETE SET NULL`,
|
||||||
|
// so an attachment whose user was deleted becomes
|
||||||
|
// indistinguishable from a crawler-owned row and is cleaned up
|
||||||
|
// here. That mirrors how `api::mangas::detach_tag` already treats
|
||||||
|
// orphans ("nobody owns it, refuse to let anyone but admin clear
|
||||||
|
// them") — the crawler now becomes the eventual reaper. Tracked
|
||||||
|
// by `sync_tags_garbage_collects_orphan_user_attachments` in
|
||||||
|
// backend/tests/crawler_sync.rs.
|
||||||
|
sqlx::query("DELETE FROM manga_tags WHERE manga_id = $1 AND added_by IS NULL")
|
||||||
|
.bind(manga_id)
|
||||||
|
.execute(&mut **tx)
|
||||||
|
.await?;
|
||||||
|
for name in tags {
|
||||||
|
let trimmed = name.trim();
|
||||||
|
if trimmed.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let (tag_id,): (Uuid,) = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
INSERT INTO tags (name) VALUES ($1)
|
||||||
|
ON CONFLICT (lower(name)) DO UPDATE SET name = tags.name
|
||||||
|
RETURNING id
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(trimmed)
|
||||||
|
.fetch_one(&mut **tx)
|
||||||
|
.await?;
|
||||||
|
sqlx::query(
|
||||||
|
r#"
|
||||||
|
INSERT INTO manga_tags (manga_id, tag_id, added_by)
|
||||||
|
VALUES ($1, $2, NULL)
|
||||||
|
ON CONFLICT DO NOTHING
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(manga_id)
|
||||||
|
.bind(tag_id)
|
||||||
|
.execute(&mut **tx)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn sync_manga_chapters(
|
||||||
|
pool: &PgPool,
|
||||||
|
source_id: &str,
|
||||||
|
manga_id: Uuid,
|
||||||
|
chapters: &[SourceChapterRef],
|
||||||
|
) -> sqlx::Result<ChapterDiff> {
|
||||||
|
let mut tx = pool.begin().await?;
|
||||||
|
// Per-manga advisory lock. Two concurrent calls for the same manga
|
||||||
|
// would otherwise both read `seen_keys`, both run the drop UPDATE
|
||||||
|
// filtered on `NOT (key = ANY $3)`, and the later commit could soft-
|
||||||
|
// drop a chapter the earlier commit had just inserted (lost-update
|
||||||
|
// shape under MVCC). `pg_advisory_xact_lock` is scoped to this
|
||||||
|
// transaction: it auto-releases on COMMIT/ROLLBACK so a Rust-side
|
||||||
|
// panic mid-call doesn't strand the lock. The single-arg int8 form
|
||||||
|
// keyed by `hashtextextended(manga_id::text, 0)` shares Postgres'
|
||||||
|
// global advisory-lock namespace with `CRON_LOCK_KEY`, but collision
|
||||||
|
// is 2^-64 per pair (a UUID-derived hash hitting the fixed cron key
|
||||||
|
// is effectively impossible).
|
||||||
|
sqlx::query("SELECT pg_advisory_xact_lock(hashtextextended($1::text, 0))")
|
||||||
|
.bind(manga_id)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let mut diff = ChapterDiff::default();
|
||||||
|
let seen_keys: Vec<String> = chapters
|
||||||
|
.iter()
|
||||||
|
.map(|c| c.source_chapter_key.clone())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
for (idx, c) in chapters.iter().enumerate() {
|
||||||
|
// `source_index` captures the chapter's position in the source
|
||||||
|
// DOM (0 = first = newest on this site) so the list query can
|
||||||
|
// reverse it for the user-facing list — see migration 0021.
|
||||||
|
// Every sync overwrites the value on both branches, so a new
|
||||||
|
// chapter inserted at the top of the source shifts every other
|
||||||
|
// row down by one on the next tick.
|
||||||
|
let source_index = idx as i32;
|
||||||
|
// Lookup is constrained by manga_id (via the chapters join) so a
|
||||||
|
// source whose chapter slugs collide across mangas (e.g.
|
||||||
|
// "chapter-1" appearing under two different mangas) attributes
|
||||||
|
// each row to the correct manga. Migration 0017 dropped the
|
||||||
|
// (source_id, source_chapter_key) PK in favour of
|
||||||
|
// (source_id, chapter_id) for exactly this reason.
|
||||||
|
let existing: Option<(Uuid,)> = sqlx::query_as(
|
||||||
|
"SELECT cs.chapter_id \
|
||||||
|
FROM chapter_sources cs \
|
||||||
|
JOIN chapters ch ON ch.id = cs.chapter_id \
|
||||||
|
WHERE cs.source_id = $1 \
|
||||||
|
AND cs.source_chapter_key = $2 \
|
||||||
|
AND ch.manga_id = $3",
|
||||||
|
)
|
||||||
|
.bind(source_id)
|
||||||
|
.bind(&c.source_chapter_key)
|
||||||
|
.bind(manga_id)
|
||||||
|
.fetch_optional(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
match existing {
|
||||||
|
None => {
|
||||||
|
// New chapter row. As of 0013 there's no (manga_id,
|
||||||
|
// number) UNIQUE, so duplicate-numbered chapters from
|
||||||
|
// the source (different uploaders, notices, alt
|
||||||
|
// translations) each get their own row — chapter
|
||||||
|
// identity is the UUID, not the number.
|
||||||
|
let (chapter_id,): (Uuid,) = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
INSERT INTO chapters (manga_id, number, title, page_count, source_index)
|
||||||
|
VALUES ($1, $2, $3, 0, $4)
|
||||||
|
RETURNING id
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(manga_id)
|
||||||
|
.bind(c.number)
|
||||||
|
.bind(c.title.as_deref())
|
||||||
|
.bind(source_index)
|
||||||
|
.fetch_one(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
sqlx::query(
|
||||||
|
r#"
|
||||||
|
INSERT INTO chapter_sources
|
||||||
|
(source_id, source_chapter_key, chapter_id, source_url, last_seen_at, dropped_at)
|
||||||
|
VALUES ($1, $2, $3, $4, NOW(), NULL)
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(source_id)
|
||||||
|
.bind(&c.source_chapter_key)
|
||||||
|
.bind(chapter_id)
|
||||||
|
.bind(&c.url)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
diff.new += 1;
|
||||||
|
}
|
||||||
|
Some((chapter_id,)) => {
|
||||||
|
sqlx::query(
|
||||||
|
"UPDATE chapters SET title = $1, source_index = $2 WHERE id = $3",
|
||||||
|
)
|
||||||
|
.bind(c.title.as_deref())
|
||||||
|
.bind(source_index)
|
||||||
|
.bind(chapter_id)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
// chapter_id is now the natural per-(source, chapter)
|
||||||
|
// identifier — use it directly instead of re-keying on
|
||||||
|
// (source_id, source_chapter_key) which may not be unique.
|
||||||
|
sqlx::query(
|
||||||
|
r#"
|
||||||
|
UPDATE chapter_sources
|
||||||
|
SET source_url = $1, last_seen_at = NOW(), dropped_at = NULL
|
||||||
|
WHERE source_id = $2 AND chapter_id = $3
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(&c.url)
|
||||||
|
.bind(source_id)
|
||||||
|
.bind(chapter_id)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
diff.refreshed += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Soft-drop any chapter previously seen from this source for this
|
||||||
|
// manga that's not in the current list.
|
||||||
|
let result = sqlx::query(
|
||||||
|
r#"
|
||||||
|
UPDATE chapter_sources cs
|
||||||
|
SET dropped_at = NOW()
|
||||||
|
FROM chapters ch
|
||||||
|
WHERE cs.chapter_id = ch.id
|
||||||
|
AND ch.manga_id = $1
|
||||||
|
AND cs.source_id = $2
|
||||||
|
AND cs.dropped_at IS NULL
|
||||||
|
AND NOT (cs.source_chapter_key = ANY($3))
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(manga_id)
|
||||||
|
.bind(source_id)
|
||||||
|
.bind(&seen_keys)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
diff.dropped = result.rows_affected() as usize;
|
||||||
|
|
||||||
|
tx.commit().await?;
|
||||||
|
Ok(diff)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Count the chapters that the source `(source_id, source_manga_key)`
|
||||||
|
/// is currently known to attach to — i.e. the number of `chapter_sources`
|
||||||
|
/// rows for the manga identified by the (source_id, source_manga_key)
|
||||||
|
/// pair, restricted to live (`dropped_at IS NULL`) rows.
|
||||||
|
///
|
||||||
|
/// Used by the metadata pass's partial-render guard: if `fetch_manga`
|
||||||
|
/// returns an empty `chapters` Vec but the source previously surfaced
|
||||||
|
/// chapters here, that's most likely a chromium snapshot taken between
|
||||||
|
/// the `#chapter_table` wrapper render and its rows render — the
|
||||||
|
/// safest move is to skip `sync_manga_chapters` so the soft-drop
|
||||||
|
/// branch doesn't flip every existing chapter to `dropped_at`.
|
||||||
|
///
|
||||||
|
/// Returns `Ok(0)` when the manga is brand-new (no `manga_sources`
|
||||||
|
/// row yet), which is the legitimate "this manga has no chapters yet"
|
||||||
|
/// case and must NOT be flagged.
|
||||||
|
pub async fn live_chapter_count_for_source_manga(
|
||||||
|
pool: &PgPool,
|
||||||
|
source_id: &str,
|
||||||
|
source_manga_key: &str,
|
||||||
|
) -> sqlx::Result<i64> {
|
||||||
|
let row: Option<(i64,)> = sqlx::query_as(
|
||||||
|
"SELECT COUNT(*) \
|
||||||
|
FROM chapter_sources cs \
|
||||||
|
JOIN chapters c ON c.id = cs.chapter_id \
|
||||||
|
JOIN manga_sources ms \
|
||||||
|
ON ms.manga_id = c.manga_id \
|
||||||
|
AND ms.source_id = cs.source_id \
|
||||||
|
WHERE ms.source_id = $1 \
|
||||||
|
AND ms.source_manga_key = $2 \
|
||||||
|
AND cs.dropped_at IS NULL",
|
||||||
|
)
|
||||||
|
.bind(source_id)
|
||||||
|
.bind(source_manga_key)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(row.map(|(n,)| n).unwrap_or(0))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mark a metadata pass as in-flight for `source_id`. Stamps
|
||||||
|
/// `last_run_completed:<source_id>` in `crawler_state` with
|
||||||
|
/// `{"completed": false, "at": now}`. A crash, panic, or SIGKILL after
|
||||||
|
/// this point leaves the flag at `false`, which the next tick reads as
|
||||||
|
/// "previous run did not exit cleanly — walk the full catalog this
|
||||||
|
/// time" (recovery sweep).
|
||||||
|
pub async fn mark_run_started(pool: &PgPool, source_id: &str) -> sqlx::Result<()> {
|
||||||
|
let key = format!("last_run_completed:{source_id}");
|
||||||
|
sqlx::query(
|
||||||
|
"INSERT INTO crawler_state (key, value, updated_at) \
|
||||||
|
VALUES ($1, $2, now()) \
|
||||||
|
ON CONFLICT (key) DO UPDATE \
|
||||||
|
SET value = EXCLUDED.value, updated_at = now()",
|
||||||
|
)
|
||||||
|
.bind(&key)
|
||||||
|
.bind(serde_json::json!({
|
||||||
|
"completed": false,
|
||||||
|
"at": Utc::now().to_rfc3339(),
|
||||||
|
}))
|
||||||
|
.execute(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mark a metadata pass as completed cleanly for `source_id`. Called
|
||||||
|
/// from the same place a run decides it reached end-of-walk or hit the
|
||||||
|
/// intentional stop. The next tick reads `true` and applies the normal
|
||||||
|
/// stop condition.
|
||||||
|
pub async fn mark_run_completed(pool: &PgPool, source_id: &str) -> sqlx::Result<()> {
|
||||||
|
let key = format!("last_run_completed:{source_id}");
|
||||||
|
sqlx::query(
|
||||||
|
"INSERT INTO crawler_state (key, value, updated_at) \
|
||||||
|
VALUES ($1, $2, now()) \
|
||||||
|
ON CONFLICT (key) DO UPDATE \
|
||||||
|
SET value = EXCLUDED.value, updated_at = now()",
|
||||||
|
)
|
||||||
|
.bind(&key)
|
||||||
|
.bind(serde_json::json!({
|
||||||
|
"completed": true,
|
||||||
|
"at": Utc::now().to_rfc3339(),
|
||||||
|
}))
|
||||||
|
.execute(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// List mangas whose `cover_image_path IS NULL` but a live
|
||||||
|
/// `manga_sources` row still attaches them to a source. The bounded
|
||||||
|
/// result feeds the cover-backfill pass in [`crate::crawler::pipeline`]:
|
||||||
|
/// each entry is one (manga, freshest source row) pair where a cover
|
||||||
|
/// re-download is in order.
|
||||||
|
///
|
||||||
|
/// Per-manga deduplication uses `DISTINCT ON (m.id)` keyed on the row
|
||||||
|
/// with the newest `last_seen_at`, so a manga that's surfaced by
|
||||||
|
/// multiple sources only produces one row (the freshest). Sort is
|
||||||
|
/// stable for tests.
|
||||||
|
pub async fn list_missing_covers(
|
||||||
|
pool: &PgPool,
|
||||||
|
max: i64,
|
||||||
|
) -> sqlx::Result<Vec<MissingCoverEntry>> {
|
||||||
|
let rows: Vec<(Uuid, String, String)> = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
SELECT DISTINCT ON (m.id) m.id, ms.source_manga_key, ms.source_url
|
||||||
|
FROM mangas m
|
||||||
|
JOIN manga_sources ms ON ms.manga_id = m.id
|
||||||
|
WHERE m.cover_image_path IS NULL
|
||||||
|
AND ms.dropped_at IS NULL
|
||||||
|
ORDER BY m.id, ms.last_seen_at DESC
|
||||||
|
LIMIT $1
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(max)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(rows
|
||||||
|
.into_iter()
|
||||||
|
.map(|(manga_id, source_manga_key, source_url)| MissingCoverEntry {
|
||||||
|
manga_id,
|
||||||
|
source_manga_key,
|
||||||
|
source_url,
|
||||||
|
})
|
||||||
|
.collect())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
|
pub struct MissingCoverEntry {
|
||||||
|
pub manga_id: Uuid,
|
||||||
|
pub source_manga_key: String,
|
||||||
|
pub source_url: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read the recovery flag for `source_id`. A missing row OR an
|
||||||
|
/// unparseable value reads as `true` ("clean") — the former covers the
|
||||||
|
/// first-ever run on a virgin DB (no recovery needed), the latter
|
||||||
|
/// covers forward-compat against future schema changes; both fail-safe
|
||||||
|
/// toward not making an operator pay for an unnecessary full sweep.
|
||||||
|
pub async fn last_run_completed_cleanly(
|
||||||
|
pool: &PgPool,
|
||||||
|
source_id: &str,
|
||||||
|
) -> sqlx::Result<bool> {
|
||||||
|
let key = format!("last_run_completed:{source_id}");
|
||||||
|
let row: Option<serde_json::Value> =
|
||||||
|
sqlx::query_scalar("SELECT value FROM crawler_state WHERE key = $1")
|
||||||
|
.bind(&key)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(row
|
||||||
|
.and_then(|v| v.get("completed").and_then(|b| b.as_bool()))
|
||||||
|
.unwrap_or(true))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Dead-letter jobs: admin observability + requeue.
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// A `dead` crawler job joined to its chapter/manga context for the admin
|
||||||
|
/// dead-letter view. Chapter columns are `Option` because the join is
|
||||||
|
/// best-effort (the chapter may have been removed since the job died, or
|
||||||
|
/// the job may be a non-chapter kind).
|
||||||
|
#[derive(Debug, Clone, Serialize, FromRow)]
|
||||||
|
pub struct DeadJob {
|
||||||
|
pub id: Uuid,
|
||||||
|
pub kind: String,
|
||||||
|
pub chapter_id: Option<Uuid>,
|
||||||
|
pub manga_id: Option<Uuid>,
|
||||||
|
pub manga_title: Option<String>,
|
||||||
|
pub chapter_number: Option<i32>,
|
||||||
|
pub attempts: i32,
|
||||||
|
pub max_attempts: i32,
|
||||||
|
pub last_error: Option<String>,
|
||||||
|
pub updated_at: DateTime<Utc>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Paginated list of `dead` jobs, newest-failed first, joined to chapter +
|
||||||
|
/// manga context. `search` filters on manga title (case-insensitive
|
||||||
|
/// substring). Returns the page slice plus the unfiltered-by-page total.
|
||||||
|
pub async fn list_dead_jobs(
|
||||||
|
pool: &PgPool,
|
||||||
|
search: Option<&str>,
|
||||||
|
limit: i64,
|
||||||
|
offset: i64,
|
||||||
|
) -> sqlx::Result<(Vec<DeadJob>, i64)> {
|
||||||
|
let search_pat = search
|
||||||
|
.map(|s| format!("%{}%", s.trim()))
|
||||||
|
.filter(|p| p.len() > 2);
|
||||||
|
|
||||||
|
let items: Vec<DeadJob> = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
SELECT
|
||||||
|
cj.id,
|
||||||
|
cj.payload->>'kind' AS kind,
|
||||||
|
(cj.payload->>'chapter_id')::uuid AS chapter_id,
|
||||||
|
c.manga_id AS manga_id,
|
||||||
|
m.title AS manga_title,
|
||||||
|
c.number AS chapter_number,
|
||||||
|
cj.attempts,
|
||||||
|
cj.max_attempts,
|
||||||
|
cj.last_error,
|
||||||
|
cj.updated_at
|
||||||
|
FROM crawler_jobs cj
|
||||||
|
LEFT JOIN chapters c ON c.id = (cj.payload->>'chapter_id')::uuid
|
||||||
|
LEFT JOIN mangas m ON m.id = c.manga_id
|
||||||
|
WHERE cj.state = 'dead'
|
||||||
|
AND ($1::text IS NULL OR m.title ILIKE $1)
|
||||||
|
ORDER BY cj.updated_at DESC
|
||||||
|
LIMIT $2 OFFSET $3
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(&search_pat)
|
||||||
|
.bind(limit)
|
||||||
|
.bind(offset)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let total: i64 = sqlx::query_scalar(
|
||||||
|
r#"
|
||||||
|
SELECT COUNT(*)
|
||||||
|
FROM crawler_jobs cj
|
||||||
|
LEFT JOIN chapters c ON c.id = (cj.payload->>'chapter_id')::uuid
|
||||||
|
LEFT JOIN mangas m ON m.id = c.manga_id
|
||||||
|
WHERE cj.state = 'dead'
|
||||||
|
AND ($1::text IS NULL OR m.title ILIKE $1)
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(&search_pat)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok((items, total))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An in-flight chapter-content job (`pending` or `running`) joined to its
|
||||||
|
/// chapter + manga, for the "queued chapters" admin view.
|
||||||
|
#[derive(Debug, Clone, Serialize, FromRow)]
|
||||||
|
pub struct ActiveJob {
|
||||||
|
pub id: Uuid,
|
||||||
|
pub chapter_id: Option<Uuid>,
|
||||||
|
pub manga_id: Option<Uuid>,
|
||||||
|
pub manga_title: Option<String>,
|
||||||
|
pub chapter_number: Option<i32>,
|
||||||
|
/// `"pending"` or `"running"`.
|
||||||
|
pub state: String,
|
||||||
|
pub attempts: i32,
|
||||||
|
pub max_attempts: i32,
|
||||||
|
pub updated_at: DateTime<Utc>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Paginated list of `pending`/`running` chapter-content jobs (which
|
||||||
|
/// chapters of which mangas are queued or being crawled). Running first,
|
||||||
|
/// then by scheduled order. `search` filters on manga title.
|
||||||
|
pub async fn list_active_jobs(
|
||||||
|
pool: &PgPool,
|
||||||
|
search: Option<&str>,
|
||||||
|
limit: i64,
|
||||||
|
offset: i64,
|
||||||
|
) -> sqlx::Result<(Vec<ActiveJob>, i64)> {
|
||||||
|
let search_pat = search
|
||||||
|
.map(|s| format!("%{}%", s.trim()))
|
||||||
|
.filter(|p| p.len() > 2);
|
||||||
|
|
||||||
|
let items: Vec<ActiveJob> = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
SELECT
|
||||||
|
cj.id,
|
||||||
|
(cj.payload->>'chapter_id')::uuid AS chapter_id,
|
||||||
|
c.manga_id AS manga_id,
|
||||||
|
m.title AS manga_title,
|
||||||
|
c.number AS chapter_number,
|
||||||
|
cj.state,
|
||||||
|
cj.attempts,
|
||||||
|
cj.max_attempts,
|
||||||
|
cj.updated_at
|
||||||
|
FROM crawler_jobs cj
|
||||||
|
LEFT JOIN chapters c ON c.id = (cj.payload->>'chapter_id')::uuid
|
||||||
|
LEFT JOIN mangas m ON m.id = c.manga_id
|
||||||
|
WHERE cj.state IN ('pending','running')
|
||||||
|
AND cj.payload->>'kind' = 'sync_chapter_content'
|
||||||
|
AND ($1::text IS NULL OR m.title ILIKE $1)
|
||||||
|
ORDER BY (cj.state = 'running') DESC, cj.scheduled_at, cj.created_at
|
||||||
|
LIMIT $2 OFFSET $3
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(&search_pat)
|
||||||
|
.bind(limit)
|
||||||
|
.bind(offset)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let total: i64 = sqlx::query_scalar(
|
||||||
|
r#"
|
||||||
|
SELECT COUNT(*)
|
||||||
|
FROM crawler_jobs cj
|
||||||
|
LEFT JOIN chapters c ON c.id = (cj.payload->>'chapter_id')::uuid
|
||||||
|
LEFT JOIN mangas m ON m.id = c.manga_id
|
||||||
|
WHERE cj.state IN ('pending','running')
|
||||||
|
AND cj.payload->>'kind' = 'sync_chapter_content'
|
||||||
|
AND ($1::text IS NULL OR m.title ILIKE $1)
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(&search_pat)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok((items, total))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A manga whose cover is still missing (queued for cover fetch).
|
||||||
|
#[derive(Debug, Clone, Serialize, FromRow)]
|
||||||
|
pub struct MissingCoverRow {
|
||||||
|
pub manga_id: Uuid,
|
||||||
|
pub manga_title: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Count mangas with no cover yet but a live source row — the cover
|
||||||
|
/// backlog the metadata pass + backfill drain.
|
||||||
|
pub async fn count_missing_covers(pool: &PgPool) -> sqlx::Result<i64> {
|
||||||
|
sqlx::query_scalar(
|
||||||
|
r#"
|
||||||
|
SELECT COUNT(*) FROM mangas m
|
||||||
|
WHERE m.cover_image_path IS NULL
|
||||||
|
AND EXISTS (
|
||||||
|
SELECT 1 FROM manga_sources ms
|
||||||
|
WHERE ms.manga_id = m.id AND ms.dropped_at IS NULL
|
||||||
|
)
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Paginated list of mangas queued for a cover fetch (no cover yet + a live
|
||||||
|
/// source), with titles. `search` filters on title. Freshest source first.
|
||||||
|
pub async fn list_missing_cover_mangas(
|
||||||
|
pool: &PgPool,
|
||||||
|
search: Option<&str>,
|
||||||
|
limit: i64,
|
||||||
|
offset: i64,
|
||||||
|
) -> sqlx::Result<(Vec<MissingCoverRow>, i64)> {
|
||||||
|
let search_pat = search
|
||||||
|
.map(|s| format!("%{}%", s.trim()))
|
||||||
|
.filter(|p| p.len() > 2);
|
||||||
|
|
||||||
|
let items: Vec<MissingCoverRow> = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
SELECT m.id AS manga_id, m.title AS manga_title
|
||||||
|
FROM mangas m
|
||||||
|
WHERE m.cover_image_path IS NULL
|
||||||
|
AND EXISTS (
|
||||||
|
SELECT 1 FROM manga_sources ms
|
||||||
|
WHERE ms.manga_id = m.id AND ms.dropped_at IS NULL
|
||||||
|
)
|
||||||
|
AND ($1::text IS NULL OR m.title ILIKE $1)
|
||||||
|
ORDER BY m.updated_at DESC
|
||||||
|
LIMIT $2 OFFSET $3
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(&search_pat)
|
||||||
|
.bind(limit)
|
||||||
|
.bind(offset)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let total: i64 = sqlx::query_scalar(
|
||||||
|
r#"
|
||||||
|
SELECT COUNT(*) FROM mangas m
|
||||||
|
WHERE m.cover_image_path IS NULL
|
||||||
|
AND EXISTS (
|
||||||
|
SELECT 1 FROM manga_sources ms
|
||||||
|
WHERE ms.manga_id = m.id AND ms.dropped_at IS NULL
|
||||||
|
)
|
||||||
|
AND ($1::text IS NULL OR m.title ILIKE $1)
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(&search_pat)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok((items, total))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Scope of a dead-job requeue.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub enum RequeueScope {
|
||||||
|
/// Every dead job.
|
||||||
|
All,
|
||||||
|
/// Dead jobs whose chapter belongs to this manga.
|
||||||
|
Manga(Uuid),
|
||||||
|
/// Dead jobs for a single chapter.
|
||||||
|
Chapter(Uuid),
|
||||||
|
/// A single dead job by its id.
|
||||||
|
Job(Uuid),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Requeue dead jobs back to `pending` with a fresh attempt budget. This is
|
||||||
|
/// an explicit operator override, so it bypasses the dead-letter quarantine
|
||||||
|
/// the enqueue helpers honour (we act directly on the row). Returns the
|
||||||
|
/// number of rows requeued.
|
||||||
|
///
|
||||||
|
/// Two invariants protect the partial unique dedup index
|
||||||
|
/// `crawler_jobs_chapter_content_dedup_idx` (one `pending|running`
|
||||||
|
/// sync_chapter_content job per chapter):
|
||||||
|
/// 1. A chapter that already has a live (`pending|running`) job is
|
||||||
|
/// skipped entirely (`NO_LIVE_DUP`).
|
||||||
|
/// 2. When a chapter has *multiple* dead jobs, only the newest is
|
||||||
|
/// revived (`DISTINCT ON` the chapter key) — without this, flipping
|
||||||
|
/// two dead rows for the same chapter to `pending` in one statement
|
||||||
|
/// would violate the index and abort the whole requeue. Non-chapter
|
||||||
|
/// jobs fall back to their row id so each stays distinct.
|
||||||
|
pub async fn requeue_dead_jobs(pool: &PgPool, scope: RequeueScope) -> sqlx::Result<u64> {
|
||||||
|
// Scope predicate spliced into the `pick` CTE. Only compile-time
|
||||||
|
// literals are interpolated; all values are bound below.
|
||||||
|
let scope_pred: &str = match scope {
|
||||||
|
RequeueScope::All => "",
|
||||||
|
RequeueScope::Manga(_) => {
|
||||||
|
"AND (cj.payload->>'chapter_id')::uuid IN \
|
||||||
|
(SELECT id FROM chapters WHERE manga_id = $1)"
|
||||||
|
}
|
||||||
|
RequeueScope::Chapter(_) => "AND (cj.payload->>'chapter_id')::uuid = $1",
|
||||||
|
RequeueScope::Job(_) => "AND cj.id = $1",
|
||||||
|
};
|
||||||
|
|
||||||
|
let sql = format!(
|
||||||
|
r#"
|
||||||
|
WITH pick AS (
|
||||||
|
SELECT DISTINCT ON (COALESCE(cj.payload->>'chapter_id', cj.id::text)) cj.id
|
||||||
|
FROM crawler_jobs cj
|
||||||
|
WHERE cj.state = 'dead'
|
||||||
|
{scope_pred}
|
||||||
|
AND NOT EXISTS (
|
||||||
|
SELECT 1 FROM crawler_jobs live
|
||||||
|
WHERE live.payload->>'kind' = 'sync_chapter_content'
|
||||||
|
AND live.payload->>'chapter_id' = cj.payload->>'chapter_id'
|
||||||
|
AND live.state IN ('pending','running')
|
||||||
|
)
|
||||||
|
ORDER BY COALESCE(cj.payload->>'chapter_id', cj.id::text), cj.updated_at DESC
|
||||||
|
)
|
||||||
|
UPDATE crawler_jobs
|
||||||
|
SET state = 'pending', attempts = 0, leased_until = NULL,
|
||||||
|
last_error = NULL, scheduled_at = now(), updated_at = now()
|
||||||
|
FROM pick
|
||||||
|
WHERE crawler_jobs.id = pick.id
|
||||||
|
"#
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut q = sqlx::query(&sql);
|
||||||
|
match scope {
|
||||||
|
RequeueScope::All => {}
|
||||||
|
RequeueScope::Manga(id) | RequeueScope::Chapter(id) | RequeueScope::Job(id) => {
|
||||||
|
q = q.bind(id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(q.execute(pool).await?.rows_affected())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Count crawler jobs grouped by state — drives the dashboard queue
|
||||||
|
/// gauges. Returns `(pending, running, dead)`.
|
||||||
|
pub async fn job_state_counts(pool: &PgPool) -> sqlx::Result<(i64, i64, i64)> {
|
||||||
|
let rows: Vec<(String, i64)> =
|
||||||
|
sqlx::query_as("SELECT state, COUNT(*) FROM crawler_jobs GROUP BY state")
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
let mut pending = 0;
|
||||||
|
let mut running = 0;
|
||||||
|
let mut dead = 0;
|
||||||
|
for (state, n) in rows {
|
||||||
|
match state.as_str() {
|
||||||
|
"pending" => pending = n,
|
||||||
|
"running" => running = n,
|
||||||
|
"dead" => dead = n,
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok((pending, running, dead))
|
||||||
|
}
|
||||||
|
|
||||||
@@ -61,6 +61,11 @@ pub async fn load_for_mangas(
|
|||||||
/// FK constraint would reject them, so we filter upstream rather than
|
/// FK constraint would reject them, so we filter upstream rather than
|
||||||
/// surface a 500 here. (The API layer validates the set against
|
/// surface a 500 here. (The API layer validates the set against
|
||||||
/// `list_all` first.)
|
/// `list_all` first.)
|
||||||
|
///
|
||||||
|
/// Note: `crawler::repo::sync_genres` does a similar replace, but by
|
||||||
|
/// *name* and with auto-create of unseen genres — the crawler can't
|
||||||
|
/// validate against the curated vocabulary on its own. Both paths are
|
||||||
|
/// intentional; don't merge them without preserving that semantic.
|
||||||
pub async fn set_for_manga(
|
pub async fn set_for_manga(
|
||||||
conn: &mut PgConnection,
|
conn: &mut PgConnection,
|
||||||
manga_id: Uuid,
|
manga_id: Uuid,
|
||||||
|
|||||||
@@ -181,17 +181,23 @@ pub async fn get_detail(pool: &PgPool, id: Uuid) -> AppResult<MangaDetail> {
|
|||||||
/// by the caller via `repo::author::set_for_manga` etc. in the same
|
/// by the caller via `repo::author::set_for_manga` etc. in the same
|
||||||
/// transaction. `status` is taken as a validated string — the handler
|
/// transaction. `status` is taken as a validated string — the handler
|
||||||
/// is responsible for defaulting/validating it.
|
/// is responsible for defaulting/validating it.
|
||||||
|
///
|
||||||
|
/// `uploaded_by` records who created the manga and feeds the per-user
|
||||||
|
/// upload history. `None` means "historical / no associated user" —
|
||||||
|
/// historic rows from before the uploader columns were added carry
|
||||||
|
/// NULL.
|
||||||
pub async fn create<'e, E: PgExecutor<'e>>(
|
pub async fn create<'e, E: PgExecutor<'e>>(
|
||||||
executor: E,
|
executor: E,
|
||||||
title: &str,
|
title: &str,
|
||||||
status: &str,
|
status: &str,
|
||||||
description: Option<&str>,
|
description: Option<&str>,
|
||||||
alt_titles: &[String],
|
alt_titles: &[String],
|
||||||
|
uploaded_by: Option<Uuid>,
|
||||||
) -> AppResult<Manga> {
|
) -> AppResult<Manga> {
|
||||||
let row = sqlx::query_as::<_, Manga>(&format!(
|
let row = sqlx::query_as::<_, Manga>(&format!(
|
||||||
r#"
|
r#"
|
||||||
INSERT INTO mangas (title, status, description, alt_titles)
|
INSERT INTO mangas (title, status, description, alt_titles, uploaded_by)
|
||||||
VALUES ($1, $2, $3, $4)
|
VALUES ($1, $2, $3, $4, $5)
|
||||||
RETURNING {SELECT_COLS}
|
RETURNING {SELECT_COLS}
|
||||||
"#
|
"#
|
||||||
))
|
))
|
||||||
@@ -199,6 +205,7 @@ pub async fn create<'e, E: PgExecutor<'e>>(
|
|||||||
.bind(status)
|
.bind(status)
|
||||||
.bind(description)
|
.bind(description)
|
||||||
.bind(alt_titles)
|
.bind(alt_titles)
|
||||||
|
.bind(uploaded_by)
|
||||||
.fetch_one(executor)
|
.fetch_one(executor)
|
||||||
.await?;
|
.await?;
|
||||||
Ok(row)
|
Ok(row)
|
||||||
@@ -255,6 +262,17 @@ pub async fn set_cover_image_path<'e, E: PgExecutor<'e>>(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn clear_cover_image_path<'e, E: PgExecutor<'e>>(
|
||||||
|
executor: E,
|
||||||
|
id: Uuid,
|
||||||
|
) -> AppResult<()> {
|
||||||
|
sqlx::query("UPDATE mangas SET cover_image_path = NULL, updated_at = now() WHERE id = $1")
|
||||||
|
.bind(id)
|
||||||
|
.execute(executor)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn exists(pool: &PgPool, id: Uuid) -> AppResult<bool> {
|
pub async fn exists(pool: &PgPool, id: Uuid) -> AppResult<bool> {
|
||||||
let (exists,): (bool,) =
|
let (exists,): (bool,) =
|
||||||
sqlx::query_as("SELECT EXISTS(SELECT 1 FROM mangas WHERE id = $1)")
|
sqlx::query_as("SELECT EXISTS(SELECT 1 FROM mangas WHERE id = $1)")
|
||||||
@@ -263,3 +281,17 @@ pub async fn exists(pool: &PgPool, id: Uuid) -> AppResult<bool> {
|
|||||||
.await?;
|
.await?;
|
||||||
Ok(exists)
|
Ok(exists)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the uploader's user id for a manga. `None` either when the
|
||||||
|
/// manga doesn't exist or when the row predates the `uploaded_by`
|
||||||
|
/// column (historical NULL — see migration 0011). Callers must
|
||||||
|
/// distinguish "manga missing" via [`exists`] before relying on this
|
||||||
|
/// to make an authz decision.
|
||||||
|
pub async fn uploaded_by(pool: &PgPool, id: Uuid) -> AppResult<Option<Uuid>> {
|
||||||
|
let row: Option<(Option<Uuid>,)> =
|
||||||
|
sqlx::query_as("SELECT uploaded_by FROM mangas WHERE id = $1")
|
||||||
|
.bind(id)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(row.and_then(|(u,)| u))
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,11 +1,17 @@
|
|||||||
|
pub mod admin_audit;
|
||||||
|
pub mod admin_view;
|
||||||
pub mod api_token;
|
pub mod api_token;
|
||||||
pub mod author;
|
pub mod author;
|
||||||
pub mod bookmark;
|
pub mod bookmark;
|
||||||
pub mod chapter;
|
pub mod chapter;
|
||||||
|
pub mod collection;
|
||||||
|
pub mod crawler;
|
||||||
pub mod genre;
|
pub mod genre;
|
||||||
pub mod manga;
|
pub mod manga;
|
||||||
pub mod page;
|
pub mod page;
|
||||||
|
pub mod read_progress;
|
||||||
pub mod session;
|
pub mod session;
|
||||||
pub mod tag;
|
pub mod tag;
|
||||||
|
pub mod upload_history;
|
||||||
pub mod user;
|
pub mod user;
|
||||||
pub mod user_preferences;
|
pub mod user_preferences;
|
||||||
|
|||||||
164
backend/src/repo/read_progress.rs
Normal file
164
backend/src/repo/read_progress.rs
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
//! Per-user reading-progress persistence.
|
||||||
|
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::domain::read_progress::{
|
||||||
|
ReadProgress, ReadProgressForManga, ReadProgressSummary,
|
||||||
|
};
|
||||||
|
use crate::error::{AppError, AppResult};
|
||||||
|
|
||||||
|
/// Insert-or-overwrite the user's progress row for this manga.
|
||||||
|
/// Progress can move backwards (re-reading) — we accept the
|
||||||
|
/// simplification that the last write wins.
|
||||||
|
///
|
||||||
|
/// FK violations (manga or chapter deleted between the handler's
|
||||||
|
/// existence check and this write) are mapped to `NotFound` so the
|
||||||
|
/// API returns 404 rather than 500.
|
||||||
|
pub async fn upsert(
|
||||||
|
pool: &PgPool,
|
||||||
|
user_id: Uuid,
|
||||||
|
manga_id: Uuid,
|
||||||
|
chapter_id: Option<Uuid>,
|
||||||
|
page: i32,
|
||||||
|
) -> AppResult<ReadProgress> {
|
||||||
|
sqlx::query_as::<_, ReadProgress>(
|
||||||
|
r#"
|
||||||
|
INSERT INTO read_progress (user_id, manga_id, chapter_id, page, updated_at)
|
||||||
|
VALUES ($1, $2, $3, $4, now())
|
||||||
|
ON CONFLICT (user_id, manga_id) DO UPDATE
|
||||||
|
SET chapter_id = EXCLUDED.chapter_id,
|
||||||
|
page = EXCLUDED.page,
|
||||||
|
updated_at = now()
|
||||||
|
RETURNING user_id, manga_id, chapter_id, page, updated_at
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(user_id)
|
||||||
|
.bind(manga_id)
|
||||||
|
.bind(chapter_id)
|
||||||
|
.bind(page)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await
|
||||||
|
.map_err(|e| match e {
|
||||||
|
sqlx::Error::Database(ref db_err) if db_err.is_foreign_key_violation() => {
|
||||||
|
AppError::NotFound
|
||||||
|
}
|
||||||
|
other => AppError::Database(other),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get(
|
||||||
|
pool: &PgPool,
|
||||||
|
user_id: Uuid,
|
||||||
|
manga_id: Uuid,
|
||||||
|
) -> AppResult<ReadProgress> {
|
||||||
|
sqlx::query_as::<_, ReadProgress>(
|
||||||
|
r#"
|
||||||
|
SELECT user_id, manga_id, chapter_id, page, updated_at
|
||||||
|
FROM read_progress
|
||||||
|
WHERE user_id = $1 AND manga_id = $2
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(user_id)
|
||||||
|
.bind(manga_id)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await?
|
||||||
|
.ok_or(AppError::NotFound)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Same lookup as `get`, but resolves `chapter_number` in one round-
|
||||||
|
/// trip so the manga detail page's "Continue reading" CTA can render
|
||||||
|
/// without having to find the chapter in the paged chapters list.
|
||||||
|
pub async fn get_for_manga(
|
||||||
|
pool: &PgPool,
|
||||||
|
user_id: Uuid,
|
||||||
|
manga_id: Uuid,
|
||||||
|
) -> AppResult<ReadProgressForManga> {
|
||||||
|
sqlx::query_as::<_, ReadProgressForManga>(
|
||||||
|
r#"
|
||||||
|
SELECT rp.manga_id,
|
||||||
|
rp.chapter_id,
|
||||||
|
c.number AS chapter_number,
|
||||||
|
rp.page,
|
||||||
|
rp.updated_at
|
||||||
|
FROM read_progress rp
|
||||||
|
LEFT JOIN chapters c ON c.id = rp.chapter_id
|
||||||
|
WHERE rp.user_id = $1 AND rp.manga_id = $2
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(user_id)
|
||||||
|
.bind(manga_id)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await?
|
||||||
|
.ok_or(AppError::NotFound)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cross-link guard. Returns true when `chapter_id` belongs to
|
||||||
|
/// `manga_id`. The upsert handler calls this before writing to refuse
|
||||||
|
/// PUT bodies that pair a chapter from one manga with another manga
|
||||||
|
/// — the FK alone can't catch that because both ids resolve
|
||||||
|
/// individually.
|
||||||
|
pub async fn chapter_belongs_to_manga(
|
||||||
|
pool: &PgPool,
|
||||||
|
manga_id: Uuid,
|
||||||
|
chapter_id: Uuid,
|
||||||
|
) -> AppResult<bool> {
|
||||||
|
let (matches,): (bool,) = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
SELECT EXISTS(
|
||||||
|
SELECT 1 FROM chapters
|
||||||
|
WHERE id = $1 AND manga_id = $2
|
||||||
|
)
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(chapter_id)
|
||||||
|
.bind(manga_id)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(matches)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn list_for_user(
|
||||||
|
pool: &PgPool,
|
||||||
|
user_id: Uuid,
|
||||||
|
limit: i64,
|
||||||
|
offset: i64,
|
||||||
|
) -> AppResult<(Vec<ReadProgressSummary>, i64)> {
|
||||||
|
let rows = sqlx::query_as::<_, ReadProgressSummary>(
|
||||||
|
r#"
|
||||||
|
SELECT rp.manga_id,
|
||||||
|
m.title AS manga_title,
|
||||||
|
m.cover_image_path AS manga_cover_image_path,
|
||||||
|
rp.chapter_id,
|
||||||
|
c.number AS chapter_number,
|
||||||
|
rp.page,
|
||||||
|
rp.updated_at
|
||||||
|
FROM read_progress rp
|
||||||
|
JOIN mangas m ON m.id = rp.manga_id
|
||||||
|
LEFT JOIN chapters c ON c.id = rp.chapter_id
|
||||||
|
WHERE rp.user_id = $1
|
||||||
|
ORDER BY rp.updated_at DESC, rp.manga_id
|
||||||
|
LIMIT $2 OFFSET $3
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(user_id)
|
||||||
|
.bind(limit)
|
||||||
|
.bind(offset)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
let (total,): (i64,) =
|
||||||
|
sqlx::query_as("SELECT count(*) FROM read_progress WHERE user_id = $1")
|
||||||
|
.bind(user_id)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
Ok((rows, total))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn delete(pool: &PgPool, user_id: Uuid, manga_id: Uuid) -> AppResult<()> {
|
||||||
|
sqlx::query("DELETE FROM read_progress WHERE user_id = $1 AND manga_id = $2")
|
||||||
|
.bind(user_id)
|
||||||
|
.bind(manga_id)
|
||||||
|
.execute(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
119
backend/src/repo/upload_history.rs
Normal file
119
backend/src/repo/upload_history.rs
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
//! Cross-table upload history.
|
||||||
|
//!
|
||||||
|
//! Mangas and chapters are uploaded by users separately, but the
|
||||||
|
//! profile UI wants a single chronological feed. Rather than open a
|
||||||
|
//! UNION-ALL over two tables with mismatched columns we fetch each
|
||||||
|
//! side, then merge in Rust by `created_at`. Cheap for the volumes a
|
||||||
|
//! single user produces.
|
||||||
|
//!
|
||||||
|
//! Pagination uses limit-only for now; offsets across two unrelated
|
||||||
|
//! tables aren't trivially stable, and the realistic per-user upload
|
||||||
|
//! count is small. Switch to keyset pagination if real users blow
|
||||||
|
//! past a few hundred uploads.
|
||||||
|
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::domain::chapter::Chapter;
|
||||||
|
use crate::domain::manga::Manga;
|
||||||
|
use crate::domain::upload_entry::UploadEntry;
|
||||||
|
use crate::error::AppResult;
|
||||||
|
|
||||||
|
#[derive(sqlx::FromRow)]
|
||||||
|
struct ChapterUploadRow {
|
||||||
|
manga_id: Uuid,
|
||||||
|
manga_title: String,
|
||||||
|
manga_cover_image_path: Option<String>,
|
||||||
|
chapter_id: Uuid,
|
||||||
|
number: i32,
|
||||||
|
title: Option<String>,
|
||||||
|
page_count: i32,
|
||||||
|
created_at: chrono::DateTime<chrono::Utc>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns up to `limit` of the user's most recent uploads (mangas and
|
||||||
|
/// chapters interleaved by `created_at DESC`) plus the unfiltered
|
||||||
|
/// total count (mangas + chapters owned by the user). The caller is
|
||||||
|
/// responsible for clamping `limit` to a sane value.
|
||||||
|
pub async fn list_for_user(
|
||||||
|
pool: &PgPool,
|
||||||
|
user_id: Uuid,
|
||||||
|
limit: i64,
|
||||||
|
) -> AppResult<(Vec<UploadEntry>, i64)> {
|
||||||
|
let mangas: Vec<Manga> = sqlx::query_as::<_, Manga>(
|
||||||
|
r#"
|
||||||
|
SELECT id, title, status, alt_titles, description,
|
||||||
|
cover_image_path, created_at, updated_at
|
||||||
|
FROM mangas
|
||||||
|
WHERE uploaded_by = $1
|
||||||
|
ORDER BY created_at DESC, id
|
||||||
|
LIMIT $2
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(user_id)
|
||||||
|
.bind(limit)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let chapters: Vec<ChapterUploadRow> = sqlx::query_as::<_, ChapterUploadRow>(
|
||||||
|
r#"
|
||||||
|
SELECT c.manga_id,
|
||||||
|
m.title AS manga_title,
|
||||||
|
m.cover_image_path AS manga_cover_image_path,
|
||||||
|
c.id AS chapter_id,
|
||||||
|
c.number,
|
||||||
|
c.title,
|
||||||
|
c.page_count,
|
||||||
|
c.created_at
|
||||||
|
FROM chapters c
|
||||||
|
JOIN mangas m ON m.id = c.manga_id
|
||||||
|
WHERE c.uploaded_by = $1
|
||||||
|
ORDER BY c.created_at DESC, c.id
|
||||||
|
LIMIT $2
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(user_id)
|
||||||
|
.bind(limit)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let mut entries: Vec<UploadEntry> = Vec::with_capacity(mangas.len() + chapters.len());
|
||||||
|
for m in mangas {
|
||||||
|
entries.push(UploadEntry::Manga {
|
||||||
|
created_at: m.created_at,
|
||||||
|
manga: m,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
for c in chapters {
|
||||||
|
let created_at = c.created_at;
|
||||||
|
entries.push(UploadEntry::Chapter {
|
||||||
|
manga_id: c.manga_id,
|
||||||
|
manga_title: c.manga_title,
|
||||||
|
manga_cover_image_path: c.manga_cover_image_path,
|
||||||
|
chapter: Chapter {
|
||||||
|
id: c.chapter_id,
|
||||||
|
manga_id: c.manga_id,
|
||||||
|
number: c.number,
|
||||||
|
title: c.title,
|
||||||
|
page_count: c.page_count,
|
||||||
|
created_at: c.created_at,
|
||||||
|
},
|
||||||
|
created_at,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
// Newest first; trim to limit after the merge.
|
||||||
|
entries.sort_by(|a, b| b.created_at().cmp(&a.created_at()));
|
||||||
|
entries.truncate(limit as usize);
|
||||||
|
|
||||||
|
let (manga_total, chapter_total): (i64, i64) = sqlx::query_as(
|
||||||
|
r#"
|
||||||
|
SELECT
|
||||||
|
(SELECT count(*) FROM mangas WHERE uploaded_by = $1),
|
||||||
|
(SELECT count(*) FROM chapters WHERE uploaded_by = $1)
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(user_id)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
Ok((entries, manga_total + chapter_total))
|
||||||
|
}
|
||||||
@@ -11,7 +11,7 @@ pub async fn create(pool: &PgPool, username: &str, password_hash: &str) -> AppRe
|
|||||||
r#"
|
r#"
|
||||||
INSERT INTO users (username, password_hash)
|
INSERT INTO users (username, password_hash)
|
||||||
VALUES ($1, $2)
|
VALUES ($1, $2)
|
||||||
RETURNING id, username, password_hash, created_at
|
RETURNING id, username, password_hash, created_at, is_admin
|
||||||
"#,
|
"#,
|
||||||
)
|
)
|
||||||
.bind(username)
|
.bind(username)
|
||||||
@@ -21,7 +21,7 @@ pub async fn create(pool: &PgPool, username: &str, password_hash: &str) -> AppRe
|
|||||||
|
|
||||||
match result {
|
match result {
|
||||||
Ok(user) => Ok(user),
|
Ok(user) => Ok(user),
|
||||||
Err(e) if is_unique_violation(&e) => {
|
Err(sqlx::Error::Database(ref db_err)) if db_err.is_unique_violation() => {
|
||||||
Err(AppError::Conflict("username is already taken".into()))
|
Err(AppError::Conflict("username is already taken".into()))
|
||||||
}
|
}
|
||||||
Err(e) => Err(AppError::Database(e)),
|
Err(e) => Err(AppError::Database(e)),
|
||||||
@@ -35,7 +35,7 @@ pub async fn create(pool: &PgPool, username: &str, password_hash: &str) -> AppRe
|
|||||||
pub async fn find_by_username(pool: &PgPool, username: &str) -> AppResult<Option<User>> {
|
pub async fn find_by_username(pool: &PgPool, username: &str) -> AppResult<Option<User>> {
|
||||||
let row = sqlx::query_as::<_, User>(
|
let row = sqlx::query_as::<_, User>(
|
||||||
r#"
|
r#"
|
||||||
SELECT id, username, password_hash, created_at
|
SELECT id, username, password_hash, created_at, is_admin
|
||||||
FROM users
|
FROM users
|
||||||
WHERE lower(username) = lower($1)
|
WHERE lower(username) = lower($1)
|
||||||
"#,
|
"#,
|
||||||
@@ -48,7 +48,7 @@ pub async fn find_by_username(pool: &PgPool, username: &str) -> AppResult<Option
|
|||||||
|
|
||||||
pub async fn find_by_id(pool: &PgPool, id: Uuid) -> AppResult<Option<User>> {
|
pub async fn find_by_id(pool: &PgPool, id: Uuid) -> AppResult<Option<User>> {
|
||||||
let row = sqlx::query_as::<_, User>(
|
let row = sqlx::query_as::<_, User>(
|
||||||
r#"SELECT id, username, password_hash, created_at FROM users WHERE id = $1"#,
|
r#"SELECT id, username, password_hash, created_at, is_admin FROM users WHERE id = $1"#,
|
||||||
)
|
)
|
||||||
.bind(id)
|
.bind(id)
|
||||||
.fetch_optional(pool)
|
.fetch_optional(pool)
|
||||||
@@ -56,10 +56,317 @@ pub async fn find_by_id(pool: &PgPool, id: Uuid) -> AppResult<Option<User>> {
|
|||||||
Ok(row)
|
Ok(row)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_unique_violation(err: &sqlx::Error) -> bool {
|
/// Postgres advisory-lock key guarding admin-count-changing operations
|
||||||
if let sqlx::Error::Database(db_err) = err {
|
/// (demote, delete-admin). Without this lock two concurrent demotes of
|
||||||
db_err.code().as_deref() == Some("23505")
|
/// different admins could each pass their "more than one admin remains"
|
||||||
} else {
|
/// check, then commit, leaving zero admins. The lock serialises any tx
|
||||||
false
|
/// that might change the admin count so the recount under the lock is
|
||||||
}
|
/// authoritative.
|
||||||
|
///
|
||||||
|
/// Value is the bytes of "admininv" interpreted as a big-endian i64.
|
||||||
|
/// Postgres' advisory-lock keyspace is global; collision risk with
|
||||||
|
/// `CRON_LOCK_KEY` and friends is ~2^-64.
|
||||||
|
// Spelled via the ASCII tag itself; equals 0x61_64_6d_69_6e_69_6e_76.
pub const ADMIN_INVARIANT_LOCK_KEY: i64 = i64::from_be_bytes(*b"admininv");
|
||||||
|
|
||||||
|
/// Filter and paging parameters consumed by `list_with_total`.
///
/// Note that `Default` yields `limit == 0`, i.e. an empty page; callers are
/// expected to fill in real paging values.
#[derive(Debug, Default)]
pub struct ListUsersQuery {
    /// Optional username search term (matched case-insensitively as a
    /// substring); `None` lists every user.
    pub search: Option<String>,
    /// Bound to the SQL `LIMIT` of the page query.
    pub limit: i64,
    /// Bound to the SQL `OFFSET` of the page query.
    pub offset: i64,
}
|
||||||
|
|
||||||
|
/// Paginated user list with total count. `search` is a case-insensitive
|
||||||
|
/// substring match on `username`. Order is alphabetical by username so
|
||||||
|
/// pagination is stable across concurrent writes (mangas changing
|
||||||
|
/// is_admin doesn't reshuffle the page).
|
||||||
|
pub async fn list_with_total(
|
||||||
|
pool: &PgPool,
|
||||||
|
q: &ListUsersQuery,
|
||||||
|
) -> AppResult<(Vec<User>, i64)> {
|
||||||
|
let pat = q
|
||||||
|
.search
|
||||||
|
.as_ref()
|
||||||
|
.map(|s| format!("%{}%", s.trim()))
|
||||||
|
.filter(|p| p.len() > 2);
|
||||||
|
|
||||||
|
let items = sqlx::query_as::<_, User>(
|
||||||
|
r#"
|
||||||
|
SELECT id, username, password_hash, created_at, is_admin
|
||||||
|
FROM users
|
||||||
|
WHERE ($1::text IS NULL OR username ILIKE $1)
|
||||||
|
ORDER BY username
|
||||||
|
LIMIT $2 OFFSET $3
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(&pat)
|
||||||
|
.bind(q.limit)
|
||||||
|
.bind(q.offset)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let total: i64 = sqlx::query_scalar(
|
||||||
|
"SELECT COUNT(*) FROM users WHERE ($1::text IS NULL OR username ILIKE $1)",
|
||||||
|
)
|
||||||
|
.bind(&pat)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok((items, total))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Raw `is_admin` update with no safety checks, no audit log, and no
|
||||||
|
/// advisory lock. Exists only as a test setup helper for the admin-
|
||||||
|
/// feature integration suite — production code MUST go through
|
||||||
|
/// [`admin_safe_set_is_admin`], which enforces self-protection, the
|
||||||
|
/// last-admin invariant, and the audit log atomically.
|
||||||
|
pub async fn set_is_admin_unchecked(pool: &PgPool, id: Uuid, value: bool) -> AppResult<()> {
|
||||||
|
sqlx::query("UPDATE users SET is_admin = $1 WHERE id = $2")
|
||||||
|
.bind(value)
|
||||||
|
.bind(id)
|
||||||
|
.execute(pool)
|
||||||
|
.await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Ensure the user `username` exists and is an admin. Called at startup
|
||||||
|
/// from `app::build` when `ADMIN_USERNAME` / `ADMIN_PASSWORD` are set.
|
||||||
|
///
|
||||||
|
/// Semantics — see cross-cutting decision #2 in the feature plan:
|
||||||
|
/// - If no row exists: create with the env-supplied password hashed via
|
||||||
|
/// argon2id and `is_admin = true`.
|
||||||
|
/// - If a row already exists: flip `is_admin` to true if needed; **never**
|
||||||
|
/// touch the existing `password_hash`. Lets the operator rotate the
|
||||||
|
/// admin password through the UI without env-var conflict.
|
||||||
|
/// Wrapped in a transaction so a concurrent `register` for the same
|
||||||
|
/// username can't slip an INSERT between the SELECT and UPDATE/INSERT.
|
||||||
|
/// Set `is_admin` on a user with full safety checks: rejects self-demote,
|
||||||
|
/// rejects demoting the only remaining admin (under `ADMIN_INVARIANT_LOCK_KEY`
|
||||||
|
/// to close the parallel-demote race), and writes an `admin_audit` row
|
||||||
|
/// in the same tx so the log mirrors what actually committed.
|
||||||
|
///
|
||||||
|
/// Returns the freshly-written user row (so the handler can return it
|
||||||
|
/// without a second SELECT).
|
||||||
|
pub async fn admin_safe_set_is_admin(
|
||||||
|
pool: &PgPool,
|
||||||
|
actor_id: Uuid,
|
||||||
|
target_id: Uuid,
|
||||||
|
value: bool,
|
||||||
|
) -> AppResult<User> {
|
||||||
|
// Cheap pre-check before opening a tx — also covers the "demote me"
|
||||||
|
// case which would otherwise pass the recount when other admins exist.
|
||||||
|
if actor_id == target_id && !value {
|
||||||
|
return Err(AppError::Conflict(
|
||||||
|
"cannot demote yourself; ask another admin".into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut tx = pool.begin().await?;
|
||||||
|
sqlx::query("SELECT pg_advisory_xact_lock($1)")
|
||||||
|
.bind(ADMIN_INVARIANT_LOCK_KEY)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let target: Option<User> = sqlx::query_as(
|
||||||
|
"SELECT id, username, password_hash, created_at, is_admin \
|
||||||
|
FROM users WHERE id = $1 FOR UPDATE",
|
||||||
|
)
|
||||||
|
.bind(target_id)
|
||||||
|
.fetch_optional(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
let Some(target) = target else {
|
||||||
|
return Err(AppError::NotFound);
|
||||||
|
};
|
||||||
|
|
||||||
|
// No-op: caller asked to set `is_admin` to its current value. Return
|
||||||
|
// the row as-is without writing an audit entry — otherwise repeated
|
||||||
|
// PATCH calls (browser retry, double-click) pile misleading
|
||||||
|
// "promote_user" rows in `admin_audit` for actions that changed
|
||||||
|
// nothing.
|
||||||
|
if target.is_admin == value {
|
||||||
|
tx.commit().await?;
|
||||||
|
return Ok(target);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Recount inside the lock — this is the authoritative read.
|
||||||
|
if target.is_admin && !value {
|
||||||
|
let admin_count: i64 =
|
||||||
|
sqlx::query_scalar("SELECT COUNT(*) FROM users WHERE is_admin = true")
|
||||||
|
.fetch_one(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
if admin_count <= 1 {
|
||||||
|
return Err(AppError::Conflict(
|
||||||
|
"cannot demote the last admin; promote another user first".into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let updated: User = sqlx::query_as(
|
||||||
|
"UPDATE users SET is_admin = $1 WHERE id = $2 \
|
||||||
|
RETURNING id, username, password_hash, created_at, is_admin",
|
||||||
|
)
|
||||||
|
.bind(value)
|
||||||
|
.bind(target_id)
|
||||||
|
.fetch_one(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let action = if value { "promote_user" } else { "demote_user" };
|
||||||
|
crate::repo::admin_audit::insert(
|
||||||
|
&mut *tx,
|
||||||
|
actor_id,
|
||||||
|
action,
|
||||||
|
"user",
|
||||||
|
Some(target_id),
|
||||||
|
serde_json::json!({ "username": target.username }),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
tx.commit().await?;
|
||||||
|
Ok(updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Delete a user with full safety checks: rejects self-delete, rejects
|
||||||
|
/// deleting the only remaining admin (under `ADMIN_INVARIANT_LOCK_KEY`),
|
||||||
|
/// and writes an `admin_audit` row in the same tx. Captures the deleted
|
||||||
|
/// username + admin status in the audit payload so the action is
|
||||||
|
/// readable after the user row itself is gone.
|
||||||
|
pub async fn admin_safe_delete(
|
||||||
|
pool: &PgPool,
|
||||||
|
actor_id: Uuid,
|
||||||
|
target_id: Uuid,
|
||||||
|
) -> AppResult<()> {
|
||||||
|
if actor_id == target_id {
|
||||||
|
return Err(AppError::Conflict(
|
||||||
|
"cannot delete yourself; ask another admin".into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut tx = pool.begin().await?;
|
||||||
|
sqlx::query("SELECT pg_advisory_xact_lock($1)")
|
||||||
|
.bind(ADMIN_INVARIANT_LOCK_KEY)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let target: Option<User> = sqlx::query_as(
|
||||||
|
"SELECT id, username, password_hash, created_at, is_admin \
|
||||||
|
FROM users WHERE id = $1 FOR UPDATE",
|
||||||
|
)
|
||||||
|
.bind(target_id)
|
||||||
|
.fetch_optional(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
let Some(target) = target else {
|
||||||
|
return Err(AppError::NotFound);
|
||||||
|
};
|
||||||
|
|
||||||
|
if target.is_admin {
|
||||||
|
let admin_count: i64 =
|
||||||
|
sqlx::query_scalar("SELECT COUNT(*) FROM users WHERE is_admin = true")
|
||||||
|
.fetch_one(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
if admin_count <= 1 {
|
||||||
|
return Err(AppError::Conflict(
|
||||||
|
"cannot delete the last admin; promote another user first".into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sqlx::query("DELETE FROM users WHERE id = $1")
|
||||||
|
.bind(target_id)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
crate::repo::admin_audit::insert(
|
||||||
|
&mut *tx,
|
||||||
|
actor_id,
|
||||||
|
"delete_user",
|
||||||
|
"user",
|
||||||
|
Some(target_id),
|
||||||
|
serde_json::json!({
|
||||||
|
"username": target.username,
|
||||||
|
"was_admin": target.is_admin,
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
tx.commit().await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Admin-initiated user creation. Wraps the INSERT + audit row in a
|
||||||
|
/// single transaction so a rolled-back create never leaves an orphan
|
||||||
|
/// audit entry. Caller (HTTP handler) is responsible for validating
|
||||||
|
/// `username`/`password` and hashing — this fn assumes both are
|
||||||
|
/// already vetted by the same `validate_*` rules used by self-
|
||||||
|
/// registration.
|
||||||
|
pub async fn admin_create_user(
|
||||||
|
pool: &PgPool,
|
||||||
|
actor_id: Uuid,
|
||||||
|
username: &str,
|
||||||
|
password_hash: &str,
|
||||||
|
is_admin: bool,
|
||||||
|
) -> AppResult<User> {
|
||||||
|
let mut tx = pool.begin().await?;
|
||||||
|
let user: User = match sqlx::query_as::<_, User>(
|
||||||
|
"INSERT INTO users (username, password_hash, is_admin) VALUES ($1, $2, $3) \
|
||||||
|
RETURNING id, username, password_hash, created_at, is_admin",
|
||||||
|
)
|
||||||
|
.bind(username)
|
||||||
|
.bind(password_hash)
|
||||||
|
.bind(is_admin)
|
||||||
|
.fetch_one(&mut *tx)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(u) => u,
|
||||||
|
Err(sqlx::Error::Database(ref db_err)) if db_err.is_unique_violation() => {
|
||||||
|
return Err(AppError::Conflict("username is already taken".into()));
|
||||||
|
}
|
||||||
|
Err(e) => return Err(AppError::Database(e)),
|
||||||
|
};
|
||||||
|
|
||||||
|
crate::repo::admin_audit::insert(
|
||||||
|
&mut *tx,
|
||||||
|
actor_id,
|
||||||
|
"create_user",
|
||||||
|
"user",
|
||||||
|
Some(user.id),
|
||||||
|
serde_json::json!({
|
||||||
|
"username": user.username,
|
||||||
|
"is_admin": user.is_admin,
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
tx.commit().await?;
|
||||||
|
Ok(user)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn bootstrap_admin(
|
||||||
|
pool: &PgPool,
|
||||||
|
username: &str,
|
||||||
|
password: &str,
|
||||||
|
) -> AppResult<()> {
|
||||||
|
let mut tx = pool.begin().await?;
|
||||||
|
let existing: Option<(Uuid,)> = sqlx::query_as(
|
||||||
|
"SELECT id FROM users WHERE lower(username) = lower($1) FOR UPDATE",
|
||||||
|
)
|
||||||
|
.bind(username)
|
||||||
|
.fetch_optional(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
match existing {
|
||||||
|
Some((id,)) => {
|
||||||
|
sqlx::query("UPDATE users SET is_admin = true WHERE id = $1 AND is_admin = false")
|
||||||
|
.bind(id)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
let hash = crate::auth::password::hash_password(password)?;
|
||||||
|
sqlx::query("INSERT INTO users (username, password_hash, is_admin) VALUES ($1, $2, true)")
|
||||||
|
.bind(username)
|
||||||
|
.bind(&hash)
|
||||||
|
.execute(&mut *tx)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tx.commit().await?;
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,6 +16,13 @@ impl LocalStorage {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn resolve(&self, key: &str) -> Result<PathBuf, StorageError> {
|
fn resolve(&self, key: &str) -> Result<PathBuf, StorageError> {
|
||||||
|
// NUL bytes are rejected by the Linux syscall layer, but the
|
||||||
|
// error surfaces as an opaque IO failure rather than the
|
||||||
|
// explicit `BadKey` the rest of the contract uses. Catch it
|
||||||
|
// here so the error path is consistent.
|
||||||
|
if key.contains('\0') {
|
||||||
|
return Err(StorageError::BadKey);
|
||||||
|
}
|
||||||
let key = key.trim_start_matches('/');
|
let key = key.trim_start_matches('/');
|
||||||
if key.is_empty() {
|
if key.is_empty() {
|
||||||
return Err(StorageError::BadKey);
|
return Err(StorageError::BadKey);
|
||||||
@@ -79,6 +86,10 @@ impl Storage for LocalStorage {
|
|||||||
let path: &Path = &self.resolve(key)?;
|
let path: &Path = &self.resolve(key)?;
|
||||||
Ok(fs::try_exists(path).await?)
|
Ok(fs::try_exists(path).await?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn local_root(&self) -> Option<&Path> {
|
||||||
|
Some(&self.root)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -114,6 +125,9 @@ mod tests {
|
|||||||
assert!(matches!(s.get(".").await, Err(StorageError::BadKey)));
|
assert!(matches!(s.get(".").await, Err(StorageError::BadKey)));
|
||||||
// Empty segment via doubled slash.
|
// Empty segment via doubled slash.
|
||||||
assert!(matches!(s.get("a//b").await, Err(StorageError::BadKey)));
|
assert!(matches!(s.get("a//b").await, Err(StorageError::BadKey)));
|
||||||
|
// NUL byte (rejected explicitly so callers see BadKey rather
|
||||||
|
// than an opaque IO error from the kernel).
|
||||||
|
assert!(matches!(s.put("a\0b", b"x").await, Err(StorageError::BadKey)));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
|
|||||||
@@ -9,6 +9,8 @@ mod local;
|
|||||||
use std::io;
|
use std::io;
|
||||||
use std::pin::Pin;
|
use std::pin::Pin;
|
||||||
|
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use futures_core::Stream;
|
use futures_core::Stream;
|
||||||
@@ -44,4 +46,13 @@ pub trait Storage: Send + Sync {
|
|||||||
async fn get_stream(&self, key: &str) -> Result<StreamingFile, StorageError>;
|
async fn get_stream(&self, key: &str) -> Result<StreamingFile, StorageError>;
|
||||||
async fn delete(&self, key: &str) -> Result<(), StorageError>;
|
async fn delete(&self, key: &str) -> Result<(), StorageError>;
|
||||||
async fn exists(&self, key: &str) -> Result<bool, StorageError>;
|
async fn exists(&self, key: &str) -> Result<bool, StorageError>;
|
||||||
|
|
||||||
|
/// Filesystem path the backend is rooted at, when introspectable.
|
||||||
|
/// Returns `None` for backends that aren't a local filesystem (e.g.
|
||||||
|
/// a future `S3Storage`). The admin system endpoint uses this to
|
||||||
|
/// statvfs the data dir; backends that return `None` get a `disk:
|
||||||
|
/// null` payload instead of fabricated numbers.
|
||||||
|
fn local_root(&self) -> Option<&Path> {
|
||||||
|
None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
344
backend/tests/api_admin_crawler.rs
Normal file
344
backend/tests/api_admin_crawler.rs
Normal file
@@ -0,0 +1,344 @@
|
|||||||
|
//! Integration tests for the admin crawler observability/control API.
|
||||||
|
//!
|
||||||
|
//! The default test harness wires `AppState.crawler = None` (no daemon),
|
||||||
|
//! so the *control* endpoints return 503 and the *read* endpoints that
|
||||||
|
//! work off the DB (status shell, dead-jobs list/requeue) still function.
|
||||||
|
//! This is exactly the production "daemon disabled" posture.
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use axum::http::StatusCode;
|
||||||
|
use axum::Router;
|
||||||
|
use http_body_util::BodyExt;
|
||||||
|
use serde_json::json;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tower::ServiceExt;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use common::{body_json, get, get_with_cookie, post_json_with_cookie, register_user, harness};
|
||||||
|
|
||||||
|
async fn seed_admin(pool: &PgPool, app: &Router) -> String {
|
||||||
|
let (username, cookie) = register_user(app).await;
|
||||||
|
let u = mangalord::repo::user::find_by_username(pool, &username)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
mangalord::repo::user::set_is_admin_unchecked(pool, u.id, true)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
cookie
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn seed_dead_job(pool: &PgPool, title: &str) -> Uuid {
|
||||||
|
let manga_id = Uuid::new_v4();
|
||||||
|
let chapter_id = Uuid::new_v4();
|
||||||
|
sqlx::query("INSERT INTO mangas (id, title) VALUES ($1, $2)")
|
||||||
|
.bind(manga_id)
|
||||||
|
.bind(title)
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
sqlx::query("INSERT INTO chapters (id, manga_id, number) VALUES ($1, $2, 1)")
|
||||||
|
.bind(chapter_id)
|
||||||
|
.bind(manga_id)
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let job_id = Uuid::new_v4();
|
||||||
|
sqlx::query(
|
||||||
|
"INSERT INTO crawler_jobs (id, payload, state, attempts, last_error) \
|
||||||
|
VALUES ($1, $2, 'dead', 5, 'boom')",
|
||||||
|
)
|
||||||
|
.bind(job_id)
|
||||||
|
.bind(json!({
|
||||||
|
"kind": "sync_chapter_content",
|
||||||
|
"source_id": "target",
|
||||||
|
"chapter_id": chapter_id,
|
||||||
|
"source_chapter_key": "k",
|
||||||
|
}))
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
job_id
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Seed a chapter-content job in a given state ('pending'/'running').
|
||||||
|
async fn seed_job(pool: &PgPool, title: &str, state: &str) {
|
||||||
|
let manga_id = Uuid::new_v4();
|
||||||
|
let chapter_id = Uuid::new_v4();
|
||||||
|
sqlx::query("INSERT INTO mangas (id, title) VALUES ($1, $2)")
|
||||||
|
.bind(manga_id)
|
||||||
|
.bind(title)
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
sqlx::query("INSERT INTO chapters (id, manga_id, number) VALUES ($1, $2, 1)")
|
||||||
|
.bind(chapter_id)
|
||||||
|
.bind(manga_id)
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
sqlx::query("INSERT INTO crawler_jobs (id, payload, state) VALUES ($1, $2, $3)")
|
||||||
|
.bind(Uuid::new_v4())
|
||||||
|
.bind(json!({
|
||||||
|
"kind": "sync_chapter_content",
|
||||||
|
"source_id": "target",
|
||||||
|
"chapter_id": chapter_id,
|
||||||
|
"source_chapter_key": "k",
|
||||||
|
}))
|
||||||
|
.bind(state)
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Seed a manga with no cover + a live source row (queued for cover fetch).
|
||||||
|
async fn seed_missing_cover(pool: &PgPool, title: &str) {
|
||||||
|
let manga_id = Uuid::new_v4();
|
||||||
|
sqlx::query("INSERT INTO mangas (id, title, cover_image_path) VALUES ($1, $2, NULL)")
|
||||||
|
.bind(manga_id)
|
||||||
|
.bind(title)
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
sqlx::query("INSERT INTO sources (id, name, base_url) VALUES ('target','T','http://x') ON CONFLICT DO NOTHING")
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
sqlx::query(
|
||||||
|
"INSERT INTO manga_sources (source_id, source_manga_key, manga_id, source_url) \
|
||||||
|
VALUES ('target', $1, $2, 'http://x/m')",
|
||||||
|
)
|
||||||
|
.bind(format!("k-{manga_id}"))
|
||||||
|
.bind(manga_id)
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn active_jobs_and_covers_lists_over_http(pool: PgPool) {
|
||||||
|
seed_job(&pool, "Naruto", "pending").await;
|
||||||
|
seed_job(&pool, "Bleach", "running").await;
|
||||||
|
seed_missing_cover(&pool, "One Piece").await;
|
||||||
|
let h = harness(pool.clone());
|
||||||
|
let cookie = seed_admin(&pool, &h.app).await;
|
||||||
|
|
||||||
|
// Queued/active chapters.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get_with_cookie("/api/v1/admin/crawler/active-jobs", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = body_json(resp).await;
|
||||||
|
assert_eq!(body["page"]["total"], 2);
|
||||||
|
|
||||||
|
// Queued covers.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get_with_cookie("/api/v1/admin/crawler/covers", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = body_json(resp).await;
|
||||||
|
assert_eq!(body["page"]["total"], 1);
|
||||||
|
assert_eq!(body["items"][0]["manga_title"], "One Piece");
|
||||||
|
|
||||||
|
// Both are admin-gated.
|
||||||
|
let (_u, plain) = register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get_with_cookie("/api/v1/admin/crawler/active-jobs", &plain))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn get_status_requires_admin(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
// Unauthenticated → 401.
|
||||||
|
let resp = h.app.clone().oneshot(get("/api/v1/admin/crawler")).await.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
|
||||||
|
|
||||||
|
// Authenticated non-admin → 403.
|
||||||
|
let (_u, cookie) = register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get_with_cookie("/api/v1/admin/crawler", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn get_status_reports_disabled_daemon_with_queue_counts(pool: PgPool) {
|
||||||
|
seed_dead_job(&pool, "Naruto").await;
|
||||||
|
let h = harness(pool.clone());
|
||||||
|
let cookie = seed_admin(&pool, &h.app).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get_with_cookie("/api/v1/admin/crawler", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = body_json(resp).await;
|
||||||
|
assert_eq!(body["daemon"], "disabled");
|
||||||
|
assert_eq!(body["queue"]["dead"], 1);
|
||||||
|
assert_eq!(body["browser"], "down");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn control_endpoints_return_503_when_daemon_disabled(pool: PgPool) {
|
||||||
|
let h = harness(pool.clone());
|
||||||
|
let cookie = seed_admin(&pool, &h.app).await;
|
||||||
|
for uri in [
|
||||||
|
"/api/v1/admin/crawler/run",
|
||||||
|
"/api/v1/admin/crawler/browser/restart",
|
||||||
|
"/api/v1/admin/crawler/session/clear-expired",
|
||||||
|
] {
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(post_json_with_cookie(uri, json!({}), &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
resp.status(),
|
||||||
|
StatusCode::SERVICE_UNAVAILABLE,
|
||||||
|
"{uri} should be 503 when daemon disabled"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn status_stream_requires_admin(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
// Unauthenticated → 401.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get("/api/v1/admin/crawler/stream"))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
|
||||||
|
// Non-admin → 403.
|
||||||
|
let (_u, cookie) = register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get_with_cookie("/api/v1/admin/crawler/stream", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn status_stream_emits_initial_event(pool: PgPool) {
|
||||||
|
let h = harness(pool.clone());
|
||||||
|
let cookie = seed_admin(&pool, &h.app).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get_with_cookie("/api/v1/admin/crawler/stream", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let ct = resp
|
||||||
|
.headers()
|
||||||
|
.get(axum::http::header::CONTENT_TYPE)
|
||||||
|
.and_then(|v| v.to_str().ok())
|
||||||
|
.unwrap_or_default()
|
||||||
|
.to_string();
|
||||||
|
assert!(ct.starts_with("text/event-stream"), "content-type was {ct:?}");
|
||||||
|
|
||||||
|
// Accumulate frames (the immediate snapshot may arrive split across
|
||||||
|
// frames) until the status payload appears, with an overall timeout so
|
||||||
|
// the never-ending stream can't hang the test.
|
||||||
|
let mut body = resp.into_body();
|
||||||
|
let mut acc = String::new();
|
||||||
|
let deadline = tokio::time::timeout(Duration::from_secs(5), async {
|
||||||
|
loop {
|
||||||
|
let Some(frame) = body.frame().await else { break };
|
||||||
|
if let Ok(data) = frame.expect("frame ok").into_data() {
|
||||||
|
acc.push_str(&String::from_utf8_lossy(&data));
|
||||||
|
if acc.contains("\"daemon\"") {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
assert!(deadline.is_ok(), "did not receive status within 5s; got: {acc:?}");
|
||||||
|
assert!(acc.contains("\"daemon\""), "missing status payload: {acc}");
|
||||||
|
assert!(acc.contains("status"), "missing SSE event name: {acc}");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn mutating_endpoints_reject_non_admin(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
// A logged-in non-admin must be forbidden from a mutating endpoint.
|
||||||
|
let (_u, cookie) = register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(post_json_with_cookie(
|
||||||
|
"/api/v1/admin/crawler/dead-jobs/requeue",
|
||||||
|
json!({ "scope": "all" }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn dead_jobs_list_and_requeue_over_http(pool: PgPool) {
|
||||||
|
let job_id = seed_dead_job(&pool, "Bleach").await;
|
||||||
|
let h = harness(pool.clone());
|
||||||
|
let cookie = seed_admin(&pool, &h.app).await;
|
||||||
|
|
||||||
|
// List.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get_with_cookie("/api/v1/admin/crawler/dead-jobs", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = body_json(resp).await;
|
||||||
|
assert_eq!(body["page"]["total"], 1);
|
||||||
|
assert_eq!(body["items"][0]["manga_title"], "Bleach");
|
||||||
|
|
||||||
|
// Requeue the single job.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(post_json_with_cookie(
|
||||||
|
"/api/v1/admin/crawler/dead-jobs/requeue",
|
||||||
|
json!({ "scope": "job", "job_id": job_id }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = body_json(resp).await;
|
||||||
|
assert_eq!(body["requeued"], 1);
|
||||||
|
|
||||||
|
let state: String = sqlx::query_scalar("SELECT state FROM crawler_jobs WHERE id = $1")
|
||||||
|
.bind(job_id)
|
||||||
|
.fetch_one(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(state, "pending");
|
||||||
|
}
|
||||||
548
backend/tests/api_admin_mangas.rs
Normal file
548
backend/tests/api_admin_mangas.rs
Normal file
@@ -0,0 +1,548 @@
|
|||||||
|
//! PR 3 (feat/admin-mangas-api) integration tests.
|
||||||
|
//!
|
||||||
|
//! Per-variant fixture tests for the derived sync-state SQL plus
|
||||||
|
//! happy-path E2E for the two admin endpoints. Auth-gate regression
|
||||||
|
//! (403/401) is covered by PR 1's `RequireAdmin` test matrix; the only
|
||||||
|
//! gate test here is one spot check per endpoint.
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
|
use axum::http::StatusCode;
|
||||||
|
use axum::Router;
|
||||||
|
use serde_json::json;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tower::ServiceExt;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use mangalord::repo;
|
||||||
|
|
||||||
|
const SOURCE_ID: &str = "test-source";
|
||||||
|
|
||||||
|
async fn seed_admin(pool: &PgPool, app: &Router) -> (String, String) {
|
||||||
|
let (username, cookie) = common::register_user(app).await;
|
||||||
|
let u = repo::user::find_by_username(pool, &username)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
repo::user::set_is_admin_unchecked(pool, u.id, true).await.unwrap();
|
||||||
|
(username, cookie)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn seed_source(pool: &PgPool) {
|
||||||
|
repo::crawler::ensure_source(pool, SOURCE_ID, "Test", "https://example.test")
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn insert_manga(pool: &PgPool, title: &str) -> Uuid {
|
||||||
|
let (id,): (Uuid,) = sqlx::query_as(
|
||||||
|
"INSERT INTO mangas (title, status, alt_titles) VALUES ($1, 'ongoing', ARRAY[]::text[]) RETURNING id",
|
||||||
|
)
|
||||||
|
.bind(title)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
id
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn insert_manga_source(
|
||||||
|
pool: &PgPool,
|
||||||
|
manga_id: Uuid,
|
||||||
|
source_manga_key: &str,
|
||||||
|
dropped: bool,
|
||||||
|
) {
|
||||||
|
let dropped_at = if dropped { "now()" } else { "NULL" };
|
||||||
|
let sql = format!(
|
||||||
|
"INSERT INTO manga_sources (source_id, source_manga_key, manga_id, source_url, dropped_at) \
|
||||||
|
VALUES ($1, $2, $3, 'https://example.test/m', {dropped_at})"
|
||||||
|
);
|
||||||
|
sqlx::query(&sql)
|
||||||
|
.bind(SOURCE_ID)
|
||||||
|
.bind(source_manga_key)
|
||||||
|
.bind(manga_id)
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn insert_chapter(pool: &PgPool, manga_id: Uuid, number: i32, page_count: i32) -> Uuid {
|
||||||
|
let (id,): (Uuid,) = sqlx::query_as(
|
||||||
|
"INSERT INTO chapters (manga_id, number, title, page_count) VALUES ($1, $2, NULL, $3) RETURNING id",
|
||||||
|
)
|
||||||
|
.bind(manga_id)
|
||||||
|
.bind(number)
|
||||||
|
.bind(page_count)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
id
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn insert_chapter_source(
|
||||||
|
pool: &PgPool,
|
||||||
|
chapter_id: Uuid,
|
||||||
|
source_chapter_key: &str,
|
||||||
|
dropped: bool,
|
||||||
|
) {
|
||||||
|
let dropped_at = if dropped { "now()" } else { "NULL" };
|
||||||
|
let sql = format!(
|
||||||
|
"INSERT INTO chapter_sources (source_id, source_chapter_key, chapter_id, source_url, dropped_at) \
|
||||||
|
VALUES ($1, $2, $3, 'https://example.test/c', {dropped_at})"
|
||||||
|
);
|
||||||
|
sqlx::query(&sql)
|
||||||
|
.bind(SOURCE_ID)
|
||||||
|
.bind(source_chapter_key)
|
||||||
|
.bind(chapter_id)
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn insert_job(pool: &PgPool, payload: serde_json::Value, state: &str) {
|
||||||
|
sqlx::query("INSERT INTO crawler_jobs (payload, state) VALUES ($1, $2)")
|
||||||
|
.bind(payload)
|
||||||
|
.bind(state)
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Per-variant tests don't care about pagination — fetch the whole
|
||||||
|
/// chapter set (up to the hard cap) and discard the total.
|
||||||
|
async fn fetch_chapter_rows(
|
||||||
|
pool: &PgPool,
|
||||||
|
manga_id: Uuid,
|
||||||
|
) -> Vec<mangalord::repo::admin_view::AdminChapterRow> {
|
||||||
|
let (rows, _) = repo::admin_view::list_chapters_with_sync_state(
|
||||||
|
pool,
|
||||||
|
&repo::admin_view::ListAdminChaptersQuery {
|
||||||
|
manga_id,
|
||||||
|
limit: 500,
|
||||||
|
offset: 0,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
rows
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- manga sync state ------------------------------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn manga_state_synced_for_fresh_source(pool: PgPool) {
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "Synced Manga").await;
|
||||||
|
insert_manga_source(&pool, m, "smk-1", false).await;
|
||||||
|
|
||||||
|
let (rows, total) = repo::admin_view::list_mangas_with_sync_state(
|
||||||
|
&pool,
|
||||||
|
&repo::admin_view::ListAdminMangasQuery {
|
||||||
|
limit: 50,
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(total, 1);
|
||||||
|
assert_eq!(rows[0].id, m);
|
||||||
|
assert_eq!(rows[0].sync_state, mangalord::domain::MangaSyncState::Synced);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn manga_state_synced_for_user_upload_without_sources(pool: PgPool) {
|
||||||
|
let m = insert_manga(&pool, "User Upload").await;
|
||||||
|
let (rows, _) = repo::admin_view::list_mangas_with_sync_state(
|
||||||
|
&pool,
|
||||||
|
&repo::admin_view::ListAdminMangasQuery {
|
||||||
|
limit: 50,
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(rows[0].id, m);
|
||||||
|
assert_eq!(rows[0].sync_state, mangalord::domain::MangaSyncState::Synced);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn manga_state_dropped_when_all_sources_dropped(pool: PgPool) {
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "Dropped Manga").await;
|
||||||
|
insert_manga_source(&pool, m, "smk-1", true).await;
|
||||||
|
|
||||||
|
let (rows, _) = repo::admin_view::list_mangas_with_sync_state(
|
||||||
|
&pool,
|
||||||
|
&repo::admin_view::ListAdminMangasQuery {
|
||||||
|
limit: 50,
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(rows[0].id, m);
|
||||||
|
assert_eq!(rows[0].sync_state, mangalord::domain::MangaSyncState::Dropped);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn manga_state_in_progress_via_sync_chapter_list_job(pool: PgPool) {
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "Syncing Manga").await;
|
||||||
|
insert_manga_source(&pool, m, "smk-1", false).await;
|
||||||
|
// sync_chapter_list payload carries manga_id directly.
|
||||||
|
insert_job(
|
||||||
|
&pool,
|
||||||
|
json!({
|
||||||
|
"kind": "sync_chapter_list",
|
||||||
|
"source_id": SOURCE_ID,
|
||||||
|
"manga_id": m.to_string(),
|
||||||
|
"source_manga_key": "smk-1",
|
||||||
|
}),
|
||||||
|
"pending",
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let (rows, _) = repo::admin_view::list_mangas_with_sync_state(
|
||||||
|
&pool,
|
||||||
|
&repo::admin_view::ListAdminMangasQuery {
|
||||||
|
limit: 50,
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(rows[0].sync_state, mangalord::domain::MangaSyncState::InProgress);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn manga_state_in_progress_via_sync_manga_job(pool: PgPool) {
|
||||||
|
// The trickier branch: sync_manga payload is keyed by
|
||||||
|
// source_manga_key, NOT manga_id — must join through manga_sources.
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "Metadata-Refreshing Manga").await;
|
||||||
|
insert_manga_source(&pool, m, "smk-key-42", false).await;
|
||||||
|
insert_job(
|
||||||
|
&pool,
|
||||||
|
json!({
|
||||||
|
"kind": "sync_manga",
|
||||||
|
"source_id": SOURCE_ID,
|
||||||
|
"source_manga_key": "smk-key-42",
|
||||||
|
}),
|
||||||
|
"running",
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let (rows, _) = repo::admin_view::list_mangas_with_sync_state(
|
||||||
|
&pool,
|
||||||
|
&repo::admin_view::ListAdminMangasQuery {
|
||||||
|
limit: 50,
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(rows[0].sync_state, mangalord::domain::MangaSyncState::InProgress);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn manga_list_filters_by_sync_state(pool: PgPool) {
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m_synced = insert_manga(&pool, "AAA Synced").await;
|
||||||
|
insert_manga_source(&pool, m_synced, "smk-a", false).await;
|
||||||
|
let m_dropped = insert_manga(&pool, "BBB Dropped").await;
|
||||||
|
insert_manga_source(&pool, m_dropped, "smk-b", true).await;
|
||||||
|
|
||||||
|
let (rows, total) = repo::admin_view::list_mangas_with_sync_state(
|
||||||
|
&pool,
|
||||||
|
&repo::admin_view::ListAdminMangasQuery {
|
||||||
|
sync_state: Some(mangalord::domain::MangaSyncState::Dropped),
|
||||||
|
limit: 50,
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(total, 1);
|
||||||
|
assert_eq!(rows.len(), 1);
|
||||||
|
assert_eq!(rows[0].id, m_dropped);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- chapter sync state ----------------------------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn chapter_state_synced_when_pages_present(pool: PgPool) {
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "M").await;
|
||||||
|
insert_manga_source(&pool, m, "smk", false).await;
|
||||||
|
let c = insert_chapter(&pool, m, 1, 12).await;
|
||||||
|
insert_chapter_source(&pool, c, "ckey-1", false).await;
|
||||||
|
|
||||||
|
let rows = fetch_chapter_rows(&pool, m).await;
|
||||||
|
assert_eq!(rows.len(), 1);
|
||||||
|
assert_eq!(rows[0].id, c);
|
||||||
|
assert_eq!(rows[0].sync_state, mangalord::domain::ChapterSyncState::Synced);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn chapter_state_not_downloaded_when_page_count_zero(pool: PgPool) {
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "M").await;
|
||||||
|
let c = insert_chapter(&pool, m, 1, 0).await;
|
||||||
|
insert_chapter_source(&pool, c, "ckey-1", false).await;
|
||||||
|
|
||||||
|
let rows = fetch_chapter_rows(&pool, m).await;
|
||||||
|
assert_eq!(
|
||||||
|
rows[0].sync_state,
|
||||||
|
mangalord::domain::ChapterSyncState::NotDownloaded
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn chapter_state_downloading_when_job_in_flight(pool: PgPool) {
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "M").await;
|
||||||
|
let c = insert_chapter(&pool, m, 1, 0).await;
|
||||||
|
insert_chapter_source(&pool, c, "ckey-1", false).await;
|
||||||
|
insert_job(
|
||||||
|
&pool,
|
||||||
|
json!({
|
||||||
|
"kind": "sync_chapter_content",
|
||||||
|
"source_id": SOURCE_ID,
|
||||||
|
"chapter_id": c.to_string(),
|
||||||
|
"source_chapter_key": "ckey-1",
|
||||||
|
}),
|
||||||
|
"running",
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let rows = fetch_chapter_rows(&pool, m).await;
|
||||||
|
assert_eq!(
|
||||||
|
rows[0].sync_state,
|
||||||
|
mangalord::domain::ChapterSyncState::Downloading
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn chapter_state_dropped_when_all_sources_dropped(pool: PgPool) {
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "M").await;
|
||||||
|
let c = insert_chapter(&pool, m, 1, 0).await;
|
||||||
|
insert_chapter_source(&pool, c, "ckey-1", true).await;
|
||||||
|
|
||||||
|
let rows = fetch_chapter_rows(&pool, m).await;
|
||||||
|
assert_eq!(
|
||||||
|
rows[0].sync_state,
|
||||||
|
mangalord::domain::ChapterSyncState::Dropped
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn chapter_state_failed_when_most_recent_job_dead(pool: PgPool) {
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "M").await;
|
||||||
|
let c = insert_chapter(&pool, m, 1, 0).await;
|
||||||
|
insert_chapter_source(&pool, c, "ckey-1", false).await;
|
||||||
|
insert_job(
|
||||||
|
&pool,
|
||||||
|
json!({
|
||||||
|
"kind": "sync_chapter_content",
|
||||||
|
"source_id": SOURCE_ID,
|
||||||
|
"chapter_id": c.to_string(),
|
||||||
|
"source_chapter_key": "ckey-1",
|
||||||
|
}),
|
||||||
|
"dead",
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let rows = fetch_chapter_rows(&pool, m).await;
|
||||||
|
assert_eq!(
|
||||||
|
rows[0].sync_state,
|
||||||
|
mangalord::domain::ChapterSyncState::Failed
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- HTTP-level happy-path + gate ------------------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn http_list_mangas_returns_paged_with_state(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_admin, cookie) = seed_admin(&pool, &h.app).await;
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "Hello").await;
|
||||||
|
insert_manga_source(&pool, m, "smk", false).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
"/api/v1/admin/mangas?limit=50",
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let items = body["items"].as_array().unwrap();
|
||||||
|
assert_eq!(items.len(), 1);
|
||||||
|
assert_eq!(items[0]["id"], m.to_string());
|
||||||
|
assert_eq!(items[0]["sync_state"], "synced");
|
||||||
|
assert_eq!(items[0]["chapter_count"], 0);
|
||||||
|
assert_eq!(body["page"]["total"], 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn http_list_mangas_rejects_unknown_sync_state(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_admin, cookie) = seed_admin(&pool, &h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
"/api/v1/admin/mangas?sync_state=bogus",
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::BAD_REQUEST);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn http_list_chapters_returns_per_chapter_state(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_admin, cookie) = seed_admin(&pool, &h.app).await;
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "M").await;
|
||||||
|
let c1 = insert_chapter(&pool, m, 1, 12).await;
|
||||||
|
let c2 = insert_chapter(&pool, m, 2, 0).await;
|
||||||
|
insert_chapter_source(&pool, c1, "ck1", false).await;
|
||||||
|
insert_chapter_source(&pool, c2, "ck2", false).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
&format!("/api/v1/admin/mangas/{m}/chapters"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let items = body["items"].as_array().unwrap();
|
||||||
|
assert_eq!(items.len(), 2);
|
||||||
|
assert_eq!(items[0]["id"], c1.to_string());
|
||||||
|
assert_eq!(items[0]["sync_state"], "synced");
|
||||||
|
assert_eq!(items[1]["id"], c2.to_string());
|
||||||
|
assert_eq!(items[1]["sync_state"], "not_downloaded");
|
||||||
|
assert_eq!(body["page"]["total"], 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn http_list_chapters_caps_limit_at_500(pool: PgPool) {
|
||||||
|
// The handler clamps limit to [1, 500] so a long-runner with
|
||||||
|
// thousands of chapters can't be turned into a request-stall by an
|
||||||
|
// admin (or by a curious admin tab) just clicking expand.
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_admin, cookie) = seed_admin(&pool, &h.app).await;
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "M").await;
|
||||||
|
for n in 1..=3 {
|
||||||
|
let _c = insert_chapter(&pool, m, n, 0).await;
|
||||||
|
}
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
&format!("/api/v1/admin/mangas/{m}/chapters?limit=999"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["page"]["limit"], 500, "limit must clamp to 500");
|
||||||
|
assert_eq!(body["items"].as_array().unwrap().len(), 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn http_list_chapters_paginates(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_admin, cookie) = seed_admin(&pool, &h.app).await;
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "M").await;
|
||||||
|
for n in 1..=5 {
|
||||||
|
let _c = insert_chapter(&pool, m, n, 0).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
&format!("/api/v1/admin/mangas/{m}/chapters?limit=2&offset=2"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let items = body["items"].as_array().unwrap();
|
||||||
|
assert_eq!(items.len(), 2);
|
||||||
|
// Ordered by chapter number ascending; offset=2 skips chapters 1 & 2.
|
||||||
|
assert_eq!(items[0]["number"], 3);
|
||||||
|
assert_eq!(items[1]["number"], 4);
|
||||||
|
assert_eq!(body["page"]["total"], 5);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn http_list_chapters_returns_404_for_unknown_manga(pool: PgPool) {
|
||||||
|
// Regression: used to return 200 [] for a non-existent manga,
|
||||||
|
// which silently rendered "No chapters." for a typo'd / deleted id.
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_admin, cookie) = seed_admin(&pool, &h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
&format!("/api/v1/admin/mangas/{}/chapters", Uuid::new_v4()),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn chapter_state_synced_when_pages_present_even_with_dead_job(pool: PgPool) {
|
||||||
|
// Regression: the old CASE prioritised the dead-job branch above
|
||||||
|
// the page_count check, so a chapter with pages on disk AND a
|
||||||
|
// historical dead job (e.g. from a re-download attempt that
|
||||||
|
// crashed) flipped to Failed — contradicting Synced's "downloaded
|
||||||
|
// at some point" contract.
|
||||||
|
seed_source(&pool).await;
|
||||||
|
let m = insert_manga(&pool, "M").await;
|
||||||
|
let c = insert_chapter(&pool, m, 1, 12).await; // pages present
|
||||||
|
insert_chapter_source(&pool, c, "ckey-1", false).await;
|
||||||
|
insert_job(
|
||||||
|
&pool,
|
||||||
|
json!({
|
||||||
|
"kind": "sync_chapter_content",
|
||||||
|
"source_id": SOURCE_ID,
|
||||||
|
"chapter_id": c.to_string(),
|
||||||
|
"source_chapter_key": "ckey-1",
|
||||||
|
}),
|
||||||
|
"dead",
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let rows = fetch_chapter_rows(&pool, m).await;
|
||||||
|
assert_eq!(
|
||||||
|
rows[0].sync_state,
|
||||||
|
mangalord::domain::ChapterSyncState::Synced,
|
||||||
|
"pages on disk override historical dead-job noise"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn http_list_mangas_requires_admin(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_u, cookie) = common::register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/admin/mangas", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
350
backend/tests/api_admin_resync.rs
Normal file
350
backend/tests/api_admin_resync.rs
Normal file
@@ -0,0 +1,350 @@
|
|||||||
|
//! Integration tests for the admin force-resync endpoints.
|
||||||
|
//!
|
||||||
|
//! Real resync work requires Chromium, so these tests swap in a stub
|
||||||
|
//! [`ResyncService`] to assert the handler-level contract: routing,
|
||||||
|
//! admin gate, 503 when the daemon is disabled, 404 / 422 mapping for
|
||||||
|
//! missing-resource / no-source cases, and the audit-log side effect.
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||||
|
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use axum::http::StatusCode;
|
||||||
|
use serde_json::json;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tower::ServiceExt;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use mangalord::crawler::resync::{
|
||||||
|
ChapterResyncOutcome, MangaResyncOutcome, ResyncError, ResyncService,
|
||||||
|
};
|
||||||
|
use mangalord::repo;
|
||||||
|
use mangalord::repo::crawler::UpsertStatus;
|
||||||
|
|
||||||
|
/// Test double for the resync service: counts how often each entry point
/// is invoked and answers with a fixed, pre-baked result.
struct StubResync {
    /// Incremented on every `resync_manga` call.
    manga_calls: AtomicUsize,
    /// Incremented on every `resync_chapter` call.
    chapter_calls: AtomicUsize,
    /// If set, calls fail with NoMangaSource / NoChapterSource instead of
    /// returning a successful outcome.
    no_source: bool,
}
|
||||||
|
|
||||||
|
impl StubResync {
|
||||||
|
fn new() -> Arc<Self> {
|
||||||
|
Arc::new(Self {
|
||||||
|
manga_calls: AtomicUsize::new(0),
|
||||||
|
chapter_calls: AtomicUsize::new(0),
|
||||||
|
no_source: false,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
fn no_source() -> Arc<Self> {
|
||||||
|
Arc::new(Self {
|
||||||
|
manga_calls: AtomicUsize::new(0),
|
||||||
|
chapter_calls: AtomicUsize::new(0),
|
||||||
|
no_source: true,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl ResyncService for StubResync {
|
||||||
|
async fn resync_manga(&self, manga_id: Uuid) -> anyhow::Result<MangaResyncOutcome> {
|
||||||
|
self.manga_calls.fetch_add(1, Ordering::SeqCst);
|
||||||
|
if self.no_source {
|
||||||
|
return Err(ResyncError::NoMangaSource.into());
|
||||||
|
}
|
||||||
|
Ok(MangaResyncOutcome {
|
||||||
|
manga_id,
|
||||||
|
metadata_status: UpsertStatus::Updated,
|
||||||
|
cover_fetched: true,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
async fn resync_chapter(&self, chapter_id: Uuid) -> anyhow::Result<ChapterResyncOutcome> {
|
||||||
|
self.chapter_calls.fetch_add(1, Ordering::SeqCst);
|
||||||
|
if self.no_source {
|
||||||
|
return Err(ResyncError::NoChapterSource.into());
|
||||||
|
}
|
||||||
|
Ok(ChapterResyncOutcome::Fetched {
|
||||||
|
chapter_id,
|
||||||
|
pages: 7,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn promote_admin(pool: &PgPool, username: &str) {
|
||||||
|
let u = repo::user::find_by_username(pool, username)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
repo::user::set_is_admin_unchecked(pool, u.id, true)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn insert_manga(pool: &PgPool, title: &str) -> Uuid {
|
||||||
|
let (id,): (Uuid,) = sqlx::query_as(
|
||||||
|
"INSERT INTO mangas (title, status, alt_titles) VALUES ($1, 'ongoing', ARRAY[]::text[]) RETURNING id",
|
||||||
|
)
|
||||||
|
.bind(title)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
id
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn insert_chapter(pool: &PgPool, manga_id: Uuid, number: i32, pages: i32) -> Uuid {
|
||||||
|
let (id,): (Uuid,) = sqlx::query_as(
|
||||||
|
"INSERT INTO chapters (manga_id, number, title, page_count) VALUES ($1, $2, NULL, $3) RETURNING id",
|
||||||
|
)
|
||||||
|
.bind(manga_id)
|
||||||
|
.bind(number)
|
||||||
|
.bind(pages)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
id
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----- manga resync ---------------------------------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn manga_resync_calls_service_and_returns_refreshed_detail(pool: PgPool) {
|
||||||
|
let stub = StubResync::new();
|
||||||
|
let h = common::harness_with_resync(pool.clone(), stub.clone());
|
||||||
|
let (username, cookie) = common::register_user(&h.app).await;
|
||||||
|
promote_admin(&pool, &username).await;
|
||||||
|
let manga_id = insert_manga(&pool, "Hello").await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/mangas/{manga_id}/resync"),
|
||||||
|
json!({}),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
// Stub returned Updated + cover_fetched=true.
|
||||||
|
assert_eq!(body["metadata_status"], "updated");
|
||||||
|
assert_eq!(body["cover_fetched"], true);
|
||||||
|
// Response includes the refreshed manga detail.
|
||||||
|
assert_eq!(body["manga"]["id"], manga_id.to_string());
|
||||||
|
assert_eq!(body["manga"]["title"], "Hello");
|
||||||
|
|
||||||
|
assert_eq!(stub.manga_calls.load(Ordering::SeqCst), 1);
|
||||||
|
|
||||||
|
// Audit row written.
|
||||||
|
let (audit_count,): (i64,) =
|
||||||
|
sqlx::query_as("SELECT count(*) FROM admin_audit WHERE action = 'manga_resync' AND target_id = $1")
|
||||||
|
.bind(manga_id)
|
||||||
|
.fetch_one(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(audit_count, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn manga_resync_returns_404_for_unknown_id(pool: PgPool) {
|
||||||
|
let stub = StubResync::new();
|
||||||
|
let h = common::harness_with_resync(pool.clone(), stub.clone());
|
||||||
|
let (username, cookie) = common::register_user(&h.app).await;
|
||||||
|
promote_admin(&pool, &username).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/mangas/{}/resync", Uuid::new_v4()),
|
||||||
|
json!({}),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
|
// Service must not have been called when the manga doesn't exist.
|
||||||
|
assert_eq!(stub.manga_calls.load(Ordering::SeqCst), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn manga_resync_maps_no_source_to_422(pool: PgPool) {
|
||||||
|
let stub = StubResync::no_source();
|
||||||
|
let h = common::harness_with_resync(pool.clone(), stub);
|
||||||
|
let (username, cookie) = common::register_user(&h.app).await;
|
||||||
|
promote_admin(&pool, &username).await;
|
||||||
|
let manga_id = insert_manga(&pool, "Manual upload, no crawler source").await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/mangas/{manga_id}/resync"),
|
||||||
|
json!({}),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNPROCESSABLE_ENTITY);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["details"]["manga"], "no_source");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn manga_resync_returns_503_when_daemon_disabled(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (username, cookie) = common::register_user(&h.app).await;
|
||||||
|
promote_admin(&pool, &username).await;
|
||||||
|
let manga_id = insert_manga(&pool, "Z").await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/mangas/{manga_id}/resync"),
|
||||||
|
json!({}),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "service_unavailable");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn manga_resync_requires_admin(pool: PgPool) {
|
||||||
|
let stub = StubResync::new();
|
||||||
|
let h = common::harness_with_resync(pool.clone(), stub);
|
||||||
|
// Non-admin user.
|
||||||
|
let (_u, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = insert_manga(&pool, "M").await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/mangas/{manga_id}/resync"),
|
||||||
|
json!({}),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----- chapter resync -------------------------------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn chapter_resync_calls_service_and_returns_refreshed_chapter(pool: PgPool) {
|
||||||
|
let stub = StubResync::new();
|
||||||
|
let h = common::harness_with_resync(pool.clone(), stub.clone());
|
||||||
|
let (username, cookie) = common::register_user(&h.app).await;
|
||||||
|
promote_admin(&pool, &username).await;
|
||||||
|
let manga_id = insert_manga(&pool, "M").await;
|
||||||
|
let chapter_id = insert_chapter(&pool, manga_id, 1, 0).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/chapters/{chapter_id}/resync"),
|
||||||
|
json!({}),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["outcome"], "fetched");
|
||||||
|
assert_eq!(body["pages"], 7);
|
||||||
|
assert_eq!(body["chapter"]["id"], chapter_id.to_string());
|
||||||
|
assert_eq!(stub.chapter_calls.load(Ordering::SeqCst), 1);
|
||||||
|
|
||||||
|
let (audit_count,): (i64,) = sqlx::query_as(
|
||||||
|
"SELECT count(*) FROM admin_audit WHERE action = 'chapter_resync' AND target_id = $1",
|
||||||
|
)
|
||||||
|
.bind(chapter_id)
|
||||||
|
.fetch_one(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(audit_count, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn chapter_resync_returns_404_for_unknown_id(pool: PgPool) {
|
||||||
|
let stub = StubResync::new();
|
||||||
|
let h = common::harness_with_resync(pool.clone(), stub.clone());
|
||||||
|
let (username, cookie) = common::register_user(&h.app).await;
|
||||||
|
promote_admin(&pool, &username).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/chapters/{}/resync", Uuid::new_v4()),
|
||||||
|
json!({}),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
|
assert_eq!(stub.chapter_calls.load(Ordering::SeqCst), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn chapter_resync_maps_no_source_to_422(pool: PgPool) {
|
||||||
|
let stub = StubResync::no_source();
|
||||||
|
let h = common::harness_with_resync(pool.clone(), stub);
|
||||||
|
let (username, cookie) = common::register_user(&h.app).await;
|
||||||
|
promote_admin(&pool, &username).await;
|
||||||
|
let manga_id = insert_manga(&pool, "M").await;
|
||||||
|
let chapter_id = insert_chapter(&pool, manga_id, 1, 0).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/chapters/{chapter_id}/resync"),
|
||||||
|
json!({}),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNPROCESSABLE_ENTITY);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["details"]["chapter"], "no_source");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn chapter_resync_returns_503_when_daemon_disabled(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (username, cookie) = common::register_user(&h.app).await;
|
||||||
|
promote_admin(&pool, &username).await;
|
||||||
|
let manga_id = insert_manga(&pool, "M").await;
|
||||||
|
let chapter_id = insert_chapter(&pool, manga_id, 1, 0).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/chapters/{chapter_id}/resync"),
|
||||||
|
json!({}),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn chapter_resync_requires_admin(pool: PgPool) {
|
||||||
|
let stub = StubResync::new();
|
||||||
|
let h = common::harness_with_resync(pool.clone(), stub);
|
||||||
|
let (_u, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = insert_manga(&pool, "M").await;
|
||||||
|
let chapter_id = insert_chapter(&pool, manga_id, 1, 0).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/chapters/{chapter_id}/resync"),
|
||||||
|
json!({}),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
259
backend/tests/api_admin_role.rs
Normal file
259
backend/tests/api_admin_role.rs
Normal file
@@ -0,0 +1,259 @@
|
|||||||
|
//! PR 1 (feat/admin-role) integration tests.
|
||||||
|
//!
|
||||||
|
//! Covers: `bootstrap_admin` semantics, `is_admin` exposed on /auth/me,
|
||||||
|
//! and the `RequireAdmin` extractor's 401/403/200 matrix — including the
|
||||||
|
//! load-bearing decision that Bearer-authed callers can NEVER reach an
|
||||||
|
//! admin-guarded route, even when the underlying user IS admin.
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use axum::http::StatusCode;
|
||||||
|
use axum::routing::get;
|
||||||
|
use axum::{Json, Router};
|
||||||
|
use serde_json::json;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tempfile::TempDir;
|
||||||
|
use tower::ServiceExt;
|
||||||
|
|
||||||
|
use mangalord::api;
|
||||||
|
use mangalord::app::AppState;
|
||||||
|
use mangalord::auth::extractor::RequireAdmin;
|
||||||
|
use mangalord::auth::rate_limit::AuthRateLimiter;
|
||||||
|
use mangalord::config::{AuthConfig, UploadConfig};
|
||||||
|
use mangalord::repo;
|
||||||
|
use mangalord::storage::{LocalStorage, Storage};
|
||||||
|
|
||||||
|
/// Test-only handler guarded by `RequireAdmin`. Lets the test suite assert
|
||||||
|
/// the extractor's behaviour end-to-end without depending on an admin
|
||||||
|
/// endpoint existing yet (those land in PR 2+).
|
||||||
|
async fn admin_only_handler(RequireAdmin(user): RequireAdmin) -> Json<serde_json::Value> {
|
||||||
|
Json(json!({ "username": user.username, "is_admin": user.is_admin }))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build a router that exposes the production /api/v1/* AND a test-only
|
||||||
|
/// `/_test/admin_only` route guarded by `RequireAdmin`. Pool is consumed;
|
||||||
|
/// callers that want to inspect the DB after a request should clone it.
|
||||||
|
fn admin_test_router(pool: PgPool) -> (Router, TempDir) {
|
||||||
|
let storage_dir = tempfile::tempdir().expect("tempdir");
|
||||||
|
let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(storage_dir.path()));
|
||||||
|
let auth = AuthConfig {
|
||||||
|
cookie_secure: false,
|
||||||
|
..AuthConfig::default()
|
||||||
|
};
|
||||||
|
let auth_limiter = Arc::new(AuthRateLimiter::new(auth.rate_limit));
|
||||||
|
let state = AppState {
|
||||||
|
db: pool,
|
||||||
|
storage,
|
||||||
|
auth,
|
||||||
|
upload: UploadConfig::default(),
|
||||||
|
auth_limiter,
|
||||||
|
resync: None,
|
||||||
|
crawler: None,
|
||||||
|
};
|
||||||
|
let app = Router::new()
|
||||||
|
.nest("/api/v1", api::routes())
|
||||||
|
.route("/_test/admin_only", get(admin_only_handler))
|
||||||
|
.with_state(state);
|
||||||
|
(app, storage_dir)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- bootstrap_admin -------------------------------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn bootstrap_creates_admin_when_user_missing(pool: PgPool) {
|
||||||
|
repo::user::bootstrap_admin(&pool, "root", "hunter2hunter2")
|
||||||
|
.await
|
||||||
|
.expect("bootstrap on empty DB");
|
||||||
|
|
||||||
|
let user = repo::user::find_by_username(&pool, "root")
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.expect("root user exists after bootstrap");
|
||||||
|
assert!(user.is_admin, "bootstrap must set is_admin = true on creation");
|
||||||
|
|
||||||
|
// Password hash must verify the env-supplied password (and not be empty).
|
||||||
|
assert!(
|
||||||
|
mangalord::auth::password::verify_password("hunter2hunter2", &user.password_hash),
|
||||||
|
"bootstrap-created user must accept the env-supplied password"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn bootstrap_promotes_existing_user_without_touching_password(pool: PgPool) {
|
||||||
|
// Pre-existing user, not admin. Use the real register path so the
|
||||||
|
// hash format matches production exactly.
|
||||||
|
let (app, _td) = admin_test_router(pool.clone());
|
||||||
|
let resp = app
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/auth/register",
|
||||||
|
json!({ "username": "preexisting", "password": "originalpw1234" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||||
|
|
||||||
|
let before = repo::user::find_by_username(&pool, "preexisting")
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
assert!(!before.is_admin);
|
||||||
|
let original_hash = before.password_hash.clone();
|
||||||
|
|
||||||
|
// Bootstrap with a DIFFERENT password — must not overwrite the hash.
|
||||||
|
repo::user::bootstrap_admin(&pool, "preexisting", "envpw_should_be_ignored")
|
||||||
|
.await
|
||||||
|
.expect("bootstrap on existing user");
|
||||||
|
|
||||||
|
let after = repo::user::find_by_username(&pool, "preexisting")
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
assert!(after.is_admin, "bootstrap must promote existing user");
|
||||||
|
assert_eq!(
|
||||||
|
after.password_hash, original_hash,
|
||||||
|
"bootstrap must NOT overwrite the existing password hash"
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
mangalord::auth::password::verify_password("originalpw1234", &after.password_hash),
|
||||||
|
"original password must still verify after bootstrap"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn bootstrap_is_idempotent(pool: PgPool) {
|
||||||
|
repo::user::bootstrap_admin(&pool, "root", "hunter2hunter2")
|
||||||
|
.await
|
||||||
|
.expect("first bootstrap");
|
||||||
|
repo::user::bootstrap_admin(&pool, "root", "hunter2hunter2")
|
||||||
|
.await
|
||||||
|
.expect("second bootstrap is no-op");
|
||||||
|
|
||||||
|
// Exactly one row, still admin.
|
||||||
|
let (count,): (i64,) = sqlx::query_as("SELECT COUNT(*) FROM users WHERE username = $1")
|
||||||
|
.bind("root")
|
||||||
|
.fetch_one(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(count, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- /api/v1/auth/me exposes is_admin --------------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn auth_me_response_includes_is_admin(pool: PgPool) {
|
||||||
|
let (app, _td) = admin_test_router(pool.clone());
|
||||||
|
let (_username, cookie) = common::register_user(&app).await;
|
||||||
|
let resp = app
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/auth/me", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(
|
||||||
|
body["user"]["is_admin"], false,
|
||||||
|
"freshly-registered users default to is_admin=false"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- RequireAdmin: 401 / 403 / 200 matrix ----------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn require_admin_rejects_unauthenticated(pool: PgPool) {
|
||||||
|
let (app, _td) = admin_test_router(pool);
|
||||||
|
let resp = app
|
||||||
|
.oneshot(common::get("/_test/admin_only"))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn require_admin_rejects_non_admin_cookie(pool: PgPool) {
|
||||||
|
let (app, _td) = admin_test_router(pool);
|
||||||
|
let (_username, cookie) = common::register_user(&app).await;
|
||||||
|
let resp = app
|
||||||
|
.oneshot(common::get_with_cookie("/_test/admin_only", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "forbidden");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn require_admin_accepts_admin_cookie(pool: PgPool) {
|
||||||
|
let (app, _td) = admin_test_router(pool.clone());
|
||||||
|
let (username, cookie) = common::register_user(&app).await;
|
||||||
|
// Promote via the repo (the admin-users API doesn't exist yet).
|
||||||
|
let u = repo::user::find_by_username(&pool, &username)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
repo::user::set_is_admin_unchecked(&pool, u.id, true).await.unwrap();
|
||||||
|
|
||||||
|
let resp = app
|
||||||
|
.oneshot(common::get_with_cookie("/_test/admin_only", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["username"], username);
|
||||||
|
assert_eq!(body["is_admin"], true);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn require_admin_rejects_bearer_token_even_for_admin_user(pool: PgPool) {
|
||||||
|
// Key privilege-escalation test: an API token belonging to an admin user
|
||||||
|
// must NOT grant admin authority. Bot tokens are excluded from admin
|
||||||
|
// routes by design (the RequireAdmin extractor only accepts session
|
||||||
|
// cookies). See cross-cutting decision #1 in the PR plan.
|
||||||
|
let (app, _td) = admin_test_router(pool.clone());
|
||||||
|
let (username, cookie) = common::register_user(&app).await;
|
||||||
|
|
||||||
|
// Promote to admin and mint an API token (the existing /auth/tokens
|
||||||
|
// endpoint authenticates via the same cookie).
|
||||||
|
let u = repo::user::find_by_username(&pool, &username)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
repo::user::set_is_admin_unchecked(&pool, u.id, true).await.unwrap();
|
||||||
|
|
||||||
|
let resp = app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/auth/tokens",
|
||||||
|
json!({ "name": "test-bot" }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let token = body["bearer"]
|
||||||
|
.as_str()
|
||||||
|
.expect("raw bearer token in response")
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
// Sanity: the bearer DOES work on a non-admin endpoint (proves the
|
||||||
|
// token is valid, isolating the failure below to the admin guard).
|
||||||
|
let resp = app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::get_with_bearer("/api/v1/auth/me", &token))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
|
||||||
|
// Same token, same admin user, but on the admin-guarded route → 401
|
||||||
|
// (no session cookie present at all from the extractor's POV).
|
||||||
|
let resp = app
|
||||||
|
.oneshot(common::get_with_bearer("/_test/admin_only", &token))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
resp.status(),
|
||||||
|
StatusCode::UNAUTHORIZED,
|
||||||
|
"Bearer-authed admin must NOT pass the RequireAdmin guard"
|
||||||
|
);
|
||||||
|
}
|
||||||
96
backend/tests/api_admin_system.rs
Normal file
96
backend/tests/api_admin_system.rs
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
//! PR 4 (feat/admin-system-api) integration tests.
|
||||||
|
//!
|
||||||
|
//! Shape-only assertions — we don't mock the system, just call the
|
||||||
|
//! endpoint and check the response envelope. Threshold-triggering of
|
||||||
|
//! alerts would require faking statvfs / sysinfo, which is more
|
||||||
|
//! plumbing than the test gives back.
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
|
use axum::http::StatusCode;
|
||||||
|
use axum::Router;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tower::ServiceExt;
|
||||||
|
|
||||||
|
use mangalord::repo;
|
||||||
|
|
||||||
|
async fn seed_admin(pool: &PgPool, app: &Router) -> String {
|
||||||
|
let (username, cookie) = common::register_user(app).await;
|
||||||
|
let u = repo::user::find_by_username(pool, &username)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
repo::user::set_is_admin_unchecked(pool, u.id, true).await.unwrap();
|
||||||
|
cookie
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn requires_admin(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_u, cookie) = common::register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/admin/system", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn unauthenticated_request_is_rejected(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get("/api/v1/admin/system"))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn returns_disk_memory_cpu_alerts_shape(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let cookie = seed_admin(&pool, &h.app).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/admin/system", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
|
||||||
|
// Disk: harness uses LocalStorage on a tempdir, so disk SHOULD be
|
||||||
|
// populated. Validate the field shape and percent range.
|
||||||
|
let disk = body
|
||||||
|
.get("disk")
|
||||||
|
.expect("disk key present")
|
||||||
|
.as_object()
|
||||||
|
.expect("disk is an object (LocalStorage exposes a path)");
|
||||||
|
assert!(disk["total_bytes"].as_u64().unwrap() > 0);
|
||||||
|
let pct = disk["percent_used"].as_f64().unwrap();
|
||||||
|
assert!(
|
||||||
|
(0.0..=100.0).contains(&pct),
|
||||||
|
"percent_used outside [0,100]: {pct}"
|
||||||
|
);
|
||||||
|
|
||||||
|
let mem = body.get("memory").expect("memory key").as_object().unwrap();
|
||||||
|
assert!(mem["total_bytes"].as_u64().unwrap() > 0);
|
||||||
|
let mpct = mem["percent_used"].as_f64().unwrap();
|
||||||
|
assert!((0.0..=100.0).contains(&mpct));
|
||||||
|
|
||||||
|
let cpu = body.get("cpu").expect("cpu key").as_object().unwrap();
|
||||||
|
let cpu_pct = cpu["percent_used"].as_f64().unwrap();
|
||||||
|
assert!(
|
||||||
|
(0.0..=100.0).contains(&cpu_pct),
|
||||||
|
"cpu out of range: {cpu_pct}"
|
||||||
|
);
|
||||||
|
|
||||||
|
let alerts = body.get("alerts").expect("alerts key").as_array().unwrap();
|
||||||
|
// Don't assert on length — the box may genuinely be >90% on memory
|
||||||
|
// when the test runs. Just confirm shape of any present entry.
|
||||||
|
for alert in alerts {
|
||||||
|
assert!(alert["level"].is_string());
|
||||||
|
assert!(alert["message"].is_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
605
backend/tests/api_admin_users.rs
Normal file
605
backend/tests/api_admin_users.rs
Normal file
@@ -0,0 +1,605 @@
|
|||||||
|
//! PR 2 (feat/admin-users-api) integration tests.
|
||||||
|
//!
|
||||||
|
//! Exercises list / delete / promote-demote on /api/v1/admin/users:
|
||||||
|
//! pagination + search, the RequireAdmin gate, self-protection,
|
||||||
|
//! last-admin invariant (including the parallel-demote race that
|
||||||
|
//! `pg_advisory_xact_lock` + recount-inside-tx guards against), and
|
||||||
|
//! that audit rows land in `admin_audit` only on successful commit.
|
||||||
|
//!
|
||||||
|
//! Note on the last-admin invariant: the *serial* path via HTTP is
|
||||||
|
//! structurally unreachable — the only configuration that would hit the
|
||||||
|
//! "would orphan admins" branch requires the actor to be the lone admin
|
||||||
|
//! demoting themselves, which the self-guard fires on first. So the
|
||||||
|
//! last-admin checks below call the repo directly to exercise the
|
||||||
|
//! invariant; the HTTP race scenario is covered by
|
||||||
|
//! `parallel_demotes_cannot_orphan_admins`.
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
|
use axum::http::StatusCode;
|
||||||
|
use axum::Router;
|
||||||
|
use serde_json::json;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tower::ServiceExt;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use mangalord::error::AppError;
|
||||||
|
use mangalord::repo;
|
||||||
|
|
||||||
|
/// Register a user via the public API and immediately promote them via
|
||||||
|
/// the repo. Returns (username, session cookie, user_id) — the common
|
||||||
|
/// "I need a logged-in admin" prelude.
|
||||||
|
async fn seed_admin(pool: &PgPool, app: &Router) -> (String, String, Uuid) {
|
||||||
|
let (username, cookie) = common::register_user(app).await;
|
||||||
|
let u = repo::user::find_by_username(pool, &username)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
repo::user::set_is_admin_unchecked(pool, u.id, true).await.unwrap();
|
||||||
|
(username, cookie, u.id)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- RequireAdmin gate -----------------------------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn list_requires_admin(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_username, cookie) = common::register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/admin/users", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn delete_requires_admin(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_username, cookie) = common::register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::delete_with_cookie(
|
||||||
|
&format!("/api/v1/admin/users/{}", Uuid::new_v4()),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn patch_requires_admin(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_username, cookie) = common::register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/users/{}", Uuid::new_v4()),
|
||||||
|
json!({ "is_admin": true }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- list with search and pagination ---------------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn list_returns_paginated_users(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_admin_name, cookie, _) = seed_admin(&pool, &h.app).await;
|
||||||
|
let _u1 = common::register_user(&h.app).await;
|
||||||
|
let _u2 = common::register_user(&h.app).await;
|
||||||
|
let _u3 = common::register_user(&h.app).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
"/api/v1/admin/users?limit=2&offset=0",
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let items = body["items"].as_array().expect("items array");
|
||||||
|
assert_eq!(items.len(), 2, "limit=2 should cap the page");
|
||||||
|
assert_eq!(body["page"]["limit"], 2);
|
||||||
|
assert_eq!(body["page"]["offset"], 0);
|
||||||
|
assert_eq!(body["page"]["total"], 4);
|
||||||
|
assert!(items[0].get("is_admin").is_some());
|
||||||
|
assert!(
|
||||||
|
items[0].get("password_hash").is_none(),
|
||||||
|
"password_hash must never leak even to other admins"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn list_filters_by_substring_search(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_admin_name, cookie, _) = seed_admin(&pool, &h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/auth/register",
|
||||||
|
json!({ "username": "zzzfindme01", "password": "hunter2hunter2" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
"/api/v1/admin/users?search=zzzfindme",
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let items = body["items"].as_array().unwrap();
|
||||||
|
assert_eq!(items.len(), 1, "search must narrow to the one match");
|
||||||
|
assert_eq!(items[0]["username"], "zzzfindme01");
|
||||||
|
assert_eq!(body["page"]["total"], 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- self-protection -------------------------------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn cannot_self_delete(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_username, cookie, actor_id) = seed_admin(&pool, &h.app).await;
|
||||||
|
// Second admin so the last-admin guard isn't what triggers the conflict.
|
||||||
|
let (_other, _, _) = seed_admin(&pool, &h.app).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::delete_with_cookie(
|
||||||
|
&format!("/api/v1/admin/users/{actor_id}"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CONFLICT);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "conflict");
|
||||||
|
assert!(
|
||||||
|
body["error"]["message"]
|
||||||
|
.as_str()
|
||||||
|
.unwrap()
|
||||||
|
.contains("yourself"),
|
||||||
|
"message must call out the self-action; got {:?}",
|
||||||
|
body["error"]["message"]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn cannot_self_demote(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_username, cookie, actor_id) = seed_admin(&pool, &h.app).await;
|
||||||
|
let (_other, _, _) = seed_admin(&pool, &h.app).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/users/{actor_id}"),
|
||||||
|
json!({ "is_admin": false }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CONFLICT);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert!(body["error"]["message"]
|
||||||
|
.as_str()
|
||||||
|
.unwrap()
|
||||||
|
.contains("yourself"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- last-admin invariant (repo layer, see file header) --------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn last_admin_demote_refused_at_repo(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_a, _, a_id) = seed_admin(&pool, &h.app).await;
|
||||||
|
let (_b, _, b_id) = seed_admin(&pool, &h.app).await;
|
||||||
|
|
||||||
|
// admins = {A, B}. Demote A via B (count 2 → 1) — allowed.
|
||||||
|
let r = repo::user::admin_safe_set_is_admin(&pool, b_id, a_id, false)
|
||||||
|
.await
|
||||||
|
.expect("first demote succeeds");
|
||||||
|
assert!(!r.is_admin);
|
||||||
|
|
||||||
|
// admins = {B}. Try to demote B via A (actor doesn't matter to the
|
||||||
|
// repo — that's the HTTP gate's job). Last-admin guard kicks in.
|
||||||
|
let err = repo::user::admin_safe_set_is_admin(&pool, a_id, b_id, false)
|
||||||
|
.await
|
||||||
|
.expect_err("second demote must be refused");
|
||||||
|
match err {
|
||||||
|
AppError::Conflict(m) => assert!(
|
||||||
|
m.contains("last admin"),
|
||||||
|
"expected last-admin conflict; got {m:?}"
|
||||||
|
),
|
||||||
|
other => panic!("expected Conflict, got {other:?}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn last_admin_delete_refused_at_repo(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_a, _, a_id) = seed_admin(&pool, &h.app).await;
|
||||||
|
let (_b, _, b_id) = seed_admin(&pool, &h.app).await;
|
||||||
|
|
||||||
|
// admins = {A, B}. Delete A via B (count 2 → 1) — allowed.
|
||||||
|
repo::user::admin_safe_delete(&pool, b_id, a_id)
|
||||||
|
.await
|
||||||
|
.expect("first delete succeeds");
|
||||||
|
|
||||||
|
// admins = {B}. Try to delete B via a fresh non-admin actor. Last-
|
||||||
|
// admin guard kicks in.
|
||||||
|
let (_c, _, c_id) = {
|
||||||
|
let (cn, _ck) = common::register_user(&h.app).await;
|
||||||
|
let c = repo::user::find_by_username(&pool, &cn).await.unwrap().unwrap();
|
||||||
|
(cn, _ck, c.id)
|
||||||
|
};
|
||||||
|
let err = repo::user::admin_safe_delete(&pool, c_id, b_id)
|
||||||
|
.await
|
||||||
|
.expect_err("second delete must be refused");
|
||||||
|
match err {
|
||||||
|
AppError::Conflict(m) => assert!(
|
||||||
|
m.contains("last admin"),
|
||||||
|
"expected last-admin conflict; got {m:?}"
|
||||||
|
),
|
||||||
|
other => panic!("expected Conflict, got {other:?}"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn parallel_demotes_cannot_orphan_admins(pool: PgPool) {
|
||||||
|
// The race the advisory lock + recount exists to close: two parallel
|
||||||
|
// demotes of two DIFFERENT admins, each reading `count = 2` and
|
||||||
|
// committing, would land at zero admins. With the lock the second
|
||||||
|
// demote sees count = 1 inside the tx and refuses.
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_a, _, a_id) = seed_admin(&pool, &h.app).await;
|
||||||
|
let (_b, _, b_id) = seed_admin(&pool, &h.app).await;
|
||||||
|
|
||||||
|
let pool_x = pool.clone();
|
||||||
|
let pool_y = pool.clone();
|
||||||
|
let task_x = tokio::spawn(async move {
|
||||||
|
repo::user::admin_safe_set_is_admin(&pool_x, a_id, b_id, false).await
|
||||||
|
});
|
||||||
|
let task_y = tokio::spawn(async move {
|
||||||
|
repo::user::admin_safe_set_is_admin(&pool_y, b_id, a_id, false).await
|
||||||
|
});
|
||||||
|
let r_x = task_x.await.unwrap();
|
||||||
|
let r_y = task_y.await.unwrap();
|
||||||
|
|
||||||
|
let outcomes = (r_x.is_ok(), r_y.is_ok());
|
||||||
|
assert!(
|
||||||
|
outcomes == (true, false) || outcomes == (false, true),
|
||||||
|
"exactly one of the two parallel demotes must succeed; got {outcomes:?}"
|
||||||
|
);
|
||||||
|
|
||||||
|
let (count,): (i64,) =
|
||||||
|
sqlx::query_as("SELECT COUNT(*) FROM users WHERE is_admin = true")
|
||||||
|
.fetch_one(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(count, 1, "at least one admin must remain");
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- audit log -------------------------------------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn promote_writes_audit_row(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_a_name, a_cookie, a_id) = seed_admin(&pool, &h.app).await;
|
||||||
|
let (b_name, _b_cookie) = common::register_user(&h.app).await;
|
||||||
|
let b = repo::user::find_by_username(&pool, &b_name)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/users/{}", b.id),
|
||||||
|
json!({ "is_admin": true }),
|
||||||
|
&a_cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
|
||||||
|
let rows: Vec<(Option<Uuid>, String, String, Option<Uuid>)> = sqlx::query_as(
|
||||||
|
"SELECT actor_user_id, action, target_kind, target_id FROM admin_audit",
|
||||||
|
)
|
||||||
|
.fetch_all(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(rows.len(), 1);
|
||||||
|
let (actor, action, kind, target) = rows.into_iter().next().unwrap();
|
||||||
|
assert_eq!(actor, Some(a_id));
|
||||||
|
assert_eq!(action, "promote_user");
|
||||||
|
assert_eq!(kind, "user");
|
||||||
|
assert_eq!(target, Some(b.id));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn redundant_promote_does_not_write_audit_row(pool: PgPool) {
|
||||||
|
// Regression: PATCH {is_admin: true} on someone already admin used
|
||||||
|
// to UPDATE (no-op) and still INSERT a misleading "promote_user"
|
||||||
|
// audit row. Should short-circuit without touching admin_audit.
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_a_name, a_cookie, _a_id) = seed_admin(&pool, &h.app).await;
|
||||||
|
let (b_name, _b_cookie, _b_id) = seed_admin(&pool, &h.app).await; // already admin
|
||||||
|
|
||||||
|
let b = repo::user::find_by_username(&pool, &b_name)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/admin/users/{}", b.id),
|
||||||
|
json!({ "is_admin": true }),
|
||||||
|
&a_cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
|
||||||
|
let (count,): (i64,) = sqlx::query_as("SELECT COUNT(*) FROM admin_audit")
|
||||||
|
.fetch_one(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(count, 0, "no-op promote must not write audit row");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn delete_writes_audit_row(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_a_name, a_cookie, a_id) = seed_admin(&pool, &h.app).await;
|
||||||
|
let (b_name, _b_cookie) = common::register_user(&h.app).await;
|
||||||
|
let b = repo::user::find_by_username(&pool, &b_name)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::delete_with_cookie(
|
||||||
|
&format!("/api/v1/admin/users/{}", b.id),
|
||||||
|
&a_cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NO_CONTENT);
|
||||||
|
|
||||||
|
let rows: Vec<(Option<Uuid>, String, String, Option<Uuid>, serde_json::Value)> =
|
||||||
|
sqlx::query_as(
|
||||||
|
"SELECT actor_user_id, action, target_kind, target_id, payload FROM admin_audit",
|
||||||
|
)
|
||||||
|
.fetch_all(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(rows.len(), 1);
|
||||||
|
let (actor, action, kind, target, payload) = rows.into_iter().next().unwrap();
|
||||||
|
assert_eq!(actor, Some(a_id));
|
||||||
|
assert_eq!(action, "delete_user");
|
||||||
|
assert_eq!(kind, "user");
|
||||||
|
assert_eq!(target, Some(b.id));
|
||||||
|
assert_eq!(payload["username"], b_name);
|
||||||
|
assert_eq!(payload["was_admin"], false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- POST /admin/users (admin-create) --------------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_user_requires_admin(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_username, cookie) = common::register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/admin/users",
|
||||||
|
json!({ "username": "newbie", "password": "hunter2hunter2" }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_user_unauthenticated_is_rejected(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/admin/users",
|
||||||
|
json!({ "username": "newbie", "password": "hunter2hunter2" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_user_happy_path_creates_user_and_audit(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_a_name, a_cookie, a_id) = seed_admin(&pool, &h.app).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/admin/users",
|
||||||
|
json!({ "username": "invited01", "password": "freshpass1234" }),
|
||||||
|
&a_cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["username"], "invited01");
|
||||||
|
assert_eq!(body["is_admin"], false);
|
||||||
|
assert!(body["id"].as_str().is_some());
|
||||||
|
assert!(
|
||||||
|
body.get("password_hash").is_none(),
|
||||||
|
"password_hash must never appear in admin-create response"
|
||||||
|
);
|
||||||
|
|
||||||
|
let target_id =
|
||||||
|
Uuid::parse_str(body["id"].as_str().unwrap()).unwrap();
|
||||||
|
let (actor, action, kind, target, payload): (
|
||||||
|
Option<Uuid>,
|
||||||
|
String,
|
||||||
|
String,
|
||||||
|
Option<Uuid>,
|
||||||
|
serde_json::Value,
|
||||||
|
) = sqlx::query_as(
|
||||||
|
"SELECT actor_user_id, action, target_kind, target_id, payload \
|
||||||
|
FROM admin_audit ORDER BY at DESC LIMIT 1",
|
||||||
|
)
|
||||||
|
.fetch_one(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(actor, Some(a_id));
|
||||||
|
assert_eq!(action, "create_user");
|
||||||
|
assert_eq!(kind, "user");
|
||||||
|
assert_eq!(target, Some(target_id));
|
||||||
|
assert_eq!(payload["username"], "invited01");
|
||||||
|
assert_eq!(payload["is_admin"], false);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_user_can_mint_an_admin_in_one_call(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_a_name, a_cookie, _) = seed_admin(&pool, &h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/admin/users",
|
||||||
|
json!({
|
||||||
|
"username": "newadmin",
|
||||||
|
"password": "freshpass1234",
|
||||||
|
"is_admin": true
|
||||||
|
}),
|
||||||
|
&a_cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["is_admin"], true);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_user_returns_409_on_duplicate(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_a_name, a_cookie, _) = seed_admin(&pool, &h.app).await;
|
||||||
|
// Seed an existing user via the public register path.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/auth/register",
|
||||||
|
json!({ "username": "taken", "password": "hunter2hunter2" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/admin/users",
|
||||||
|
json!({ "username": "Taken", "password": "freshpass1234" }),
|
||||||
|
&a_cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
resp.status(),
|
||||||
|
StatusCode::CONFLICT,
|
||||||
|
"case-insensitive collision via the lower(username) index"
|
||||||
|
);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "conflict");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_user_rejects_weak_password(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_a_name, a_cookie, _) = seed_admin(&pool, &h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/admin/users",
|
||||||
|
json!({ "username": "okayname", "password": "short" }),
|
||||||
|
&a_cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::BAD_REQUEST);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "invalid_input");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_user_rejects_invalid_username(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_a_name, a_cookie, _) = seed_admin(&pool, &h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/admin/users",
|
||||||
|
json!({ "username": "bad name!", "password": "freshpass1234" }),
|
||||||
|
&a_cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::BAD_REQUEST);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_user_works_even_when_self_register_disabled(pool: PgPool) {
|
||||||
|
// The admin-create path must NOT be gated by ALLOW_SELF_REGISTER —
|
||||||
|
// that's the entire point of having an admin-create endpoint.
|
||||||
|
let h = common::harness_with_self_register_disabled(pool.clone());
|
||||||
|
// Bootstrap an admin out-of-band since self-register would refuse.
|
||||||
|
repo::user::bootstrap_admin(&pool, "root", "hunter2hunter2")
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/auth/login",
|
||||||
|
json!({ "username": "root", "password": "hunter2hunter2" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let cookie = common::extract_session_cookie(&resp).unwrap();
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/admin/users",
|
||||||
|
json!({ "username": "invited01", "password": "freshpass1234" }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
resp.status(),
|
||||||
|
StatusCode::CREATED,
|
||||||
|
"admin must be able to mint users even with self-register off"
|
||||||
|
);
|
||||||
|
}
|
||||||
@@ -567,6 +567,166 @@ async fn user_a_cannot_delete_user_b_token(pool: PgPool) {
|
|||||||
assert_eq!(resp.status(), StatusCode::NO_CONTENT);
|
assert_eq!(resp.status(), StatusCode::NO_CONTENT);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Username enumeration via login response time: an attacker probes
|
||||||
|
/// for valid usernames by measuring how long /auth/login takes. Before
|
||||||
|
/// the equalisation fix, the no-user branch returned 401 in <1 ms
|
||||||
|
/// while the wrong-password branch took ~50-100 ms (the argon2 verify
|
||||||
|
/// cost). This test asserts the no-user branch now spends at least
|
||||||
|
/// some meaningful fraction of the wrong-password branch's time.
|
||||||
|
///
|
||||||
|
/// Tolerance is intentionally loose so CI variance doesn't flap the
|
||||||
|
/// test. The unequalised gap is large enough (~50x) that even a noisy
|
||||||
|
/// CI run with a 5x slack still catches it.
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn login_no_user_branch_runs_argon2_for_timing_equalisation(pool: PgPool) {
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
let h = common::harness(pool);
|
||||||
|
|
||||||
|
// Register the victim user so the wrong-password branch has a real
|
||||||
|
// argon2 hash to verify against.
|
||||||
|
let _ = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/auth/register",
|
||||||
|
json!({ "username": "victim", "password": "hunter2hunter2" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Warm-up: first login of the process initialises the dummy hash
|
||||||
|
// lazily. Skip that cost when measuring.
|
||||||
|
let _ = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/auth/login",
|
||||||
|
json!({ "username": "victim", "password": "wrong" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let _ = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/auth/login",
|
||||||
|
json!({ "username": "ghost", "password": "wrong" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Median-of-N is more stable than a single sample.
|
||||||
|
async fn sample_min(
|
||||||
|
app: &axum::Router,
|
||||||
|
username: &str,
|
||||||
|
n: u32,
|
||||||
|
) -> std::time::Duration {
|
||||||
|
let mut samples = Vec::with_capacity(n as usize);
|
||||||
|
for _ in 0..n {
|
||||||
|
let req = common::post_json(
|
||||||
|
"/api/v1/auth/login",
|
||||||
|
json!({ "username": username, "password": "wrong-guess" }),
|
||||||
|
);
|
||||||
|
let t = Instant::now();
|
||||||
|
let resp = app.clone().oneshot(req).await.unwrap();
|
||||||
|
let d = t.elapsed();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
|
||||||
|
samples.push(d);
|
||||||
|
}
|
||||||
|
// Use the minimum: it's the floor that argon2 takes, robust
|
||||||
|
// against unrelated stalls (DB connection acquisition, etc.).
|
||||||
|
*samples.iter().min().unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
let wrong_pwd = sample_min(&h.app, "victim", 3).await;
|
||||||
|
let no_user = sample_min(&h.app, "ghost", 3).await;
|
||||||
|
|
||||||
|
// 5x slack: argon2 dominates both branches, so they should be
|
||||||
|
// within an order of magnitude. Unequalised, no_user would be
|
||||||
|
// ~50-100x faster. Asserting "no_user >= wrong_pwd / 5" catches
|
||||||
|
// the bug without being flaky in CI.
|
||||||
|
assert!(
|
||||||
|
no_user * 5 >= wrong_pwd,
|
||||||
|
"login timing leaks user existence: no_user={no_user:?}, wrong_pwd={wrong_pwd:?}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Brute-force / spray protection: at default production limits, a
|
||||||
|
/// tight loop of /auth/login attempts should burst through the bucket
|
||||||
|
/// and then 429 every subsequent request until the bucket refills.
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn login_rate_limited_under_burst_pressure(pool: PgPool) {
|
||||||
|
let h = common::harness_with_auth_rate_limit(pool, 1, 3);
|
||||||
|
|
||||||
|
// Register a victim so the wrong-password branch is real work.
|
||||||
|
let _ = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json("/api/v1/auth/register", creds("victim")))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Register consumed one token from the burst-3 bucket. Fire 30
|
||||||
|
// wrong-password logins back-to-back; with per_sec=1 the refill
|
||||||
|
// is too slow to keep up and at least one must come back 429.
|
||||||
|
let mut saw_429 = false;
|
||||||
|
for _ in 0..30 {
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/auth/login",
|
||||||
|
json!({ "username": "victim", "password": "wrong" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
if resp.status() == StatusCode::TOO_MANY_REQUESTS {
|
||||||
|
// RFC 6585 §4: 429 SHOULD include a Retry-After header. The
|
||||||
|
// value is in seconds; with per_sec=1 the bucket needs ~1s
|
||||||
|
// to refill, so the header should be 1 or 2.
|
||||||
|
let retry_after = resp
|
||||||
|
.headers()
|
||||||
|
.get(axum::http::header::RETRY_AFTER)
|
||||||
|
.and_then(|v| v.to_str().ok())
|
||||||
|
.and_then(|s| s.parse::<u32>().ok())
|
||||||
|
.expect("Retry-After header present and numeric");
|
||||||
|
assert!(
|
||||||
|
retry_after >= 1,
|
||||||
|
"Retry-After must be at least 1s, got {retry_after}"
|
||||||
|
);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "too_many_requests");
|
||||||
|
saw_429 = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert!(
|
||||||
|
saw_429,
|
||||||
|
"expected at least one 429 within 30 rapid login attempts"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Default (test-harness) limits are disabled, so existing tests that
|
||||||
|
/// fire multiple auth requests don't start failing.
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn default_test_harness_does_not_rate_limit(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
for i in 0..50 {
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/auth/login",
|
||||||
|
json!({ "username": format!("nobody-{i}"), "password": "x" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
// None of these should be 429 — only 401.
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED, "iter {i}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[sqlx::test(migrations = "./migrations")]
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
async fn delete_unknown_token_is_404(pool: PgPool) {
|
async fn delete_unknown_token_is_404(pool: PgPool) {
|
||||||
let h = common::harness(pool);
|
let h = common::harness(pool);
|
||||||
@@ -581,3 +741,68 @@ async fn delete_unknown_token_is_404(pool: PgPool) {
|
|||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Bot token names are user-supplied free-form strings; a 10 MB name
|
||||||
|
/// was accepted before. Cap at 64 chars to match the other free-form
|
||||||
|
/// identifier caps (tags, collection names). The response uses
|
||||||
|
/// `ValidationFailed` (422 with per-field details) so clients can
|
||||||
|
/// render the same shape they already handle for `attach_tag`.
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_token_rejects_name_over_64_chars(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/auth/tokens",
|
||||||
|
json!({ "name": "x".repeat(65) }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNPROCESSABLE_ENTITY);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "validation_failed");
|
||||||
|
assert!(body["error"]["details"]["name"].is_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- self-register toggle + /auth/config -----------------------------------
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn auth_config_reports_self_register_enabled_by_default(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get("/api/v1/auth/config"))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["self_register_enabled"], true);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn auth_config_reflects_self_register_disabled(pool: PgPool) {
|
||||||
|
let h = common::harness_with_self_register_disabled(pool);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get("/api/v1/auth/config"))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["self_register_enabled"], false);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn register_returns_403_when_self_register_disabled(pool: PgPool) {
|
||||||
|
let h = common::harness_with_self_register_disabled(pool);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json("/api/v1/auth/register", creds("alice")))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "forbidden");
|
||||||
|
}
|
||||||
|
|||||||
@@ -344,7 +344,7 @@ async fn list_me_enriches_chapter_bookmarks_with_chapter_number(pool: PgPool) {
|
|||||||
let (_, cookie) = common::register_user(&h.app).await;
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
// Seed a chapter directly so we know its number without uploading pages.
|
// Seed a chapter directly so we know its number without uploading pages.
|
||||||
mangalord::repo::chapter::create(&pool, manga_id, 7, Some("The Brand"))
|
mangalord::repo::chapter::create(&pool, manga_id, 7, Some("The Brand"), None)
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
// Look up its id so we can bookmark it.
|
// Look up its id so we can bookmark it.
|
||||||
@@ -433,5 +433,201 @@ async fn list_me_returns_paged_envelope(pool: PgPool) {
|
|||||||
assert!(body["items"].is_array());
|
assert!(body["items"].is_array());
|
||||||
assert_eq!(body["page"]["limit"], 50);
|
assert_eq!(body["page"]["limit"], 50);
|
||||||
assert_eq!(body["page"]["offset"], 0);
|
assert_eq!(body["page"]["offset"], 0);
|
||||||
assert!(body["page"]["total"].is_null());
|
// `total` is the unfiltered row count, returned so callers (e.g.
|
||||||
|
// the profile overview's bookmark counter) can show a number
|
||||||
|
// without paging through.
|
||||||
|
assert_eq!(body["page"]["total"], 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------
|
||||||
|
// Bookmark create -> SyncChapterContent job enqueue (background task)
|
||||||
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
|
async fn seed_chapter_with_source(
|
||||||
|
pool: &PgPool,
|
||||||
|
manga_id: Uuid,
|
||||||
|
number: i32,
|
||||||
|
source_id: &str,
|
||||||
|
source_chapter_key: &str,
|
||||||
|
source_url: &str,
|
||||||
|
dropped: bool,
|
||||||
|
) -> Uuid {
|
||||||
|
let chapter_id: Uuid =
|
||||||
|
mangalord::repo::chapter::create(pool, manga_id, number, None, None)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.id;
|
||||||
|
sqlx::query("INSERT INTO sources (id, name, base_url) VALUES ($1, $2, $3) ON CONFLICT DO NOTHING")
|
||||||
|
.bind(source_id)
|
||||||
|
.bind(source_id)
|
||||||
|
.bind("https://example.com")
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let dropped_at = if dropped { "now()" } else { "NULL" };
|
||||||
|
sqlx::query(&format!(
|
||||||
|
"INSERT INTO chapter_sources (source_id, source_chapter_key, chapter_id, source_url, dropped_at) \
|
||||||
|
VALUES ($1, $2, $3, $4, {dropped_at})"
|
||||||
|
))
|
||||||
|
.bind(source_id)
|
||||||
|
.bind(source_chapter_key)
|
||||||
|
.bind(chapter_id)
|
||||||
|
.bind(source_url)
|
||||||
|
.execute(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
chapter_id
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Poll `crawler_jobs` for the expected pending count, up to ~1.5s, so the
|
||||||
|
/// detached `tokio::spawn` from the bookmark create handler has time to
|
||||||
|
/// land regardless of CI scheduling jitter.
|
||||||
|
async fn wait_for_pending_count(pool: &PgPool, expected: i64) -> i64 {
|
||||||
|
for _ in 0..30 {
|
||||||
|
let count: i64 = sqlx::query_scalar(
|
||||||
|
"SELECT COUNT(*) FROM crawler_jobs \
|
||||||
|
WHERE state = 'pending' \
|
||||||
|
AND payload->>'kind' = 'sync_chapter_content'",
|
||||||
|
)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
if count >= expected {
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
|
||||||
|
}
|
||||||
|
sqlx::query_scalar::<_, i64>(
|
||||||
|
"SELECT COUNT(*) FROM crawler_jobs \
|
||||||
|
WHERE state = 'pending' \
|
||||||
|
AND payload->>'kind' = 'sync_chapter_content'",
|
||||||
|
)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_enqueues_sync_chapter_content_jobs_for_pending_chapters(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
|
|
||||||
|
// Two zero-page chapters with non-dropped sources.
|
||||||
|
let c1 = seed_chapter_with_source(&pool, manga_id, 1, "target", "ch1", "https://example.com/c1", false).await;
|
||||||
|
let c2 = seed_chapter_with_source(&pool, manga_id, 2, "target", "ch2", "https://example.com/c2", false).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/bookmarks",
|
||||||
|
json!({ "manga_id": manga_id.to_string() }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||||
|
|
||||||
|
let count = wait_for_pending_count(&pool, 2).await;
|
||||||
|
assert_eq!(count, 2, "both pending chapters should be enqueued");
|
||||||
|
|
||||||
|
let chapter_ids: Vec<String> = sqlx::query_scalar(
|
||||||
|
"SELECT payload->>'chapter_id' FROM crawler_jobs \
|
||||||
|
WHERE payload->>'kind' = 'sync_chapter_content' \
|
||||||
|
ORDER BY payload->>'chapter_id'",
|
||||||
|
)
|
||||||
|
.fetch_all(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let mut expected = vec![c1.to_string(), c2.to_string()];
|
||||||
|
expected.sort();
|
||||||
|
assert_eq!(chapter_ids, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn re_bookmark_after_delete_does_not_re_enqueue_pending_jobs(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
|
let _ = seed_chapter_with_source(&pool, manga_id, 1, "target", "ch1", "https://example.com/c1", false).await;
|
||||||
|
|
||||||
|
// First bookmark — should enqueue 1.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/bookmarks",
|
||||||
|
json!({ "manga_id": manga_id.to_string() }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let bookmark_id = common::body_json(resp).await["id"].as_str().unwrap().to_string();
|
||||||
|
assert_eq!(wait_for_pending_count(&pool, 1).await, 1);
|
||||||
|
|
||||||
|
// Delete the bookmark, then re-bookmark — the existing pending job
|
||||||
|
// is still there so the dedup index suppresses the second enqueue.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::delete_with_cookie(
|
||||||
|
&format!("/api/v1/bookmarks/{bookmark_id}"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NO_CONTENT);
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/bookmarks",
|
||||||
|
json!({ "manga_id": manga_id.to_string() }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||||
|
|
||||||
|
// Give the background task time to attempt re-enqueue (it should be a no-op).
|
||||||
|
tokio::time::sleep(std::time::Duration::from_millis(300)).await;
|
||||||
|
let final_count: i64 = sqlx::query_scalar(
|
||||||
|
"SELECT COUNT(*) FROM crawler_jobs \
|
||||||
|
WHERE state IN ('pending', 'running') \
|
||||||
|
AND payload->>'kind' = 'sync_chapter_content'",
|
||||||
|
)
|
||||||
|
.fetch_one(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(final_count, 1, "dedup index keeps the queue at a single in-flight row");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_skips_chapters_with_dropped_sources(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
|
|
||||||
|
let _alive = seed_chapter_with_source(&pool, manga_id, 1, "target", "ch1", "https://example.com/c1", false).await;
|
||||||
|
let _dropped = seed_chapter_with_source(&pool, manga_id, 2, "target", "ch2", "https://example.com/c2", true).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/bookmarks",
|
||||||
|
json!({ "manga_id": manga_id.to_string() }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
wait_for_pending_count(&pool, 1).await,
|
||||||
|
1,
|
||||||
|
"only the chapter with a non-dropped source row gets enqueued"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,10 +12,18 @@ async fn seed_manga(h: &common::Harness, cookie: &str, title: &str) -> Uuid {
|
|||||||
common::seed_manga_via_api(&h.app, cookie, title).await
|
common::seed_manga_via_api(&h.app, cookie, title).await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn seed_chapter(pool: &PgPool, manga_id: Uuid, number: i32, title: Option<&str>) {
|
async fn seed_chapter(
|
||||||
mangalord::repo::chapter::create(pool, manga_id, number, title)
|
pool: &PgPool,
|
||||||
|
manga_id: Uuid,
|
||||||
|
number: i32,
|
||||||
|
title: Option<&str>,
|
||||||
|
) -> Uuid {
|
||||||
|
// Historical seed — uploaded_by remains NULL, mirroring the
|
||||||
|
// pre-Phase-5 rows in the production DB.
|
||||||
|
mangalord::repo::chapter::create(pool, manga_id, number, title, None)
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap()
|
||||||
|
.id
|
||||||
}
|
}
|
||||||
|
|
||||||
#[sqlx::test(migrations = "./migrations")]
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
@@ -79,16 +87,16 @@ async fn list_chapters_returns_404_for_unknown_manga(pool: PgPool) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[sqlx::test(migrations = "./migrations")]
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
async fn get_chapter_by_number(pool: PgPool) {
|
async fn get_chapter_by_id(pool: PgPool) {
|
||||||
let h = common::harness(pool.clone());
|
let h = common::harness(pool.clone());
|
||||||
let (_, cookie) = common::register_user(&h.app).await;
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
let manga_id = seed_manga(&h, &cookie, "Berserk").await;
|
let manga_id = seed_manga(&h, &cookie, "Berserk").await;
|
||||||
seed_chapter(&pool, manga_id, 1, Some("The Brand")).await;
|
let chapter_id = seed_chapter(&pool, manga_id, 1, Some("The Brand")).await;
|
||||||
|
|
||||||
let resp = h
|
let resp = h
|
||||||
.app
|
.app
|
||||||
.oneshot(common::get(&format!(
|
.oneshot(common::get(&format!(
|
||||||
"/api/v1/mangas/{manga_id}/chapters/1"
|
"/api/v1/mangas/{manga_id}/chapters/{chapter_id}"
|
||||||
)))
|
)))
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@@ -97,18 +105,20 @@ async fn get_chapter_by_number(pool: PgPool) {
|
|||||||
assert_eq!(body["number"], 1);
|
assert_eq!(body["number"], 1);
|
||||||
assert_eq!(body["title"], "The Brand");
|
assert_eq!(body["title"], "The Brand");
|
||||||
assert_eq!(body["page_count"], 0);
|
assert_eq!(body["page_count"], 0);
|
||||||
|
assert_eq!(body["id"], chapter_id.to_string());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[sqlx::test(migrations = "./migrations")]
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
async fn get_chapter_unknown_number_is_404(pool: PgPool) {
|
async fn get_chapter_unknown_id_is_404(pool: PgPool) {
|
||||||
let h = common::harness(pool);
|
let h = common::harness(pool);
|
||||||
let (_, cookie) = common::register_user(&h.app).await;
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
let manga_id = seed_manga(&h, &cookie, "Berserk").await;
|
let manga_id = seed_manga(&h, &cookie, "Berserk").await;
|
||||||
|
let unknown_chapter = Uuid::new_v4();
|
||||||
|
|
||||||
let resp = h
|
let resp = h
|
||||||
.app
|
.app
|
||||||
.oneshot(common::get(&format!(
|
.oneshot(common::get(&format!(
|
||||||
"/api/v1/mangas/{manga_id}/chapters/99"
|
"/api/v1/mangas/{manga_id}/chapters/{unknown_chapter}"
|
||||||
)))
|
)))
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@@ -120,10 +130,34 @@ async fn get_chapter_unknown_number_is_404(pool: PgPool) {
|
|||||||
#[sqlx::test(migrations = "./migrations")]
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
async fn get_chapter_unknown_manga_is_404(pool: PgPool) {
|
async fn get_chapter_unknown_manga_is_404(pool: PgPool) {
|
||||||
let h = common::harness(pool);
|
let h = common::harness(pool);
|
||||||
let unknown = Uuid::nil();
|
let unknown_manga = Uuid::nil();
|
||||||
|
let unknown_chapter = Uuid::new_v4();
|
||||||
let resp = h
|
let resp = h
|
||||||
.app
|
.app
|
||||||
.oneshot(common::get(&format!("/api/v1/mangas/{unknown}/chapters/1")))
|
.oneshot(common::get(&format!(
|
||||||
|
"/api/v1/mangas/{unknown_manga}/chapters/{unknown_chapter}"
|
||||||
|
)))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cross-manga isolation: a chapter id belonging to manga A must not
|
||||||
|
/// resolve when accessed via manga B's URL. The (manga_id, id) scoping
|
||||||
|
/// in `find_by_id_in_manga` enforces this.
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn get_chapter_from_wrong_manga_is_404(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_a = seed_manga(&h, &cookie, "Berserk").await;
|
||||||
|
let manga_b = seed_manga(&h, &cookie, "Vagabond").await;
|
||||||
|
let chapter_id = seed_chapter(&pool, manga_a, 1, Some("Episode 1")).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get(&format!(
|
||||||
|
"/api/v1/mangas/{manga_b}/chapters/{chapter_id}"
|
||||||
|
)))
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
@@ -134,12 +168,12 @@ async fn list_pages_empty_for_chapter_without_upload(pool: PgPool) {
|
|||||||
let h = common::harness(pool.clone());
|
let h = common::harness(pool.clone());
|
||||||
let (_, cookie) = common::register_user(&h.app).await;
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
let manga_id = seed_manga(&h, &cookie, "Berserk").await;
|
let manga_id = seed_manga(&h, &cookie, "Berserk").await;
|
||||||
seed_chapter(&pool, manga_id, 1, None).await;
|
let chapter_id = seed_chapter(&pool, manga_id, 1, None).await;
|
||||||
|
|
||||||
let resp = h
|
let resp = h
|
||||||
.app
|
.app
|
||||||
.oneshot(common::get(&format!(
|
.oneshot(common::get(&format!(
|
||||||
"/api/v1/mangas/{manga_id}/chapters/1/pages"
|
"/api/v1/mangas/{manga_id}/chapters/{chapter_id}/pages"
|
||||||
)))
|
)))
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@@ -153,11 +187,12 @@ async fn list_pages_returns_404_for_unknown_chapter(pool: PgPool) {
|
|||||||
let h = common::harness(pool);
|
let h = common::harness(pool);
|
||||||
let (_, cookie) = common::register_user(&h.app).await;
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
let manga_id = seed_manga(&h, &cookie, "Berserk").await;
|
let manga_id = seed_manga(&h, &cookie, "Berserk").await;
|
||||||
|
let unknown_chapter = Uuid::new_v4();
|
||||||
|
|
||||||
let resp = h
|
let resp = h
|
||||||
.app
|
.app
|
||||||
.oneshot(common::get(&format!(
|
.oneshot(common::get(&format!(
|
||||||
"/api/v1/mangas/{manga_id}/chapters/99/pages"
|
"/api/v1/mangas/{manga_id}/chapters/{unknown_chapter}/pages"
|
||||||
)))
|
)))
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|||||||
605
backend/tests/api_collections.rs
Normal file
605
backend/tests/api_collections.rs
Normal file
@@ -0,0 +1,605 @@
|
|||||||
|
mod common;
|
||||||
|
|
||||||
|
use axum::http::StatusCode;
|
||||||
|
use serde_json::{json, Value};
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tower::ServiceExt;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
async fn create_collection(
|
||||||
|
app: &axum::Router,
|
||||||
|
cookie: &str,
|
||||||
|
name: &str,
|
||||||
|
) -> Value {
|
||||||
|
let resp = app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/collections",
|
||||||
|
json!({ "name": name }),
|
||||||
|
cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CREATED, "create_collection failed");
|
||||||
|
common::body_json(resp).await
|
||||||
|
}
|
||||||
|
|
||||||
|
fn id_of(v: &Value) -> String {
|
||||||
|
v["id"].as_str().unwrap().to_string()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_then_list_returns_only_own(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie_a) = common::register_user(&h.app).await;
|
||||||
|
let (_, cookie_b) = common::register_user(&h.app).await;
|
||||||
|
|
||||||
|
let _favs = create_collection(&h.app, &cookie_a, "Favorites").await;
|
||||||
|
let _read = create_collection(&h.app, &cookie_a, "Reading List").await;
|
||||||
|
|
||||||
|
// User B sees an empty list.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/me/collections", &cookie_b))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["items"], json!([]));
|
||||||
|
assert_eq!(body["page"]["total"], 0);
|
||||||
|
|
||||||
|
// User A sees both.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/me/collections", &cookie_a))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let names: Vec<&str> = body["items"]
|
||||||
|
.as_array()
|
||||||
|
.unwrap()
|
||||||
|
.iter()
|
||||||
|
.map(|c| c["name"].as_str().unwrap())
|
||||||
|
.collect();
|
||||||
|
// Newest-updated first; both rows have the same updated_at on
|
||||||
|
// create so we just sanity-check membership.
|
||||||
|
assert_eq!(names.len(), 2);
|
||||||
|
assert!(names.contains(&"Favorites"));
|
||||||
|
assert!(names.contains(&"Reading List"));
|
||||||
|
// Empty collections render with manga_count 0 and an empty
|
||||||
|
// sample_covers array, not `null`.
|
||||||
|
for item in body["items"].as_array().unwrap() {
|
||||||
|
assert_eq!(item["manga_count"], 0);
|
||||||
|
assert_eq!(item["sample_covers"], json!([]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn duplicate_name_for_same_user_is_409(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let _ = create_collection(&h.app, &cookie, "Favorites").await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/collections",
|
||||||
|
json!({ "name": "favorites" }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CONFLICT);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "conflict");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn two_users_can_share_a_collection_name(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, a) = common::register_user(&h.app).await;
|
||||||
|
let (_, b) = common::register_user(&h.app).await;
|
||||||
|
let _ = create_collection(&h.app, &a, "Favorites").await;
|
||||||
|
// No conflict — uniqueness is per-(user_id, lower(name)).
|
||||||
|
let _ = create_collection(&h.app, &b, "Favorites").await;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_requires_authentication(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/collections",
|
||||||
|
json!({ "name": "Anon" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn create_rejects_blank_name_with_422(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/collections",
|
||||||
|
json!({ "name": " " }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNPROCESSABLE_ENTITY);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn get_one_returns_404_for_non_owner_no_existence_leak(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, a) = common::register_user(&h.app).await;
|
||||||
|
let (_, b) = common::register_user(&h.app).await;
|
||||||
|
let coll = create_collection(&h.app, &a, "Favorites").await;
|
||||||
|
let id = id_of(&coll);
|
||||||
|
|
||||||
|
// Owner-mismatch is collapsed to 404 so the API doesn't disclose
|
||||||
|
// collection existence to non-owners. Otherwise an attacker could
|
||||||
|
// distinguish "exists, not yours" from "doesn't exist" by status.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{id}"),
|
||||||
|
&b,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn add_manga_is_idempotent_and_picks_201_then_200(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
|
let coll = create_collection(&h.app, &cookie, "Favorites").await;
|
||||||
|
let coll_id = id_of(&coll);
|
||||||
|
|
||||||
|
let req = || {
|
||||||
|
common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{coll_id}/mangas"),
|
||||||
|
json!({ "manga_id": manga_id.to_string() }),
|
||||||
|
&cookie,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
|
let first = h.app.clone().oneshot(req()).await.unwrap();
|
||||||
|
assert_eq!(first.status(), StatusCode::CREATED);
|
||||||
|
let second = h.app.oneshot(req()).await.unwrap();
|
||||||
|
assert_eq!(second.status(), StatusCode::OK);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn add_manga_returns_404_when_manga_missing(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let coll = create_collection(&h.app, &cookie, "Favorites").await;
|
||||||
|
let coll_id = id_of(&coll);
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{coll_id}/mangas"),
|
||||||
|
json!({ "manga_id": Uuid::new_v4().to_string() }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn add_manga_to_someone_elses_collection_is_404(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, a) = common::register_user(&h.app).await;
|
||||||
|
let (_, b) = common::register_user(&h.app).await;
|
||||||
|
let coll_a = create_collection(&h.app, &a, "Mine").await;
|
||||||
|
let coll_a_id = id_of(&coll_a);
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &b, "Anything").await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{coll_a_id}/mangas"),
|
||||||
|
json!({ "manga_id": manga_id.to_string() }),
|
||||||
|
&b,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
// 404 not 403 — same non-existence-leak rationale as `get_one`.
|
||||||
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn patch_on_other_users_collection_is_404(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, a) = common::register_user(&h.app).await;
|
||||||
|
let (_, b) = common::register_user(&h.app).await;
|
||||||
|
let coll = create_collection(&h.app, &a, "Mine").await;
|
||||||
|
let id = id_of(&coll);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{id}"),
|
||||||
|
json!({ "name": "Hijacked" }),
|
||||||
|
&b,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn patch_description_null_clears_existing_value(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let coll = create_collection(&h.app, &cookie, "C").await;
|
||||||
|
let id = id_of(&coll);
|
||||||
|
// Seed a description first via PATCH.
|
||||||
|
let _ = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{id}"),
|
||||||
|
json!({ "description": "starting desc" }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
// Now PATCH with description=null and expect the column cleared.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{id}"),
|
||||||
|
json!({ "description": null }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert!(body["description"].is_null(), "expected description cleared");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn patch_description_empty_string_sets_empty_not_null(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let coll = create_collection(&h.app, &cookie, "C").await;
|
||||||
|
let id = id_of(&coll);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{id}"),
|
||||||
|
json!({ "description": "" }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
// Empty string is a valid distinct value; only `null` clears.
|
||||||
|
assert_eq!(body["description"], "");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn patch_description_omitted_leaves_value_intact(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let coll = create_collection(&h.app, &cookie, "C").await;
|
||||||
|
let id = id_of(&coll);
|
||||||
|
let _ = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{id}"),
|
||||||
|
json!({ "description": "Keep me" }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
// PATCH that doesn't mention description must not touch it.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{id}"),
|
||||||
|
json!({ "name": "Renamed" }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["name"], "Renamed");
|
||||||
|
assert_eq!(body["description"], "Keep me");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn patch_with_empty_body_leaves_row_unchanged(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let coll = create_collection(&h.app, &cookie, "Stable").await;
|
||||||
|
let id = id_of(&coll);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{id}"),
|
||||||
|
json!({}),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["name"], "Stable");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn my_collections_for_unknown_manga_returns_empty_list(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{}/my-collections", Uuid::new_v4()),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
// Non-existent manga is treated the same as a manga the user
|
||||||
|
// hasn't collected — empty list. The handler comment documents
|
||||||
|
// this; the test pins it.
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["collection_ids"], json!([]));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn list_mangas_returns_collection_contents(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let m1 = common::seed_manga_via_api(&h.app, &cookie, "First").await;
|
||||||
|
let m2 = common::seed_manga_via_api(&h.app, &cookie, "Second").await;
|
||||||
|
let _untagged = common::seed_manga_via_api(&h.app, &cookie, "NotInIt").await;
|
||||||
|
let coll = create_collection(&h.app, &cookie, "Mix").await;
|
||||||
|
let coll_id = id_of(&coll);
|
||||||
|
|
||||||
|
for m in [m1, m2] {
|
||||||
|
let r = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{coll_id}/mangas"),
|
||||||
|
json!({ "manga_id": m.to_string() }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(r.status(), StatusCode::CREATED);
|
||||||
|
}
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{coll_id}/mangas"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let titles: Vec<&str> = body["items"]
|
||||||
|
.as_array()
|
||||||
|
.unwrap()
|
||||||
|
.iter()
|
||||||
|
.map(|m| m["title"].as_str().unwrap())
|
||||||
|
.collect();
|
||||||
|
// Newest-added first.
|
||||||
|
assert_eq!(titles, vec!["Second", "First"]);
|
||||||
|
assert_eq!(body["page"]["total"], 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn remove_manga_is_idempotent(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "M").await;
|
||||||
|
let coll = create_collection(&h.app, &cookie, "C").await;
|
||||||
|
let coll_id = id_of(&coll);
|
||||||
|
|
||||||
|
let _ = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{coll_id}/mangas"),
|
||||||
|
json!({ "manga_id": manga_id.to_string() }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let first = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::delete_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{coll_id}/mangas/{manga_id}"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(first.status(), StatusCode::NO_CONTENT);
|
||||||
|
// Removing again is still a 204 — DELETE is idempotent.
|
||||||
|
let second = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::delete_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{coll_id}/mangas/{manga_id}"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(second.status(), StatusCode::NO_CONTENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn my_collections_for_manga_lists_only_owned_containing(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, a) = common::register_user(&h.app).await;
|
||||||
|
let (_, b) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &a, "X").await;
|
||||||
|
|
||||||
|
let a_coll = create_collection(&h.app, &a, "A's").await;
|
||||||
|
let b_coll = create_collection(&h.app, &b, "B's").await;
|
||||||
|
let a_coll_id = id_of(&a_coll);
|
||||||
|
let b_coll_id = id_of(&b_coll);
|
||||||
|
|
||||||
|
for (coll, cookie) in [(&a_coll_id, &a), (&b_coll_id, &b)] {
|
||||||
|
let _ = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{coll}/mangas"),
|
||||||
|
json!({ "manga_id": manga_id.to_string() }),
|
||||||
|
cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{manga_id}/my-collections"),
|
||||||
|
&a,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let ids: Vec<&str> = body["collection_ids"]
|
||||||
|
.as_array()
|
||||||
|
.unwrap()
|
||||||
|
.iter()
|
||||||
|
.map(|v| v.as_str().unwrap())
|
||||||
|
.collect();
|
||||||
|
assert_eq!(ids, vec![a_coll_id.as_str()]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn patch_collection_updates_name_and_description(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let coll = create_collection(&h.app, &cookie, "Old name").await;
|
||||||
|
let id = id_of(&coll);
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{id}"),
|
||||||
|
json!({ "name": "New name", "description": "Some notes" }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["name"], "New name");
|
||||||
|
assert_eq!(body["description"], "Some notes");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn delete_collection_cascades_attachments(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "M").await;
|
||||||
|
let coll = create_collection(&h.app, &cookie, "C").await;
|
||||||
|
let coll_id = id_of(&coll);
|
||||||
|
|
||||||
|
let _ = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{coll_id}/mangas"),
|
||||||
|
json!({ "manga_id": manga_id.to_string() }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::delete_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{coll_id}"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NO_CONTENT);
|
||||||
|
|
||||||
|
let (count,): (i64,) =
|
||||||
|
sqlx::query_as("SELECT count(*) FROM collection_mangas WHERE collection_id = $1")
|
||||||
|
.bind(Uuid::parse_str(&coll_id).unwrap())
|
||||||
|
.fetch_one(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(count, 0, "collection_mangas should cascade-delete with the collection");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn list_summary_carries_sample_covers_when_mangas_attached(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
|
||||||
|
// Seed a manga with a cover via the upload endpoint so the
|
||||||
|
// cover_image_path column gets populated.
|
||||||
|
let make_metadata = |title: &str| {
|
||||||
|
common::MultipartBuilder::new()
|
||||||
|
.add_json("metadata", json!({ "title": title }))
|
||||||
|
.add_file("cover", "cover.png", "image/png", &common::fake_png_bytes())
|
||||||
|
};
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_multipart_with_cookie(
|
||||||
|
"/api/v1/mangas",
|
||||||
|
make_metadata("With cover"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let manga_id = body["id"].as_str().unwrap().to_string();
|
||||||
|
|
||||||
|
let coll = create_collection(&h.app, &cookie, "Visual").await;
|
||||||
|
let coll_id = id_of(&coll);
|
||||||
|
let r = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/collections/{coll_id}/mangas"),
|
||||||
|
json!({ "manga_id": manga_id }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(r.status(), StatusCode::CREATED);
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/me/collections", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let item = &body["items"][0];
|
||||||
|
assert_eq!(item["manga_count"], 1);
|
||||||
|
let covers = item["sample_covers"].as_array().unwrap();
|
||||||
|
assert_eq!(covers.len(), 1);
|
||||||
|
assert!(covers[0]
|
||||||
|
.as_str()
|
||||||
|
.unwrap()
|
||||||
|
.starts_with(&format!("mangas/{manga_id}/cover")));
|
||||||
|
}
|
||||||
405
backend/tests/api_history.rs
Normal file
405
backend/tests/api_history.rs
Normal file
@@ -0,0 +1,405 @@
|
|||||||
|
mod common;
|
||||||
|
|
||||||
|
use axum::http::StatusCode;
|
||||||
|
use serde_json::{json, Value};
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tower::ServiceExt;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use common::MultipartBuilder;
|
||||||
|
|
||||||
|
async fn seed_chapter(app: &axum::Router, cookie: &str, manga_id: Uuid, number: i32) -> String {
|
||||||
|
let resp = app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_multipart_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{manga_id}/chapters"),
|
||||||
|
MultipartBuilder::new()
|
||||||
|
.add_json("metadata", json!({ "number": number }))
|
||||||
|
.add_file("page", "1.png", "image/png", &common::fake_png_bytes()),
|
||||||
|
cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
body["id"].as_str().unwrap().to_string()
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn upsert_progress(
|
||||||
|
app: &axum::Router,
|
||||||
|
cookie: &str,
|
||||||
|
body: Value,
|
||||||
|
) -> Value {
|
||||||
|
let resp = app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::put_json_with_cookie(
|
||||||
|
"/api/v1/me/read-progress",
|
||||||
|
body,
|
||||||
|
cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK, "upsert failed: {:?}", resp.status());
|
||||||
|
common::body_json(resp).await
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn upsert_creates_then_overwrites(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
|
let chapter_id = seed_chapter(&h.app, &cookie, manga_id, 1).await;
|
||||||
|
|
||||||
|
let first = upsert_progress(
|
||||||
|
&h.app,
|
||||||
|
&cookie,
|
||||||
|
json!({ "manga_id": manga_id.to_string(), "chapter_id": chapter_id, "page": 5 }),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
assert_eq!(first["manga_id"], manga_id.to_string());
|
||||||
|
assert_eq!(first["page"], 5);
|
||||||
|
|
||||||
|
// A second upsert overwrites the page even when it moves backwards
|
||||||
|
// — re-reading scenarios just take the latest write.
|
||||||
|
let second = upsert_progress(
|
||||||
|
&h.app,
|
||||||
|
&cookie,
|
||||||
|
json!({ "manga_id": manga_id.to_string(), "chapter_id": chapter_id, "page": 1 }),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
assert_eq!(second["page"], 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn upsert_with_unknown_manga_is_404(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::put_json_with_cookie(
|
||||||
|
"/api/v1/me/read-progress",
|
||||||
|
json!({ "manga_id": Uuid::new_v4().to_string(), "page": 1 }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
// The FK violation in repo::upsert is mapped to NotFound.
|
||||||
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn upsert_with_page_zero_is_422(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::put_json_with_cookie(
|
||||||
|
"/api/v1/me/read-progress",
|
||||||
|
json!({ "manga_id": manga_id.to_string(), "page": 0 }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNPROCESSABLE_ENTITY);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn list_orders_most_recent_first(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let m1 = common::seed_manga_via_api(&h.app, &cookie, "First").await;
|
||||||
|
let m2 = common::seed_manga_via_api(&h.app, &cookie, "Second").await;
|
||||||
|
|
||||||
|
let _ = upsert_progress(
|
||||||
|
&h.app,
|
||||||
|
&cookie,
|
||||||
|
json!({ "manga_id": m1.to_string(), "page": 1 }),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let _ = upsert_progress(
|
||||||
|
&h.app,
|
||||||
|
&cookie,
|
||||||
|
json!({ "manga_id": m2.to_string(), "page": 1 }),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/me/read-progress", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let titles: Vec<&str> = body["items"]
|
||||||
|
.as_array()
|
||||||
|
.unwrap()
|
||||||
|
.iter()
|
||||||
|
.map(|r| r["manga_title"].as_str().unwrap())
|
||||||
|
.collect();
|
||||||
|
// Second was upserted last → it surfaces first.
|
||||||
|
assert_eq!(titles, vec!["Second", "First"]);
|
||||||
|
assert_eq!(body["page"]["total"], 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn list_is_per_user_only(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, a) = common::register_user(&h.app).await;
|
||||||
|
let (_, b) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &a, "Berserk").await;
|
||||||
|
let _ = upsert_progress(
|
||||||
|
&h.app,
|
||||||
|
&a,
|
||||||
|
json!({ "manga_id": manga_id.to_string(), "page": 7 }),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/me/read-progress", &b))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["items"], json!([]));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn get_single_manga_returns_404_when_unread(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
&format!("/api/v1/me/read-progress/{manga_id}"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn get_single_manga_returns_progress_after_upsert(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
|
let chapter_id = seed_chapter(&h.app, &cookie, manga_id, 7).await;
|
||||||
|
let _ = upsert_progress(
|
||||||
|
&h.app,
|
||||||
|
&cookie,
|
||||||
|
json!({
|
||||||
|
"manga_id": manga_id.to_string(),
|
||||||
|
"chapter_id": chapter_id,
|
||||||
|
"page": 12
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie(
|
||||||
|
&format!("/api/v1/me/read-progress/{manga_id}"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["page"], 12);
|
||||||
|
// chapter_number is resolved in the same round-trip so the
|
||||||
|
// Continue CTA can render without listing chapters.
|
||||||
|
assert_eq!(body["chapter_number"], 7);
|
||||||
|
assert_eq!(body["chapter_id"], chapter_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn upsert_rejects_chapter_from_a_different_manga(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_a = common::seed_manga_via_api(&h.app, &cookie, "A").await;
|
||||||
|
let manga_b = common::seed_manga_via_api(&h.app, &cookie, "B").await;
|
||||||
|
let chapter_of_b = seed_chapter(&h.app, &cookie, manga_b, 1).await;
|
||||||
|
|
||||||
|
// Pair manga A with a chapter from manga B — must be rejected.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::put_json_with_cookie(
|
||||||
|
"/api/v1/me/read-progress",
|
||||||
|
json!({
|
||||||
|
"manga_id": manga_a.to_string(),
|
||||||
|
"chapter_id": chapter_of_b,
|
||||||
|
"page": 1
|
||||||
|
}),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNPROCESSABLE_ENTITY);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "validation_failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn delete_progress_on_never_read_manga_is_204(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Untouched").await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::delete_with_cookie(
|
||||||
|
&format!("/api/v1/me/read-progress/{manga_id}"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
// DELETE is idempotent — clearing nothing is still success.
|
||||||
|
assert_eq!(resp.status(), StatusCode::NO_CONTENT);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn delete_progress_is_idempotent(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
|
let _ = upsert_progress(
|
||||||
|
&h.app,
|
||||||
|
&cookie,
|
||||||
|
json!({ "manga_id": manga_id.to_string(), "page": 1 }),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
for _ in 0..2 {
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::delete_with_cookie(
|
||||||
|
&format!("/api/v1/me/read-progress/{manga_id}"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NO_CONTENT);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn deleted_chapter_leaves_progress_row_with_null_chapter(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
|
let chapter_id_str = seed_chapter(&h.app, &cookie, manga_id, 1).await;
|
||||||
|
let chapter_id = Uuid::parse_str(&chapter_id_str).unwrap();
|
||||||
|
let _ = upsert_progress(
|
||||||
|
&h.app,
|
||||||
|
&cookie,
|
||||||
|
json!({ "manga_id": manga_id.to_string(), "chapter_id": chapter_id_str, "page": 3 }),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
// Delete the chapter directly — the FK ON DELETE SET NULL keeps
|
||||||
|
// the progress row but clears chapter_id.
|
||||||
|
sqlx::query("DELETE FROM chapters WHERE id = $1")
|
||||||
|
.bind(chapter_id)
|
||||||
|
.execute(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/me/read-progress", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let item = &body["items"][0];
|
||||||
|
assert!(item["chapter_id"].is_null(), "chapter_id should be null after cascade");
|
||||||
|
assert!(item["chapter_number"].is_null());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn uploads_lists_manga_and_chapter_uploads_interleaved(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
|
||||||
|
// Two manga uploads with covers, then a chapter on one of them.
|
||||||
|
let m1 = common::seed_manga_via_api(&h.app, &cookie, "Alpha").await;
|
||||||
|
let _m2 = common::seed_manga_via_api(&h.app, &cookie, "Beta").await;
|
||||||
|
let _ = seed_chapter(&h.app, &cookie, m1, 1).await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/me/uploads", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let items = body["items"].as_array().unwrap();
|
||||||
|
assert_eq!(items.len(), 3);
|
||||||
|
// Most recent first; the chapter upload happened after both mangas.
|
||||||
|
assert_eq!(items[0]["kind"], "chapter");
|
||||||
|
assert_eq!(items[1]["kind"], "manga");
|
||||||
|
assert_eq!(items[2]["kind"], "manga");
|
||||||
|
assert_eq!(body["page"]["total"], 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn uploads_is_per_user_only(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, a) = common::register_user(&h.app).await;
|
||||||
|
let (_, b) = common::register_user(&h.app).await;
|
||||||
|
let _ = common::seed_manga_via_api(&h.app, &a, "A's manga").await;
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/me/uploads", &b))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["items"], json!([]));
|
||||||
|
assert_eq!(body["page"]["total"], 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn manga_create_stamps_uploaded_by_with_current_user(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Stamped").await;
|
||||||
|
|
||||||
|
let (uploaded_by,): (Option<Uuid>,) =
|
||||||
|
sqlx::query_as("SELECT uploaded_by FROM mangas WHERE id = $1")
|
||||||
|
.bind(manga_id)
|
||||||
|
.fetch_one(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert!(uploaded_by.is_some(), "manga.uploaded_by should be set");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn chapter_create_stamps_uploaded_by_with_current_user(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
|
let chapter_id_str = seed_chapter(&h.app, &cookie, manga_id, 1).await;
|
||||||
|
|
||||||
|
let (uploaded_by,): (Option<Uuid>,) =
|
||||||
|
sqlx::query_as("SELECT uploaded_by FROM chapters WHERE id = $1")
|
||||||
|
.bind(Uuid::parse_str(&chapter_id_str).unwrap())
|
||||||
|
.fetch_one(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert!(uploaded_by.is_some(), "chapter.uploaded_by should be set");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn read_progress_requires_authentication(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
for path in [
|
||||||
|
"/api/v1/me/read-progress",
|
||||||
|
"/api/v1/me/uploads",
|
||||||
|
] {
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::get(path))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED, "{path} should require auth");
|
||||||
|
}
|
||||||
|
}
|
||||||
462
backend/tests/api_mangas_cover.rs
Normal file
462
backend/tests/api_mangas_cover.rs
Normal file
@@ -0,0 +1,462 @@
|
|||||||
|
mod common;
|
||||||
|
|
||||||
|
use axum::http::StatusCode;
|
||||||
|
use serde_json::{json, Value};
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tower::ServiceExt;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use common::{
|
||||||
|
body_json, delete_with_cookie, fake_jpeg_bytes, fake_png_bytes, get, harness,
|
||||||
|
post_multipart_with_cookie, put_multipart, put_multipart_with_cookie, register_user,
|
||||||
|
MultipartBuilder,
|
||||||
|
};
|
||||||
|
|
||||||
|
async fn create_manga_with_cover(
|
||||||
|
app: &axum::Router,
|
||||||
|
cookie: &str,
|
||||||
|
title: &str,
|
||||||
|
cover: Option<(&str, &[u8])>,
|
||||||
|
) -> Value {
|
||||||
|
let mut form =
|
||||||
|
MultipartBuilder::new().add_json("metadata", json!({ "title": title }));
|
||||||
|
if let Some((ct, bytes)) = cover {
|
||||||
|
form = form.add_file("cover", "cover.bin", ct, bytes);
|
||||||
|
}
|
||||||
|
let resp = app
|
||||||
|
.clone()
|
||||||
|
.oneshot(post_multipart_with_cookie("/api/v1/mangas", form, cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(
|
||||||
|
resp.status(),
|
||||||
|
StatusCode::CREATED,
|
||||||
|
"seed create_manga failed: {:?}",
|
||||||
|
resp.status()
|
||||||
|
);
|
||||||
|
body_json(resp).await
|
||||||
|
}
|
||||||
|
|
||||||
|
fn id_of(body: &Value) -> Uuid {
|
||||||
|
Uuid::parse_str(body["id"].as_str().unwrap()).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn cover_form(bytes: &[u8]) -> MultipartBuilder {
|
||||||
|
MultipartBuilder::new().add_file("cover", "cover.bin", "application/octet-stream", bytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn put_cover_sets_path_when_none_existed(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, cookie) = register_user(&h.app).await;
|
||||||
|
let manga = create_manga_with_cover(&h.app, &cookie, "Cover Me", None).await;
|
||||||
|
let id = id_of(&manga);
|
||||||
|
assert!(manga["cover_image_path"].is_null());
|
||||||
|
|
||||||
|
let bytes = fake_png_bytes();
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(put_multipart_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
cover_form(&bytes),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = body_json(resp).await;
|
||||||
|
let expected_key = format!("mangas/{id}/cover.png");
|
||||||
|
assert_eq!(body["cover_image_path"], expected_key);
|
||||||
|
assert_eq!(body["title"], "Cover Me");
|
||||||
|
|
||||||
|
let file_resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get(&format!("/api/v1/files/{expected_key}")))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(file_resp.status(), StatusCode::OK);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn put_cover_replaces_existing_same_extension(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, cookie) = register_user(&h.app).await;
|
||||||
|
let original = fake_png_bytes();
|
||||||
|
let manga = create_manga_with_cover(
|
||||||
|
&h.app,
|
||||||
|
&cookie,
|
||||||
|
"Replace Me",
|
||||||
|
Some(("image/png", &original)),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let id = id_of(&manga);
|
||||||
|
let original_key = format!("mangas/{id}/cover.png");
|
||||||
|
assert_eq!(manga["cover_image_path"], original_key);
|
||||||
|
|
||||||
|
let mut replacement = fake_png_bytes();
|
||||||
|
replacement.extend_from_slice(b"-replacement-marker");
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(put_multipart_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
cover_form(&replacement),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = body_json(resp).await;
|
||||||
|
assert_eq!(body["cover_image_path"], original_key);
|
||||||
|
|
||||||
|
let file_resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get(&format!("/api/v1/files/{original_key}")))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(file_resp.status(), StatusCode::OK);
|
||||||
|
let body_bytes = http_body_util::BodyExt::collect(file_resp.into_body())
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.to_bytes();
|
||||||
|
assert_eq!(body_bytes.as_ref(), replacement.as_slice());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn put_cover_replaces_existing_different_extension_and_deletes_old_blob(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, cookie) = register_user(&h.app).await;
|
||||||
|
let png = fake_png_bytes();
|
||||||
|
let manga = create_manga_with_cover(
|
||||||
|
&h.app,
|
||||||
|
&cookie,
|
||||||
|
"Switch Ext",
|
||||||
|
Some(("image/png", &png)),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let id = id_of(&manga);
|
||||||
|
let old_key = format!("mangas/{id}/cover.png");
|
||||||
|
assert_eq!(manga["cover_image_path"], old_key);
|
||||||
|
|
||||||
|
let jpeg = fake_jpeg_bytes();
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(put_multipart_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
cover_form(&jpeg),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = body_json(resp).await;
|
||||||
|
let new_key = format!("mangas/{id}/cover.jpg");
|
||||||
|
assert_eq!(body["cover_image_path"], new_key);
|
||||||
|
|
||||||
|
let new_file = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get(&format!("/api/v1/files/{new_key}")))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(new_file.status(), StatusCode::OK);
|
||||||
|
|
||||||
|
let old_file = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get(&format!("/api/v1/files/{old_key}")))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(old_file.status(), StatusCode::NOT_FOUND);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn put_cover_rejects_unauthenticated(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, cookie) = register_user(&h.app).await;
|
||||||
|
let manga = create_manga_with_cover(&h.app, &cookie, "Public Read", None).await;
|
||||||
|
let id = id_of(&manga);
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(put_multipart(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
cover_form(&fake_png_bytes()),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn put_cover_404_on_unknown_id(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, cookie) = register_user(&h.app).await;
|
||||||
|
let id = Uuid::new_v4();
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(put_multipart_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
cover_form(&fake_png_bytes()),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn put_cover_rejects_non_image_with_unsupported_media_type(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, cookie) = register_user(&h.app).await;
|
||||||
|
let manga = create_manga_with_cover(&h.app, &cookie, "Not Image", None).await;
|
||||||
|
let id = id_of(&manga);
|
||||||
|
|
||||||
|
let pdf = b"%PDF-1.4\n%\xc4\xe5".to_vec();
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(put_multipart_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
cover_form(&pdf),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNSUPPORTED_MEDIA_TYPE);
|
||||||
|
let body = body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "unsupported_media_type");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn put_cover_rejects_oversized(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, cookie) = register_user(&h.app).await;
|
||||||
|
let manga = create_manga_with_cover(&h.app, &cookie, "Too Big", None).await;
|
||||||
|
let id = id_of(&manga);
|
||||||
|
|
||||||
|
// Harness max_file_bytes is 256 KiB; 300 KiB trips the cap.
|
||||||
|
let mut bytes = fake_png_bytes();
|
||||||
|
bytes.resize(300 * 1024, 0);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(put_multipart_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
cover_form(&bytes),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::PAYLOAD_TOO_LARGE);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn put_cover_rejects_missing_cover_part(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, cookie) = register_user(&h.app).await;
|
||||||
|
let manga = create_manga_with_cover(&h.app, &cookie, "Empty Form", None).await;
|
||||||
|
let id = id_of(&manga);
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(put_multipart_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
MultipartBuilder::new(),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNPROCESSABLE_ENTITY);
|
||||||
|
let body = body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "validation_failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn put_cover_preserves_other_metadata(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, cookie) = register_user(&h.app).await;
|
||||||
|
let manga = create_manga_with_cover(
|
||||||
|
&h.app,
|
||||||
|
&cookie,
|
||||||
|
"Keep My Fields",
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let id = id_of(&manga);
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(put_multipart_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
cover_form(&fake_png_bytes()),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = body_json(resp).await;
|
||||||
|
assert_eq!(body["title"], "Keep My Fields");
|
||||||
|
assert_eq!(body["status"], "ongoing");
|
||||||
|
assert_eq!(body["authors"], json!([]));
|
||||||
|
assert_eq!(body["genres"], json!([]));
|
||||||
|
assert_eq!(body["tags"], json!([]));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn delete_cover_clears_path_and_removes_blob(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, cookie) = register_user(&h.app).await;
|
||||||
|
let png = fake_png_bytes();
|
||||||
|
let manga = create_manga_with_cover(
|
||||||
|
&h.app,
|
||||||
|
&cookie,
|
||||||
|
"Bye Cover",
|
||||||
|
Some(("image/png", &png)),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let id = id_of(&manga);
|
||||||
|
let key = format!("mangas/{id}/cover.png");
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(delete_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = body_json(resp).await;
|
||||||
|
assert!(body["cover_image_path"].is_null());
|
||||||
|
assert_eq!(body["title"], "Bye Cover");
|
||||||
|
|
||||||
|
let file_resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(get(&format!("/api/v1/files/{key}")))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(file_resp.status(), StatusCode::NOT_FOUND);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn delete_cover_is_idempotent_when_no_cover_present(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, cookie) = register_user(&h.app).await;
|
||||||
|
let manga = create_manga_with_cover(&h.app, &cookie, "Never Had One", None).await;
|
||||||
|
let id = id_of(&manga);
|
||||||
|
|
||||||
|
for _ in 0..2 {
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(delete_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = body_json(resp).await;
|
||||||
|
assert!(body["cover_image_path"].is_null());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn delete_cover_rejects_unauthenticated(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, cookie) = register_user(&h.app).await;
|
||||||
|
let manga = create_manga_with_cover(&h.app, &cookie, "Locked", None).await;
|
||||||
|
let id = id_of(&manga);
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(
|
||||||
|
axum::http::Request::builder()
|
||||||
|
.method("DELETE")
|
||||||
|
.uri(format!("/api/v1/mangas/{id}/cover"))
|
||||||
|
.body(axum::body::Body::empty())
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn delete_cover_404_on_unknown_id(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, cookie) = register_user(&h.app).await;
|
||||||
|
let id = Uuid::new_v4();
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(delete_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Authz: PUT /mangas/:id/cover must be uploader-only.
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn put_cover_forbidden_for_non_uploader(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, owner_cookie) = register_user(&h.app).await;
|
||||||
|
let (_, intruder_cookie) = register_user(&h.app).await;
|
||||||
|
|
||||||
|
let manga =
|
||||||
|
create_manga_with_cover(&h.app, &owner_cookie, "Mine", None).await;
|
||||||
|
let id = id_of(&manga);
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(put_multipart_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
cover_form(&fake_png_bytes()),
|
||||||
|
&intruder_cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Authz: DELETE /mangas/:id/cover must be uploader-only.
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn delete_cover_forbidden_for_non_uploader(pool: PgPool) {
|
||||||
|
let h = harness(pool);
|
||||||
|
let (_, owner_cookie) = register_user(&h.app).await;
|
||||||
|
let (_, intruder_cookie) = register_user(&h.app).await;
|
||||||
|
|
||||||
|
let manga = create_manga_with_cover(
|
||||||
|
&h.app,
|
||||||
|
&owner_cookie,
|
||||||
|
"Mine",
|
||||||
|
Some(("image/jpeg", &fake_jpeg_bytes())),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let id = id_of(&manga);
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(delete_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}/cover"),
|
||||||
|
&intruder_cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
@@ -566,3 +566,78 @@ async fn patch_requires_authentication(pool: PgPool) {
|
|||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A signed-in user who didn't upload the manga must not be able to
|
||||||
|
/// PATCH it. Without the uploader-gate this returned 200 — see
|
||||||
|
/// REVIEW.md "manga PATCH / cover endpoints don't check ownership".
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn patch_forbidden_for_non_uploader(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, owner_cookie) = common::register_user(&h.app).await;
|
||||||
|
let (_, intruder_cookie) = common::register_user(&h.app).await;
|
||||||
|
|
||||||
|
let created = create_manga(&h.app, &owner_cookie, json!({ "title": "Mine" })).await;
|
||||||
|
let id = id_of(&created);
|
||||||
|
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}"),
|
||||||
|
json!({ "status": "completed" }),
|
||||||
|
&intruder_cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Owner can still edit their own manga (regression guard for the
|
||||||
|
/// authz fix).
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn patch_allowed_for_uploader(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let created = create_manga(&h.app, &cookie, json!({ "title": "Owned" })).await;
|
||||||
|
let id = id_of(&created);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}"),
|
||||||
|
json!({ "status": "completed" }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Legacy rows with `uploaded_by IS NULL` (created before migration
|
||||||
|
/// 0011) remain editable by any signed-in user. Without this carve-out
|
||||||
|
/// the historical-data note in 0011 would be broken.
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn patch_allowed_on_legacy_null_uploader(pool: PgPool) {
|
||||||
|
let h = common::harness(pool.clone());
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let created = create_manga(&h.app, &cookie, json!({ "title": "Legacy" })).await;
|
||||||
|
let id = id_of(&created);
|
||||||
|
|
||||||
|
// Simulate a row uploaded before the column existed: clear
|
||||||
|
// uploaded_by directly via SQL.
|
||||||
|
sqlx::query("UPDATE mangas SET uploaded_by = NULL WHERE id = $1")
|
||||||
|
.bind(id)
|
||||||
|
.execute(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let (_, other_cookie) = common::register_user(&h.app).await;
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::patch_json_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{id}"),
|
||||||
|
json!({ "status": "completed" }),
|
||||||
|
&other_cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
}
|
||||||
|
|||||||
189
backend/tests/api_private_mode.rs
Normal file
189
backend/tests/api_private_mode.rs
Normal file
@@ -0,0 +1,189 @@
|
|||||||
|
//! Site-wide auth gate (`PRIVATE_MODE=true`).
|
||||||
|
//!
|
||||||
|
//! With private mode on, every API path except a small allowlist
|
||||||
|
//! (`/health`, `/auth/config`, `/auth/login`, `/auth/logout`) requires
|
||||||
|
//! a valid session cookie or bearer token, and `/auth/register` is
|
||||||
|
//! force-blocked regardless of `ALLOW_SELF_REGISTER`. With private mode
|
||||||
|
//! off (the default), nothing changes — the `public_mode_*` test
|
||||||
|
//! pins that regression guard.
|
||||||
|
|
||||||
|
mod common;
|
||||||
|
|
||||||
|
use serde_json::json;
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tower::ServiceExt;
|
||||||
|
|
||||||
|
use axum::http::StatusCode;
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn private_mode_blocks_anonymous_manga_list(pool: PgPool) {
|
||||||
|
let h = common::harness_with_private_mode(pool);
|
||||||
|
let resp = h.app.oneshot(common::get("/api/v1/mangas")).await.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn private_mode_blocks_anonymous_files(pool: PgPool) {
|
||||||
|
let h = common::harness_with_private_mode(pool);
|
||||||
|
// The path doesn't have to exist — the guard runs before routing,
|
||||||
|
// so the response is 401 (not 404). That's the property the test
|
||||||
|
// is pinning: nothing leaks via crafted URLs.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get("/api/v1/files/anything.png"))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNAUTHORIZED);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn private_mode_allows_session_cookie_read(pool: PgPool) {
|
||||||
|
// Register through a non-private harness sharing the same DB pool
|
||||||
|
// so the session row exists. Then exercise the gate using a fresh
|
||||||
|
// private-mode harness against the same DB.
|
||||||
|
let public = common::harness(pool.clone());
|
||||||
|
let (_, cookie) = common::register_user(&public.app).await;
|
||||||
|
|
||||||
|
let private = common::harness_with_private_mode(pool);
|
||||||
|
let resp = private
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_cookie("/api/v1/mangas", &cookie))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn private_mode_allows_bearer_token_read(pool: PgPool) {
|
||||||
|
let public = common::harness(pool.clone());
|
||||||
|
let (_, cookie) = common::register_user(&public.app).await;
|
||||||
|
|
||||||
|
let resp = public
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
"/api/v1/auth/tokens",
|
||||||
|
json!({ "name": "private-mode-bot" }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let bearer = body["bearer"].as_str().unwrap().to_string();
|
||||||
|
|
||||||
|
let private = common::harness_with_private_mode(pool);
|
||||||
|
let resp = private
|
||||||
|
.app
|
||||||
|
.oneshot(common::get_with_bearer("/api/v1/mangas", &bearer))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn private_mode_allows_login_endpoint_anonymous(pool: PgPool) {
|
||||||
|
// Seed a user via the public harness so login has credentials to
|
||||||
|
// verify against.
|
||||||
|
let public = common::harness(pool.clone());
|
||||||
|
let _ = public
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/auth/register",
|
||||||
|
json!({ "username": "alice", "password": "hunter2hunter2" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let private = common::harness_with_private_mode(pool);
|
||||||
|
let resp = private
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/auth/login",
|
||||||
|
json!({ "username": "alice", "password": "hunter2hunter2" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
// Reaches the login handler and succeeds — *not* 401 from the
|
||||||
|
// gate. That's the property we're pinning.
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn private_mode_allows_health_and_config_anonymous(pool: PgPool) {
|
||||||
|
let h = common::harness_with_private_mode(pool);
|
||||||
|
|
||||||
|
let r = h
|
||||||
|
.app
|
||||||
|
.clone()
|
||||||
|
.oneshot(common::get("/api/v1/health"))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(r.status(), StatusCode::OK);
|
||||||
|
|
||||||
|
let r = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get("/api/v1/auth/config"))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(r.status(), StatusCode::OK);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn private_mode_blocks_register_even_when_self_register_enabled(pool: PgPool) {
|
||||||
|
// harness_with_private_mode keeps `allow_self_register=true` (the
|
||||||
|
// default) — private mode is supposed to force-block register
|
||||||
|
// regardless. That's what this test pins.
|
||||||
|
let h = common::harness_with_private_mode(pool);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json(
|
||||||
|
"/api/v1/auth/register",
|
||||||
|
json!({ "username": "alice", "password": "hunter2hunter2" }),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::FORBIDDEN);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "forbidden");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn auth_config_reports_private_mode_and_effective_self_register(pool: PgPool) {
|
||||||
|
let h = common::harness_with_private_mode(pool);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get("/api/v1/auth/config"))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["private_mode"], true);
|
||||||
|
// Effective value: `allow_self_register && !private_mode` is false
|
||||||
|
// here even though the raw `allow_self_register` is true.
|
||||||
|
assert_eq!(body["self_register_enabled"], false);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn public_mode_does_not_gate_anonymous_reads(pool: PgPool) {
|
||||||
|
// Regression guard: with private_mode off (the default), the gate
|
||||||
|
// must be a no-op so existing public deployments stay public.
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let resp = h.app.oneshot(common::get("/api/v1/mangas")).await.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn public_mode_reports_private_mode_false(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get("/api/v1/auth/config"))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["private_mode"], false);
|
||||||
|
assert_eq!(body["self_register_enabled"], true);
|
||||||
|
}
|
||||||
@@ -59,6 +59,31 @@ async fn reattach_same_tag_is_idempotent_and_returns_200(pool: PgPool) {
|
|||||||
assert_eq!(second.status(), StatusCode::OK);
|
assert_eq!(second.status(), StatusCode::OK);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Tag names over 64 chars are rejected at the handler boundary. The
|
||||||
|
/// repo enforces the same cap, but doing it at the handler keeps the
|
||||||
|
/// envelope consistent with the other validation paths
|
||||||
|
/// (username, collection name, etc.).
|
||||||
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
async fn attach_rejects_tag_name_over_64_chars(pool: PgPool) {
|
||||||
|
let h = common::harness(pool);
|
||||||
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
|
|
||||||
|
let long_name: String = "x".repeat(65);
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::post_json_with_cookie(
|
||||||
|
&format!("/api/v1/mangas/{manga_id}/tags"),
|
||||||
|
json!({ "name": long_name }),
|
||||||
|
&cookie,
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::UNPROCESSABLE_ENTITY);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
assert_eq!(body["error"]["code"], "validation_failed");
|
||||||
|
}
|
||||||
|
|
||||||
#[sqlx::test(migrations = "./migrations")]
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
async fn tag_names_dedup_case_insensitively(pool: PgPool) {
|
async fn tag_names_dedup_case_insensitively(pool: PgPool) {
|
||||||
let h = common::harness(pool);
|
let h = common::harness(pool);
|
||||||
|
|||||||
@@ -139,13 +139,17 @@ async fn files_endpoint_streams_in_multiple_frames(pool: PgPool) {
|
|||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(resp.status(), StatusCode::CREATED);
|
assert_eq!(resp.status(), StatusCode::CREATED);
|
||||||
|
let chapter_id = common::body_json(resp).await["id"]
|
||||||
|
.as_str()
|
||||||
|
.unwrap()
|
||||||
|
.to_string();
|
||||||
|
|
||||||
// Fetch the page back via the streaming files endpoint.
|
// Fetch the page back via the streaming files endpoint.
|
||||||
let pages = h
|
let pages = h
|
||||||
.app
|
.app
|
||||||
.clone()
|
.clone()
|
||||||
.oneshot(common::get(&format!(
|
.oneshot(common::get(&format!(
|
||||||
"/api/v1/mangas/{manga_id}/chapters/1/pages"
|
"/api/v1/mangas/{manga_id}/chapters/{chapter_id}/pages"
|
||||||
)))
|
)))
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@@ -317,8 +321,12 @@ async fn create_chapter_rejects_renamed_non_image_page(pool: PgPool) {
|
|||||||
assert_eq!(body["error"]["code"], "unsupported_media_type");
|
assert_eq!(body["error"]["code"], "unsupported_media_type");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Multiple chapters can share the same number — different
|
||||||
|
/// scanlations, re-uploads, translator notes. As of migration 0013,
|
||||||
|
/// (manga_id, number) is not unique and each upload gets its own
|
||||||
|
/// chapter id.
|
||||||
#[sqlx::test(migrations = "./migrations")]
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
async fn create_chapter_returns_409_on_duplicate_number(pool: PgPool) {
|
async fn create_chapter_allows_duplicate_numbers_as_separate_chapters(pool: PgPool) {
|
||||||
let h = common::harness(pool);
|
let h = common::harness(pool);
|
||||||
let (_, cookie) = common::register_user(&h.app).await;
|
let (_, cookie) = common::register_user(&h.app).await;
|
||||||
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
let manga_id = common::seed_manga_via_api(&h.app, &cookie, "Berserk").await;
|
||||||
@@ -334,10 +342,27 @@ async fn create_chapter_returns_409_on_duplicate_number(pool: PgPool) {
|
|||||||
};
|
};
|
||||||
let first = h.app.clone().oneshot(make()).await.unwrap();
|
let first = h.app.clone().oneshot(make()).await.unwrap();
|
||||||
assert_eq!(first.status(), StatusCode::CREATED);
|
assert_eq!(first.status(), StatusCode::CREATED);
|
||||||
let second = h.app.oneshot(make()).await.unwrap();
|
let first_id = common::body_json(first).await["id"].as_str().unwrap().to_string();
|
||||||
assert_eq!(second.status(), StatusCode::CONFLICT);
|
|
||||||
let body = common::body_json(second).await;
|
let second = h.app.clone().oneshot(make()).await.unwrap();
|
||||||
assert_eq!(body["error"]["code"], "conflict");
|
assert_eq!(second.status(), StatusCode::CREATED);
|
||||||
|
let second_id = common::body_json(second).await["id"].as_str().unwrap().to_string();
|
||||||
|
|
||||||
|
assert_ne!(first_id, second_id, "each upload gets a distinct chapter id");
|
||||||
|
|
||||||
|
// List endpoint surfaces both rows.
|
||||||
|
let resp = h
|
||||||
|
.app
|
||||||
|
.oneshot(common::get(&format!("/api/v1/mangas/{manga_id}/chapters")))
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(resp.status(), StatusCode::OK);
|
||||||
|
let body = common::body_json(resp).await;
|
||||||
|
let items = body["items"].as_array().unwrap();
|
||||||
|
assert_eq!(items.len(), 2, "both Ch.1 uploads listed separately");
|
||||||
|
for item in items {
|
||||||
|
assert_eq!(item["number"], 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[sqlx::test(migrations = "./migrations")]
|
#[sqlx::test(migrations = "./migrations")]
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user