diff --git a/.env.example b/.env.example index b278536..406f54d 100644 --- a/.env.example +++ b/.env.example @@ -90,23 +90,36 @@ CRAWLER_CHROMIUM_BINARY= # CRAWLER_TOR_CONTROL_URL= (empty) below — the tor service can stay # running, it just won't be used. # +# Going through TOR adds latency to every fetch; image downloads in +# particular slow noticeably. The win is on sites that rate-limit or +# fingerprint by exit IP — NEWNYM recirculation makes a fresh exit +# cheap to reach for. +# # CRAWLER_PROXY: SOCKS5(h) URL. Use `socks5h://` (not `socks5://`) so # DNS resolution also goes through TOR, avoiding leaks via the host's # resolver. Leave unset to talk to the upstream directly. CRAWLER_PROXY=socks5h://tor:9050 # Control-port URL for SIGNAL NEWNYM ("get a fresh circuit"). Triggered # automatically on bad pages (broken-page body, missing #logo) and on -# the Unauthenticated session probe outcome. Leave unset to disable the -# recircuit feature (the SOCKS proxy still works). +# the Unauthenticated session probe outcome. Leave unset to disable +# the recircuit feature (the SOCKS proxy still works). CRAWLER_TOR_CONTROL_URL=tcp://tor:9051 -# Auth — cookie file (preferred) or password (HashedControlPassword). -# Cookie wins when both are set. The bundled torrc enables cookie auth -# and shares /var/lib/tor between containers via a named volume. -CRAWLER_TOR_CONTROL_COOKIE_PATH=/var/lib/tor/control_auth_cookie -# CRAWLER_TOR_CONTROL_PASSWORD= # Max NEWNYM-and-retry cycles per recircuit-eligible failure. Default 3. CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS=3 +# ----- TOR control-port password ----- +# Shared between the bundled dockurr/tor service (which hashes it into +# its HashedControlPassword) and the backend's +# CRAWLER_TOR_CONTROL_PASSWORD. REQUIRED — docker-compose.yml fails +# fast if absent. Generate a strong random string; rotate by setting +# a new value and restarting both `tor` and `backend`. 
+# +# Operators running their own non-dockurr tor daemon with cookie-file +# auth can set CRAWLER_TOR_CONTROL_COOKIE_PATH on the backend (the +# TorController prefers cookie when both are present), but the bundled +# compose file no longer sets it and still requires this var. +TOR_CONTROL_PASSWORD=change-me-to-a-strong-random-string + # ----- Frontend ----- # The frontend container runs SvelteKit's Node adapter on :3000 and # proxies /api/* to BACKEND_URL via src/hooks.server.ts. In compose the diff --git a/docker-compose.yml b/docker-compose.yml index d3b7625..f3a5974 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -24,13 +24,34 @@ services: # can signal NEWNYM on bad pages. See tor/torrc for the daemon # config; both ports are only `expose`d (compose-internal), never # bound on the host. + # + # We bypass dockurr/tor's stock entrypoint because it binds the + # control port to localhost (unreachable from the backend + # container) and skips its own HashedControlPassword injection + # when the user's torrc declares a ControlPort. Our wrapper + # (tor/entrypoint.sh) generates the hash from $PASSWORD and execs + # tor with our torrc. Backend authenticates with the same plain + # string via CRAWLER_TOR_CONTROL_PASSWORD. image: dockurr/tor:latest + entrypoint: ["/bin/sh", "/usr/local/bin/mangalord-entrypoint.sh"] + environment: + PASSWORD: ${TOR_CONTROL_PASSWORD:?TOR_CONTROL_PASSWORD must be set in .env} volumes: - ./tor/torrc:/etc/tor/torrc:ro - - tor-data:/var/lib/tor + - ./tor/entrypoint.sh:/usr/local/bin/mangalord-entrypoint.sh:ro expose: - "9050" - "9051" + # Wait for both control + SOCKS ports to listen before downstream + # services start. dockurr/tor's main process spawns before tor + # itself is bound, so `service_started` alone races the first + # NEWNYM call. 
+ healthcheck: + test: ["CMD-SHELL", "nc -z 127.0.0.1 9050 && nc -z 127.0.0.1 9051"] + interval: 5s + timeout: 5s + retries: 20 + start_period: 30s restart: unless-stopped backend: @@ -39,7 +60,7 @@ services: postgres: condition: service_healthy tor: - condition: service_started + condition: service_healthy environment: DATABASE_URL: postgres://${POSTGRES_USER:-mangalord}:${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set in .env}@postgres:5432/${POSTGRES_DB:-mangalord} BIND_ADDRESS: 0.0.0.0:8080 @@ -61,18 +82,17 @@ services: # so the image actually contains the binary. CRAWLER_CHROMIUM_BINARY: ${CRAWLER_CHROMIUM_BINARY:-} # TOR proxy + NEWNYM recircuit (see .env.example for details). - # Defaults assume the bundled `tor` service above; override to - # empty strings to disable. + # Defaults assume the bundled `tor` service above; override + # CRAWLER_PROXY= and CRAWLER_TOR_CONTROL_URL= (both empty) in + # .env to disable. CRAWLER_TOR_CONTROL_PASSWORD MUST match the + # tor service's PASSWORD (both wired to the same TOR_CONTROL_PASSWORD + # .env var below). CRAWLER_PROXY: ${CRAWLER_PROXY-socks5h://tor:9050} CRAWLER_TOR_CONTROL_URL: ${CRAWLER_TOR_CONTROL_URL-tcp://tor:9051} - CRAWLER_TOR_CONTROL_COOKIE_PATH: ${CRAWLER_TOR_CONTROL_COOKIE_PATH-/var/lib/tor/control_auth_cookie} - CRAWLER_TOR_CONTROL_PASSWORD: ${CRAWLER_TOR_CONTROL_PASSWORD:-} + CRAWLER_TOR_CONTROL_PASSWORD: ${TOR_CONTROL_PASSWORD:?TOR_CONTROL_PASSWORD must be set in .env} CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS: ${CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS:-3} volumes: - storage-data:/var/lib/mangalord/storage - # Read the TOR control-auth cookie from the shared named volume. - # Read-only on the backend side; the tor service is the writer. - - tor-data:/var/lib/tor:ro # No host port mapping in the default setup — the frontend proxies # /api/* through its hooks.server.ts. 
Expose :8080 only if you want # to hit the API directly from the host (e.g., bot scripts during @@ -94,4 +114,3 @@ services: volumes: postgres-data: storage-data: - tor-data: diff --git a/tor/entrypoint.sh b/tor/entrypoint.sh new file mode 100755 index 0000000..3424c34 --- /dev/null +++ b/tor/entrypoint.sh @@ -0,0 +1,40 @@ +#!/bin/sh +# Mangalord wrapper around dockurr/tor's tor binary. +# +# We bypass the image's stock entrypoint for two reasons: +# 1. It generates a `ControlPort 9051` line that binds to localhost +# only (tor's default), but our backend lives in a separate +# container and needs to reach 0.0.0.0:9051. +# 2. It then *skips* writing HashedControlPassword whenever the +# user's torrc declares a ControlPort, so we can't both bind to +# 0.0.0.0 and benefit from its auto-hashing — it's one or the +# other. Doing the hashing ourselves is simpler than threading +# around its logic. +# +# This wrapper hashes $PASSWORD with `tor --hash-password`, appends a +# `HashedControlPassword` line to a writable copy of /etc/tor/torrc, +# then execs tor. The wrapper starts as root (image default); tor +# itself drops to the `tor` user via the `User tor` directive in +# torrc once it has bound 9050/9051. + +set -eu + +if [ -z "${PASSWORD:-}" ]; then + echo "ERROR: PASSWORD env must be set (the plain string the backend will" >&2 + echo " send as CRAWLER_TOR_CONTROL_PASSWORD)" >&2 + exit 1 +fi + +# `tor --hash-password` prints the hash on the last line of stdout +# (preceded by initialization noise). +HASH=$(tor --hash-password "$PASSWORD" 2>/dev/null | tail -n1) +if [ -z "$HASH" ]; then + echo "ERROR: 'tor --hash-password' produced no output" >&2 + exit 1 +fi + +# /etc/tor/torrc is bind-mounted read-only, so copy + append. 
+cp /etc/tor/torrc /tmp/torrc +printf '\n# Injected by mangalord-entrypoint.sh from $PASSWORD env.\nHashedControlPassword %s\n' "$HASH" >> /tmp/torrc + +exec tor -f /tmp/torrc diff --git a/tor/torrc b/tor/torrc index ffd47b9..4f800cb 100644 --- a/tor/torrc +++ b/tor/torrc @@ -12,20 +12,26 @@ # to age out. SOCKSPort 0.0.0.0:9050 IsolateDestAddr IsolateDestPort -# Control port for SIGNAL NEWNYM. Cookie auth means no secret to manage -# in .env — the cookie file is created by the daemon at startup and -# shared with the backend container via the named `tor-data` volume. -# CookieAuthFileGroupReadable lets the backend's gid read it without -# having to run as root. +# Control port for SIGNAL NEWNYM. Authentication is not configured +# here: our wrapper (tor/entrypoint.sh) hashes the PASSWORD env var +# (see docker-compose.yml `tor.environment.PASSWORD`) and appends a +# `HashedControlPassword` line to a writable copy of this file before +# exec'ing tor. We just need to declare the port itself here. ControlPort 0.0.0.0:9051 -CookieAuthentication 1 -CookieAuthFile /var/lib/tor/control_auth_cookie -CookieAuthFileGroupReadable 1 -# Keep circuits short-lived so NEWNYM actually changes our visible -# exit soon. Default is 600s (10 min); 60s is short enough that retries -# after a brief site rate-limit window almost always see a new IP. -MaxCircuitDirtiness 60 +# Keep circuits dirty for a while so a single chapter (which serial- +# fetches all its images through the same SOCKS endpoint) finishes on +# one circuit rather than mid-circuit-rotating in a way that looks like +# anti-bot evasion to the target. NEWNYM still forces a fresh circuit +# immediately when we want one — this is just the idle-rotation knob. +MaxCircuitDirtiness 600 + +# Drop privileges to the image's `tor` user after binding ports. +# Required because /var/lib/tor (the image's DataDirectory volume) +# is owned by tor:tor and tor refuses to use a data dir it doesn't +# own. 
Our entrypoint runs as root only so it can call +# `tor --hash-password` and write /tmp/torrc. +User tor # Data + logs. DataDirectory /var/lib/tor