feat(deploy): dockurr/tor service + torrc; wire crawler to use it by default
Adds a `tor` service to the compose stack (dockurr/tor) with a torrc tuned for the crawler — SOCKS5 on 9050 with IsolateDestAddr + IsolateDestPort so NEWNYM picks up promptly, control port on 9051 with cookie auth, MaxCircuitDirtiness 60. Backend defaults CRAWLER_PROXY → socks5h://tor:9050 and CRAWLER_TOR_CONTROL_URL → tcp://tor:9051 so TOR + recircuit are on out-of-the-box. Operators can override both to empty in .env to opt out without removing the service. The tor-data named volume is mounted ro on the backend so it can read /var/lib/tor/control_auth_cookie; CookieAuthFileGroupReadable handles the permissions. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
24
.env.example
24
.env.example
@@ -83,6 +83,30 @@ CRAWLER_MAX_IMAGE_BYTES=33554432
|
|||||||
# the image actually contains the binary.
|
# the image actually contains the binary.
|
||||||
CRAWLER_CHROMIUM_BINARY=
|
CRAWLER_CHROMIUM_BINARY=
|
||||||
|
|
||||||
|
# ----- Crawler TOR proxy + recircuit -----
|
||||||
|
# The compose stack ships a `tor` service (dockurr/tor) and defaults
|
||||||
|
# CRAWLER_PROXY to it, so by default all crawler traffic exits via the
|
||||||
|
# TOR network. To opt out, set CRAWLER_PROXY= (empty) AND
|
||||||
|
# CRAWLER_TOR_CONTROL_URL= (empty) below — the tor service can stay
|
||||||
|
# running, it just won't be used.
|
||||||
|
#
|
||||||
|
# CRAWLER_PROXY: SOCKS5(h) URL. Use `socks5h://` (not `socks5://`) so
|
||||||
|
# DNS resolution also goes through TOR, avoiding leaks via the host's
|
||||||
|
# resolver. Set to empty (CRAWLER_PROXY=) to talk to the upstream directly; leaving it unset falls back to the compose default (the bundled tor service).
|
||||||
|
CRAWLER_PROXY=socks5h://tor:9050
|
||||||
|
# Control-port URL for SIGNAL NEWNYM ("get a fresh circuit"). Triggered
|
||||||
|
# automatically on bad pages (broken-page body, missing #logo) and on
|
||||||
|
# the Unauthenticated session probe outcome. Set to empty (not merely unset, which falls back to the compose default) to disable the
|
||||||
|
# recircuit feature (the SOCKS proxy still works).
|
||||||
|
CRAWLER_TOR_CONTROL_URL=tcp://tor:9051
|
||||||
|
# Auth — cookie file (preferred) or password (HashedControlPassword).
|
||||||
|
# Cookie wins when both are set. The bundled torrc enables cookie auth
|
||||||
|
# and shares /var/lib/tor between containers via a named volume.
|
||||||
|
CRAWLER_TOR_CONTROL_COOKIE_PATH=/var/lib/tor/control_auth_cookie
|
||||||
|
# CRAWLER_TOR_CONTROL_PASSWORD=
|
||||||
|
# Max NEWNYM-and-retry cycles per recircuit-eligible failure. Default 3.
|
||||||
|
CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS=3
|
||||||
|
|
||||||
# ----- Frontend -----
|
# ----- Frontend -----
|
||||||
# The frontend container runs SvelteKit's Node adapter on :3000 and
|
# The frontend container runs SvelteKit's Node adapter on :3000 and
|
||||||
# proxies /api/* to BACKEND_URL via src/hooks.server.ts. In compose the
|
# proxies /api/* to BACKEND_URL via src/hooks.server.ts. In compose the
|
||||||
|
|||||||
@@ -19,11 +19,27 @@ services:
|
|||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 10
|
retries: 10
|
||||||
|
|
||||||
|
tor:
|
||||||
|
# SOCKS5 proxy for the crawler, plus a control port so the backend
|
||||||
|
# can signal NEWNYM on bad pages. See tor/torrc for the daemon
|
||||||
|
# config; both ports are only `expose`d (compose-internal), never
|
||||||
|
# bound on the host.
|
||||||
|
image: dockurr/tor:latest
|
||||||
|
volumes:
|
||||||
|
- ./tor/torrc:/etc/tor/torrc:ro
|
||||||
|
- tor-data:/var/lib/tor
|
||||||
|
expose:
|
||||||
|
- "9050"
|
||||||
|
- "9051"
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
backend:
|
backend:
|
||||||
build: ./backend
|
build: ./backend
|
||||||
depends_on:
|
depends_on:
|
||||||
postgres:
|
postgres:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
tor:
|
||||||
|
condition: service_started
|
||||||
environment:
|
environment:
|
||||||
DATABASE_URL: postgres://${POSTGRES_USER:-mangalord}:${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set in .env}@postgres:5432/${POSTGRES_DB:-mangalord}
|
DATABASE_URL: postgres://${POSTGRES_USER:-mangalord}:${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set in .env}@postgres:5432/${POSTGRES_DB:-mangalord}
|
||||||
BIND_ADDRESS: 0.0.0.0:8080
|
BIND_ADDRESS: 0.0.0.0:8080
|
||||||
@@ -44,8 +60,19 @@ services:
|
|||||||
# arm64 deployments. Pair with `--build-arg INSTALL_CHROMIUM=true`
|
# arm64 deployments. Pair with `--build-arg INSTALL_CHROMIUM=true`
|
||||||
# so the image actually contains the binary.
|
# so the image actually contains the binary.
|
||||||
CRAWLER_CHROMIUM_BINARY: ${CRAWLER_CHROMIUM_BINARY:-}
|
CRAWLER_CHROMIUM_BINARY: ${CRAWLER_CHROMIUM_BINARY:-}
|
||||||
|
# TOR proxy + NEWNYM recircuit (see .env.example for details).
|
||||||
|
# Defaults assume the bundled `tor` service above; override to
|
||||||
|
# empty strings to disable.
|
||||||
|
CRAWLER_PROXY: ${CRAWLER_PROXY-socks5h://tor:9050}
|
||||||
|
CRAWLER_TOR_CONTROL_URL: ${CRAWLER_TOR_CONTROL_URL-tcp://tor:9051}
|
||||||
|
CRAWLER_TOR_CONTROL_COOKIE_PATH: ${CRAWLER_TOR_CONTROL_COOKIE_PATH-/var/lib/tor/control_auth_cookie}
|
||||||
|
CRAWLER_TOR_CONTROL_PASSWORD: ${CRAWLER_TOR_CONTROL_PASSWORD:-}
|
||||||
|
CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS: ${CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS:-3}
|
||||||
volumes:
|
volumes:
|
||||||
- storage-data:/var/lib/mangalord/storage
|
- storage-data:/var/lib/mangalord/storage
|
||||||
|
# Read the TOR control-auth cookie from the shared named volume.
|
||||||
|
# Read-only on the backend side; the tor service is the writer.
|
||||||
|
- tor-data:/var/lib/tor:ro
|
||||||
# No host port mapping in the default setup — the frontend proxies
|
# No host port mapping in the default setup — the frontend proxies
|
||||||
# /api/* through its hooks.server.ts. Expose :8080 only if you want
|
# /api/* through its hooks.server.ts. Expose :8080 only if you want
|
||||||
# to hit the API directly from the host (e.g., bot scripts during
|
# to hit the API directly from the host (e.g., bot scripts during
|
||||||
@@ -67,3 +94,4 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
postgres-data:
|
postgres-data:
|
||||||
storage-data:
|
storage-data:
|
||||||
|
tor-data:
|
||||||
|
|||||||
32
tor/torrc
Normal file
32
tor/torrc
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
# torrc for the Mangalord crawler.
|
||||||
|
#
|
||||||
|
# Mounted into the dockurr/tor container at /etc/tor/torrc. The
|
||||||
|
# crawler talks to this daemon over the internal compose network only:
|
||||||
|
# `expose:` on the tor service surfaces 9050/9051 to sibling
|
||||||
|
# containers, never to the host.
|
||||||
|
|
||||||
|
# SOCKS5 proxy that reqwest and Chromium use. IsolateDestAddr +
|
||||||
|
# IsolateDestPort means each new (destination IP, port) draws a fresh
|
||||||
|
# circuit — so a SIGNAL NEWNYM picks up promptly on the next
|
||||||
|
# navigation instead of having to wait for an existing dirty circuit
|
||||||
|
# to age out.
|
||||||
|
SOCKSPort 0.0.0.0:9050 IsolateDestAddr IsolateDestPort
|
||||||
|
|
||||||
|
# Control port for SIGNAL NEWNYM. Cookie auth means no secret to manage
|
||||||
|
# in .env — the cookie file is created by the daemon at startup and
|
||||||
|
# shared with the backend container via the named `tor-data` volume.
|
||||||
|
# CookieAuthFileGroupReadable lets the backend's gid read it without
|
||||||
|
# having to run as root (this only helps if the backend process runs with the same gid as the cookie file's group).
|
||||||
|
ControlPort 0.0.0.0:9051
|
||||||
|
CookieAuthentication 1
|
||||||
|
CookieAuthFile /var/lib/tor/control_auth_cookie
|
||||||
|
CookieAuthFileGroupReadable 1
|
||||||
|
|
||||||
|
# Keep circuits short-lived so NEWNYM actually changes our visible
|
||||||
|
# exit soon. Default is 600s (10 min); 60s is short enough that retries
|
||||||
|
# after a brief site rate-limit window almost always see a new IP.
|
||||||
|
MaxCircuitDirtiness 60
|
||||||
|
|
||||||
|
# Data + logs.
|
||||||
|
DataDirectory /var/lib/tor
|
||||||
|
Log notice stdout
|
||||||
Reference in New Issue
Block a user