diff --git a/.env.example b/.env.example index 274ac73..b278536 100644 --- a/.env.example +++ b/.env.example @@ -83,6 +83,30 @@ CRAWLER_MAX_IMAGE_BYTES=33554432 # the image actually contains the binary. CRAWLER_CHROMIUM_BINARY= +# ----- Crawler TOR proxy + recircuit ----- +# The compose stack ships a `tor` service (dockurr/tor) and defaults +# CRAWLER_PROXY to it, so by default all crawler traffic exits via the +# TOR network. To opt out, set CRAWLER_PROXY= (empty) AND +# CRAWLER_TOR_CONTROL_URL= (empty) below — the tor service can stay +# running, it just won't be used. +# +# CRAWLER_PROXY: SOCKS5(h) URL. Use `socks5h://` (not `socks5://`) so +# DNS resolution also goes through TOR, avoiding leaks via the host's +# resolver. Set to empty to talk to the upstream directly. +CRAWLER_PROXY=socks5h://tor:9050 +# Control-port URL for SIGNAL NEWNYM ("get a fresh circuit"). Triggered +# automatically on bad pages (broken-page body, missing #logo) and on +# the Unauthenticated session probe outcome. Set to empty to disable the +# recircuit feature (the SOCKS proxy still works). +CRAWLER_TOR_CONTROL_URL=tcp://tor:9051 +# Auth — cookie file (preferred) or password (HashedControlPassword). +# Cookie wins when both are set. The bundled torrc enables cookie auth +# and shares /var/lib/tor between containers via a named volume. +CRAWLER_TOR_CONTROL_COOKIE_PATH=/var/lib/tor/control_auth_cookie +# CRAWLER_TOR_CONTROL_PASSWORD= +# Max NEWNYM-and-retry cycles per recircuit-eligible failure. Default 3. +CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS=3 + # ----- Frontend ----- # The frontend container runs SvelteKit's Node adapter on :3000 and # proxies /api/* to BACKEND_URL via src/hooks.server.ts. 
In compose the diff --git a/docker-compose.yml b/docker-compose.yml index a02a992..d3b7625 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,11 +19,27 @@ services: timeout: 5s retries: 10 + tor: + # SOCKS5 proxy for the crawler, plus a control port so the backend + # can signal NEWNYM on bad pages. See tor/torrc for the daemon + # config; both ports are only `expose`d (compose-internal), never + # bound on the host. + image: dockurr/tor:latest + volumes: + - ./tor/torrc:/etc/tor/torrc:ro + - tor-data:/var/lib/tor + expose: + - "9050" + - "9051" + restart: unless-stopped + backend: build: ./backend depends_on: postgres: condition: service_healthy + tor: + condition: service_started environment: DATABASE_URL: postgres://${POSTGRES_USER:-mangalord}:${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set in .env}@postgres:5432/${POSTGRES_DB:-mangalord} BIND_ADDRESS: 0.0.0.0:8080 @@ -44,8 +60,19 @@ services: # arm64 deployments. Pair with `--build-arg INSTALL_CHROMIUM=true` # so the image actually contains the binary. CRAWLER_CHROMIUM_BINARY: ${CRAWLER_CHROMIUM_BINARY:-} + # TOR proxy + NEWNYM recircuit (see .env.example for details). + # Defaults assume the bundled `tor` service above; override to + # empty strings to disable. + CRAWLER_PROXY: ${CRAWLER_PROXY-socks5h://tor:9050} + CRAWLER_TOR_CONTROL_URL: ${CRAWLER_TOR_CONTROL_URL-tcp://tor:9051} + CRAWLER_TOR_CONTROL_COOKIE_PATH: ${CRAWLER_TOR_CONTROL_COOKIE_PATH-/var/lib/tor/control_auth_cookie} + CRAWLER_TOR_CONTROL_PASSWORD: ${CRAWLER_TOR_CONTROL_PASSWORD:-} + CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS: ${CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS:-3} volumes: - storage-data:/var/lib/mangalord/storage + # Read the TOR control-auth cookie from the shared named volume. + # Read-only on the backend side; the tor service is the writer. + - tor-data:/var/lib/tor:ro # No host port mapping in the default setup — the frontend proxies # /api/* through its hooks.server.ts. 
Expose :8080 only if you want # to hit the API directly from the host (e.g., bot scripts during @@ -67,3 +94,4 @@ services: volumes: postgres-data: storage-data: + tor-data: diff --git a/tor/torrc b/tor/torrc new file mode 100644 index 0000000..ffd47b9 --- /dev/null +++ b/tor/torrc @@ -0,0 +1,32 @@ +# torrc for the Mangalord crawler. +# +# Mounted into the dockurr/tor container at /etc/tor/torrc. The +# crawler talks to this daemon over the internal compose network only: +# `expose:` on the tor service surfaces 9050/9051 to sibling +# containers, never to the host. + +# SOCKS5 proxy that reqwest and Chromium use. IsolateDestAddr + +# IsolateDestPort means each new (destination IP, port) draws a fresh +# circuit — so a SIGNAL NEWNYM picks up promptly on the next +# navigation instead of having to wait for an existing dirty circuit +# to age out. +SOCKSPort 0.0.0.0:9050 IsolateDestAddr IsolateDestPort + +# Control port for SIGNAL NEWNYM. Cookie auth means no secret to manage +# in .env — the cookie file is created by the daemon at startup and +# shared with the backend container via the named `tor-data` volume. +# CookieAuthFileGroupReadable lets the backend's gid read it without +# having to run as root. +ControlPort 0.0.0.0:9051 +CookieAuthentication 1 +CookieAuthFile /var/lib/tor/control_auth_cookie +CookieAuthFileGroupReadable 1 + +# Keep circuits short-lived so NEWNYM actually changes our visible +# exit soon. Default is 600s (10 min); 60s is short enough that retries +# after a brief site rate-limit window almost always see a new IP. +MaxCircuitDirtiness 60 + +# Data + logs. +DataDirectory /var/lib/tor +Log notice stdout