From 8557e432a22ae6419441aed988d5740501066c03 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sun, 31 May 2026 18:43:15 +0200 Subject: [PATCH] feat(crawler): plumb TorController through FetchContext and pipelines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds CRAWLER_TOR_CONTROL_URL / _PASSWORD / _COOKIE_PATH to CrawlerConfig and to bin/crawler.rs's env reads, and CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS to CrawlerConfig. Constructs an Option<Arc<TorController>> at daemon / CLI startup and threads it through FetchContext, pipeline::run_metadata_pass, and content::sync_chapter_content as Option<&TorController>. Pure scaffolding — the controller isn't used yet; behavior is unchanged. Next commit wires the retry hooks and session-probe recircuit. Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/src/app.rs | 17 +++++++++++++++++ backend/src/bin/crawler.rs | 28 ++++++++++++++++++++++++++++ backend/src/config.rs | 30 ++++++++++++++++++++++++++++++ backend/src/crawler/content.rs | 1 + backend/src/crawler/pipeline.rs | 2 ++ backend/src/crawler/source.rs | 4 ++++ 6 files changed, 82 insertions(+) diff --git a/backend/src/app.rs b/backend/src/app.rs index f667586..7132674 100644 --- a/backend/src/app.rs +++ b/backend/src/app.rs @@ -157,6 +157,17 @@ async fn spawn_crawler_daemon( let session_expired = Arc::new(AtomicBool::new(false)); + let tor = crate::crawler::tor::TorController::from_parts( + cfg.tor_control_url.as_deref(), + cfg.tor_control_password.as_deref(), + cfg.tor_control_cookie_path.as_deref(), + ) + .context("build TorController from CRAWLER_TOR_CONTROL_* env")? 
+ .map(Arc::new); + if let Some(t) = &tor { + tracing::info!(?t, "TOR control configured; transient pages will trigger NEWNYM"); + } + let metadata_pass: Option> = cfg.start_url.as_ref().map(|url| { let m: Arc = Arc::new(RealMetadataPass { browser_manager: Arc::clone(&browser_manager), @@ -167,6 +178,7 @@ async fn spawn_crawler_daemon( start_url: url.clone(), download_allowlist: cfg.download_allowlist.clone(), max_image_bytes: cfg.max_image_bytes, + tor: tor.as_ref().map(Arc::clone), }); m }); @@ -179,6 +191,7 @@ async fn spawn_crawler_daemon( rate: Arc::clone(&rate), download_allowlist: cfg.download_allowlist.clone(), max_image_bytes: cfg.max_image_bytes, + tor: tor.as_ref().map(Arc::clone), }); // Shared cancellation: daemon shutdown cancels the BrowserManager's @@ -232,6 +245,7 @@ struct RealMetadataPass { start_url: String, download_allowlist: DownloadAllowlist, max_image_bytes: usize, + tor: Option>, } #[async_trait] @@ -248,6 +262,7 @@ impl MetadataPass for RealMetadataPass { false, &self.download_allowlist, self.max_image_bytes, + self.tor.as_deref(), ) .await; if let Err(e) = &result { @@ -267,6 +282,7 @@ struct RealChapterDispatcher { rate: Arc, download_allowlist: DownloadAllowlist, max_image_bytes: usize, + tor: Option>, } #[async_trait] @@ -298,6 +314,7 @@ impl ChapterDispatcher for RealChapterDispatcher { false, &self.download_allowlist, self.max_image_bytes, + self.tor.as_deref(), ) .await; drop(lease); diff --git a/backend/src/bin/crawler.rs b/backend/src/bin/crawler.rs index 453a1d8..f47b4f2 100644 --- a/backend/src/bin/crawler.rs +++ b/backend/src/bin/crawler.rs @@ -78,6 +78,16 @@ async fn main() -> anyhow::Result<()> { let proxy_url = std::env::var("CRAWLER_PROXY") .ok() .filter(|s| !s.trim().is_empty()); + let tor_control_url = std::env::var("CRAWLER_TOR_CONTROL_URL") + .ok() + .filter(|s| !s.trim().is_empty()); + let tor_control_password = std::env::var("CRAWLER_TOR_CONTROL_PASSWORD") + .ok() + .filter(|s| !s.trim().is_empty()); + let 
tor_control_cookie_path = std::env::var("CRAWLER_TOR_CONTROL_COOKIE_PATH") + .ok() + .filter(|s| !s.trim().is_empty()) + .map(std::path::PathBuf::from); let keep_browser_open = env_bool("CRAWLER_KEEP_BROWSER_OPEN", false); let db = PgPoolOptions::new() @@ -173,6 +183,17 @@ async fn main() -> anyhow::Result<()> { let session_ready = phpsessid.is_some() && cookie_domain.is_some(); let manager = BrowserManager::new(options, Duration::ZERO, on_launch); + let tor = mangalord::crawler::tor::TorController::from_parts( + tor_control_url.as_deref(), + tor_control_password.as_deref(), + tor_control_cookie_path.as_deref(), + ) + .context("build TorController from CRAWLER_TOR_CONTROL_* env")? + .map(Arc::new); + if let Some(t) = &tor { + tracing::info!(?t, "TOR control configured"); + } + let result = run( Arc::clone(&manager), &db, @@ -187,6 +208,7 @@ async fn main() -> anyhow::Result<()> { skip_chapter_content || !session_ready, chapter_workers, force_refetch_chapters, + tor.clone(), ) .await; @@ -216,6 +238,7 @@ async fn run( skip_chapter_content: bool, chapter_workers: usize, force_refetch_chapters: bool, + tor: Option>, ) -> anyhow::Result<()> { let mut rate = HostRateLimiters::new(Duration::from_millis(rate_ms)); if let Some(host) = cdn_host { @@ -267,6 +290,7 @@ async fn run( skip_chapters, allowlist.as_ref(), max_image_bytes, + tor.as_deref(), ) .await?; tracing::info!(?stats, "metadata pass complete"); @@ -283,6 +307,7 @@ async fn run( force_refetch_chapters, Arc::clone(&allowlist), max_image_bytes, + tor.clone(), ) .await?; } @@ -308,6 +333,7 @@ async fn sync_bookmarked_chapter_content( force_refetch: bool, allowlist: Arc, max_image_bytes: usize, + tor: Option>, ) -> anyhow::Result<()> { let pending: Vec<(Uuid, Uuid, String)> = sqlx::query_as( r#" @@ -345,6 +371,7 @@ async fn sync_bookmarked_chapter_content( let rate = Arc::clone(&rate); let manager = Arc::clone(&manager); let allowlist = Arc::clone(&allowlist); + let tor = tor.clone(); let stats = &stats; async move 
{ if session_expired.load(std::sync::atomic::Ordering::Relaxed) { @@ -371,6 +398,7 @@ async fn sync_bookmarked_chapter_content( force_refetch, allowlist.as_ref(), max_image_bytes, + tor.as_deref(), ) .await; drop(lease); diff --git a/backend/src/config.rs b/backend/src/config.rs index e983d03..9f68a83 100644 --- a/backend/src/config.rs +++ b/backend/src/config.rs @@ -97,6 +97,20 @@ pub struct CrawlerConfig { pub cookie_domain: Option, pub user_agent: Option, pub proxy: Option, + /// `tcp://host:port`, `host:port`, or bare `host` (default port + /// 9051). When `None`, TOR-recircuit-on-transient is disabled and + /// the crawler behaves identically to pre-TOR releases. + pub tor_control_url: Option, + /// HashedControlPassword auth. Used only when + /// `tor_control_cookie_path` is `None`. + pub tor_control_password: Option, + /// Cookie-file auth path (e.g. + /// `/var/lib/tor/control_auth_cookie`). Takes precedence over + /// password when both are set. + pub tor_control_cookie_path: Option, + /// Maximum NEWNYM-and-retry cycles per recircuit-eligible failure. + /// Defaults to 3. + pub tor_recircuit_max_attempts: u32, pub browser: LaunchOptions, /// Hosts the crawler is allowed to download images / covers from. 
/// Always seeded with the host of `start_url` and (when set) the @@ -124,6 +138,10 @@ impl Default for CrawlerConfig { cookie_domain: None, user_agent: None, proxy: None, + tor_control_url: None, + tor_control_password: None, + tor_control_cookie_path: None, + tor_recircuit_max_attempts: 3, browser: LaunchOptions::headless(), download_allowlist: DownloadAllowlist::new(), max_image_bytes: DEFAULT_MAX_IMAGE_BYTES, @@ -234,6 +252,18 @@ impl CrawlerConfig { proxy: std::env::var("CRAWLER_PROXY") .ok() .filter(|s| !s.trim().is_empty()), + tor_control_url: std::env::var("CRAWLER_TOR_CONTROL_URL") + .ok() + .filter(|s| !s.trim().is_empty()), + tor_control_password: std::env::var("CRAWLER_TOR_CONTROL_PASSWORD") + .ok() + .filter(|s| !s.trim().is_empty()), + tor_control_cookie_path: std::env::var("CRAWLER_TOR_CONTROL_COOKIE_PATH") + .ok() + .filter(|s| !s.trim().is_empty()) + .map(PathBuf::from), + tor_recircuit_max_attempts: env_u64("CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS", 3) + .max(1) as u32, browser: LaunchOptions::from_env(), download_allowlist, max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES), diff --git a/backend/src/crawler/content.rs b/backend/src/crawler/content.rs index 7843561..7cdb0f8 100644 --- a/backend/src/crawler/content.rs +++ b/backend/src/crawler/content.rs @@ -91,6 +91,7 @@ pub async fn sync_chapter_content( force_refetch: bool, allowlist: &DownloadAllowlist, max_image_bytes: usize, + _tor: Option<&crate::crawler::tor::TorController>, ) -> anyhow::Result { // Skip if already fetched, unless caller explicitly forces. 
if !force_refetch { diff --git a/backend/src/crawler/pipeline.rs b/backend/src/crawler/pipeline.rs index 320c49e..9796357 100644 --- a/backend/src/crawler/pipeline.rs +++ b/backend/src/crawler/pipeline.rs @@ -103,6 +103,7 @@ pub async fn run_metadata_pass( skip_chapters: bool, allowlist: &DownloadAllowlist, max_image_bytes: usize, + tor: Option<&crate::crawler::tor::TorController>, ) -> anyhow::Result { let lease = browser_manager .acquire() @@ -121,6 +122,7 @@ pub async fn run_metadata_pass( let ctx = FetchContext { browser: browser_ref, rate, + tor, }; let source_id = source.id(); diff --git a/backend/src/crawler/source.rs b/backend/src/crawler/source.rs index edbbf3a..c89528d 100644 --- a/backend/src/crawler/source.rs +++ b/backend/src/crawler/source.rs @@ -67,6 +67,10 @@ pub struct SourceChapter { pub struct FetchContext<'a> { pub browser: &'a Browser, pub rate: &'a crate::crawler::rate_limit::HostRateLimiters, + /// Optional TOR control-port client. When `Some`, retry helpers + /// signal `NEWNYM` between transient-page attempts so the next try + /// draws a fresh exit. `None` keeps pre-TOR behavior. + pub tor: Option<&'a crate::crawler::tor::TorController>, } /// Lazy iterator over discovered manga refs. The caller drives the