feat(crawler): plumb TorController through FetchContext and pipelines

Adds CRAWLER_TOR_CONTROL_URL / _PASSWORD / _COOKIE_PATH /
_RECIRCUIT_MAX_ATTEMPTS to CrawlerConfig and to bin/crawler.rs's
env reads. Constructs an Option<Arc<TorController>> at daemon /
CLI startup and threads it through FetchContext,
pipeline::run_metadata_pass, and content::sync_chapter_content as
Option<&TorController>.

Pure scaffolding — the controller isn't used yet; behavior is
unchanged. Next commit wires the retry hooks and session-probe
recircuit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-31 18:43:15 +02:00
parent d6d84dedcb
commit 8557e432a2
6 changed files with 82 additions and 0 deletions

View File

@@ -91,6 +91,7 @@ pub async fn sync_chapter_content(
force_refetch: bool,
allowlist: &DownloadAllowlist,
max_image_bytes: usize,
_tor: Option<&crate::crawler::tor::TorController>,
) -> anyhow::Result<SyncOutcome> {
// Skip if already fetched, unless caller explicitly forces.
if !force_refetch {

View File

@@ -103,6 +103,7 @@ pub async fn run_metadata_pass(
skip_chapters: bool,
allowlist: &DownloadAllowlist,
max_image_bytes: usize,
tor: Option<&crate::crawler::tor::TorController>,
) -> anyhow::Result<MetadataStats> {
let lease = browser_manager
.acquire()
@@ -121,6 +122,7 @@ pub async fn run_metadata_pass(
let ctx = FetchContext {
browser: browser_ref,
rate,
tor,
};
let source_id = source.id();

View File

@@ -67,6 +67,10 @@ pub struct SourceChapter {
pub struct FetchContext<'a> {
pub browser: &'a Browser,
pub rate: &'a crate::crawler::rate_limit::HostRateLimiters,
/// Optional TOR control-port client. When `Some`, retry helpers
/// signal `NEWNYM` between transient-page attempts so the next try
/// draws a fresh exit. `None` keeps pre-TOR behavior.
pub tor: Option<&'a crate::crawler::tor::TorController>,
}
/// Lazy iterator over discovered manga refs. The caller drives the