From 4e203506456c7bcde8688d10a43314166902208e Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sun, 31 May 2026 20:55:29 +0200 Subject: [PATCH] =?UTF-8?q?fix(crawler):=20translate=20socks5h://=20?= =?UTF-8?q?=E2=86=92=20socks5://=20for=20Chromium=20--proxy-server?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Chromium doesn't know the socks5h scheme (curl/reqwest convention) and bails navigations with ERR_NO_SUPPORTED_PROXIES. It does, however, send destination hostnames over SOCKS5 by default, so stripping the `h` is a pure scheme rename — remote-DNS behaviour is preserved. reqwest keeps the user's original CRAWLER_PROXY string (`socks5h://...` remains valid and meaningful for it). Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/src/app.rs | 3 +- backend/src/bin/crawler.rs | 3 +- backend/src/crawler/url_utils.rs | 50 ++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 2 deletions(-) diff --git a/backend/src/app.rs b/backend/src/app.rs index f83e42b..8288ca5 100644 --- a/backend/src/app.rs +++ b/backend/src/app.rs @@ -140,7 +140,8 @@ async fn spawn_crawler_daemon( // authenticated without operator action. 
let mut launch_opts = cfg.browser.clone(); if let Some(proxy) = &cfg.proxy { - launch_opts.extra_args.push(format!("--proxy-server={proxy}")); + let chromium_proxy = crate::crawler::url_utils::chromium_proxy_arg(proxy); + launch_opts.extra_args.push(format!("--proxy-server={chromium_proxy}")); } let on_launch = match (&cfg.phpsessid, &cfg.cookie_domain, &cfg.start_url) { (Some(sid), Some(domain), Some(start_url)) => { diff --git a/backend/src/bin/crawler.rs b/backend/src/bin/crawler.rs index 1a62a10..15ca8d4 100644 --- a/backend/src/bin/crawler.rs +++ b/backend/src/bin/crawler.rs @@ -127,7 +127,8 @@ async fn main() -> anyhow::Result<()> { let mut options = LaunchOptions::from_env(); if let Some(proxy) = &proxy_url { - options.extra_args.push(format!("--proxy-server={proxy}")); + let chromium_proxy = mangalord::crawler::url_utils::chromium_proxy_arg(proxy); + options.extra_args.push(format!("--proxy-server={chromium_proxy}")); } let keep_open = match (keep_browser_open, options.mode) { (true, BrowserMode::Headed) => true, diff --git a/backend/src/crawler/url_utils.rs b/backend/src/crawler/url_utils.rs index 0d50ea4..23d1e47 100644 --- a/backend/src/crawler/url_utils.rs +++ b/backend/src/crawler/url_utils.rs @@ -91,6 +91,26 @@ pub fn registrable_domain(url: &str) -> Option { Some(format!(".{}", registrable.join("."))) } +/// Normalise a SOCKS proxy URL for Chromium's `--proxy-server=` flag. +/// +/// reqwest accepts both `socks5://` (resolve locally) and +/// `socks5h://` (resolve via the SOCKS server — important when the +/// proxy is TOR and we don't want the host's resolver to see the +/// target hostname). Chromium does **not** know the `socks5h` scheme +/// and refuses navigations with `ERR_NO_SUPPORTED_PROXIES`. It +/// already sends destination hostnames over SOCKS5 by default +/// regardless, so stripping the `h` is a pure scheme rename — the +/// remote-DNS behaviour is preserved. +/// +/// Non-SOCKS schemes pass through unchanged. 
/// Normalise a proxy URL for Chromium's `--proxy-server=` flag.
///
/// Chromium does not recognise the `socks5h` scheme and fails navigations
/// with `ERR_NO_SUPPORTED_PROXIES`, so a leading `socks5h://` is rewritten
/// to `socks5://`. Everything after the scheme separator is copied
/// verbatim.
///
/// The scheme is matched ASCII-case-insensitively: URL schemes are
/// case-insensitive (RFC 3986 §3.1), so `SOCKS5H://host:port` is the same
/// proxy as `socks5h://host:port` and must be translated as well.
///
/// Any other input (including `socks5://`, non-SOCKS schemes, and strings
/// too short to carry the prefix) is returned unchanged.
pub fn chromium_proxy_arg(proxy: &str) -> String {
    const SOCKS5H_PREFIX: &str = "socks5h://";
    const PREFIX_LEN: usize = SOCKS5H_PREFIX.len();

    // `str::get` yields `None` when the input is shorter than the prefix or
    // when the cut would land inside a multi-byte character, so arbitrary
    // operator-supplied input can never panic here.
    match (proxy.get(..PREFIX_LEN), proxy.get(PREFIX_LEN..)) {
        (Some(scheme), Some(rest)) if scheme.eq_ignore_ascii_case(SOCKS5H_PREFIX) => {
            format!("socks5://{rest}")
        }
        _ => proxy.to_string(),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Only the `chromium_proxy_arg` tests are listed here; the other tests
    // of this module sit outside the visible diff hunk.

    #[test]
    fn chromium_proxy_arg_strips_socks5h_to_socks5() {
        // Regression: passing socks5h:// to Chromium yields
        // ERR_NO_SUPPORTED_PROXIES at navigation time.
        assert_eq!(
            chromium_proxy_arg("socks5h://127.0.0.1:9050"),
            "socks5://127.0.0.1:9050"
        );
        assert_eq!(
            chromium_proxy_arg("socks5h://tor:9050"),
            "socks5://tor:9050"
        );
    }

    #[test]
    fn chromium_proxy_arg_matches_scheme_case_insensitively() {
        // Schemes are case-insensitive, so an upper- or mixed-case spelling
        // must be translated too; the remainder keeps its original case.
        assert_eq!(
            chromium_proxy_arg("SOCKS5H://Tor:9050"),
            "socks5://Tor:9050"
        );
        assert_eq!(
            chromium_proxy_arg("Socks5h://127.0.0.1:9050"),
            "socks5://127.0.0.1:9050"
        );
    }

    #[test]
    fn chromium_proxy_arg_passes_socks5_unchanged() {
        assert_eq!(
            chromium_proxy_arg("socks5://127.0.0.1:9050"),
            "socks5://127.0.0.1:9050"
        );
    }

    #[test]
    fn chromium_proxy_arg_passes_non_socks_unchanged() {
        assert_eq!(
            chromium_proxy_arg("http://proxy.example:8080"),
            "http://proxy.example:8080"
        );
    }

    #[test]
    fn chromium_proxy_arg_handles_short_and_non_ascii_input() {
        // Shorter than the prefix, and a prefix-length cut that would split
        // a multi-byte character: both pass through without panicking.
        assert_eq!(chromium_proxy_arg(""), "");
        assert_eq!(chromium_proxy_arg("socks5h:/"), "socks5h:/");
        assert_eq!(chromium_proxy_arg("socks5h:/é"), "socks5h:/é");
    }
}