Chromium doesn't know the socks5h scheme (curl/reqwest convention) and bails navigations with ERR_NO_SUPPORTED_PROXIES. It does, however, send destination hostnames over SOCKS5 by default, so stripping the `h` is a pure scheme rename — remote-DNS behaviour is preserved. reqwest keeps the user's original CRAWLER_PROXY string (`socks5h://...` remains valid and meaningful for it). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
245 lines
8.6 KiB
Rust
245 lines
8.6 KiB
Rust
//! Centralised URL helpers for the crawler subsystem.
|
|
//!
|
|
//! Three near-identical hand-rolled URL parsers used to live in
|
|
//! `crawler::session`, `crawler::rate_limit`, and `crawler::pipeline`
|
|
//! respectively, each with subtly different edge-case behaviour
|
|
//! around port handling and IPv6 literals. They're consolidated here
|
|
//! so the divergence can't drift again.
|
|
//!
|
|
//! The hand-rolled implementations are kept intentionally — they
|
|
//! preserve the exact semantics every existing test pins. A future
|
|
//! refactor can switch to `reqwest::Url` if it can be done without
|
|
//! changing those semantics.
|
|
|
|
/// Lowercased host with port and userinfo removed. Returns `None` for
/// inputs without a `scheme://host` shape — those would never have
/// reached the network layer anyway. Used by the per-host rate limiter
/// as its bucket key.
///
/// The authority component ends at the first `/`, `?` or `#` after
/// `://`, so a path-less URL such as `https://example.com?x=a:b` still
/// yields `example.com` rather than leaking query text (and its `:`)
/// into the key. A leading `userinfo@` is dropped for the same reason:
/// `https://user:pw@example.com/` must bucket under `example.com`, not
/// under `user`.
///
/// IPv6 literals are kept in their `[::1]` bracketed form so the
/// `rsplit_once(':')` port-stripping logic doesn't split inside the
/// address (e.g. `https://[::1]/foo` used to return `"[:"` because
/// the rightmost `:` is inside the literal). Buckets keyed by
/// `[::1]` vs `::1` are still uniquely-per-host; the brackets are
/// cosmetic.
pub fn host_of(url: &str) -> Option<String> {
    let after_scheme = url.split_once("://")?.1;

    // Authority runs up to the first path, query or fragment delimiter.
    let authority_end = after_scheme
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(after_scheme.len());
    let authority = &after_scheme[..authority_end];

    // Drop `userinfo@` if present; `@` cannot occur inside a host, so
    // the last one is the separator.
    let host_with_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, host_part)| host_part);

    let host = if host_with_port.starts_with('[') {
        // IPv6 literal: keep through the closing bracket. There may
        // be a trailing `:port` after `]`; strip only that.
        match host_with_port.rfind(']') {
            Some(end) => &host_with_port[..=end],
            None => host_with_port,
        }
    } else {
        // Hostnames and IPv4 literals: trailing `:port` (if any) is
        // after the last `:`.
        host_with_port
            .rsplit_once(':')
            .map_or(host_with_port, |(h, _)| h)
    };

    if host.is_empty() {
        None
    } else {
        Some(host.to_ascii_lowercase())
    }
}
|
|
|
|
/// `scheme://authority` of `url`: everything up to, but not including,
/// the first `/`, `?` or `#` that follows `://`. The port (and any
/// userinfo) is kept verbatim and nothing is lowercased. Used by the
/// metadata pass to seed `sources.base_url` from `CRAWLER_START_URL`.
///
/// Returns `None` when the input contains no `://` separator.
///
/// Cutting at `?` and `#` as well as `/` matters for path-less URLs:
/// `https://example.com?q=1` has the origin `https://example.com`, not
/// `https://example.com?q=1`.
pub fn origin_of(url: &str) -> Option<String> {
    let (scheme, rest) = url.split_once("://")?;
    // The authority ends at the first path, query or fragment delimiter,
    // or at the end of the string when none is present.
    let authority_end = rest
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(rest.len());
    let authority = &rest[..authority_end];
    Some(format!("{scheme}://{authority}"))
}
|
|
|
|
/// Approximate registrable-domain calculation: take the last two
/// dot-labels of the host, prefix with `.`. Used to set a parent-
/// domain cookie so the catalog's `www.` / `m.` redirects don't drop
/// the cookie mid-crawl.
///
/// Returns `None` when the input has no `://` separator or the host
/// is empty.
///
/// Caveat: wrong for multi-part TLDs (`.co.uk`, `.com.br`). The
/// operator can override via `CRAWLER_COOKIE_DOMAIN`; pulling in the
/// Public Suffix List for one knob isn't worth it yet.
///
/// Host extraction: the authority ends at the first `/`, `?` or `#`
/// after `://`, a leading `userinfo@` is dropped, and a trailing
/// `:port` is stripped. The result is ASCII-lowercased.
///
/// Hosts that are returned as-is, with no leading dot:
///
/// * Bare hostnames (e.g. `localhost`) — setting `.localhost` as a
///   cookie domain is invalid.
/// * IPv4 literals (e.g. `192.168.1.10`) — "last two labels" of an
///   address (`.1.10`) is meaningless.
/// * IPv6 literals (e.g. `[::1]`, `[::ffff:192.0.2.1]`) — returned
///   bracketed and unchanged; the browser will reject them as a cookie
///   `Domain` anyway, but the representation stays sensible. They are
///   returned before any dot-splitting so an embedded dotted quad is
///   never mistaken for domain labels.
pub fn registrable_domain(url: &str) -> Option<String> {
    let after_scheme = url.split_once("://")?.1;

    // Authority runs up to the first path, query or fragment delimiter.
    let authority_end = after_scheme
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(after_scheme.len());
    let authority = &after_scheme[..authority_end];

    // Drop `userinfo@` if present; `@` cannot occur inside a host.
    let host_with_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, host_part)| host_part);

    if host_with_port.starts_with('[') {
        // IPv6 literal: keep through the closing bracket; an optional
        // `:port` follows `]`. Returned immediately so the label logic
        // below never sees the address.
        let literal = match host_with_port.rfind(']') {
            Some(end) => &host_with_port[..=end],
            None => host_with_port,
        };
        return Some(literal.to_ascii_lowercase());
    }

    let host = host_with_port
        .rsplit_once(':')
        .map_or(host_with_port, |(h, _)| h)
        .to_ascii_lowercase();
    if host.is_empty() {
        return None;
    }

    // An IPv4 address has no registrable parent; hand it back untouched.
    if host.parse::<std::net::Ipv4Addr>().is_ok() {
        return Some(host);
    }

    // Empty labels (leading/trailing/double dots) are ignored.
    let labels: Vec<&str> = host.split('.').filter(|l| !l.is_empty()).collect();
    if labels.len() < 2 {
        return Some(host);
    }
    let registrable = &labels[labels.len() - 2..];
    Some(format!(".{}", registrable.join(".")))
}
|
|
|
|
/// Normalise a SOCKS proxy URL for Chromium's `--proxy-server=` flag.
///
/// reqwest accepts both `socks5://` (resolve locally) and
/// `socks5h://` (resolve via the SOCKS server — important when the
/// proxy is TOR and we don't want the host's resolver to see the
/// target hostname). Chromium does **not** know the `socks5h` scheme
/// and refuses navigations with `ERR_NO_SUPPORTED_PROXIES`. It
/// already sends destination hostnames over SOCKS5 by default
/// regardless, so stripping the `h` is a pure scheme rename — the
/// remote-DNS behaviour is preserved.
///
/// The scheme is matched ASCII-case-insensitively (URI schemes are
/// case-insensitive), so `SOCKS5H://tor:9050` is rewritten too; the
/// emitted scheme is always lowercase `socks5://`. Everything after
/// the `://` is copied verbatim.
///
/// Any input that does not start with `socks5h://` is returned
/// unchanged.
pub fn chromium_proxy_arg(proxy: &str) -> String {
    const SOCKS5H_PREFIX: &str = "socks5h://";

    // `get` yields `None` when the input is shorter than the prefix or
    // the cut would land inside a multi-byte character; neither can be
    // a `socks5h://` URL, so both fall through to the pass-through arm.
    match proxy.get(..SOCKS5H_PREFIX.len()) {
        Some(head) if head.eq_ignore_ascii_case(SOCKS5H_PREFIX) => {
            format!("socks5://{}", &proxy[SOCKS5H_PREFIX.len()..])
        }
        _ => proxy.to_string(),
    }
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Port is dropped, the host is lowercased, and non-URLs give `None`.
    #[test]
    fn host_of_strips_port_and_lowercases() {
        let with_port = host_of("https://CDN.Example.com:443/x");
        assert_eq!(with_port, Some(String::from("cdn.example.com")));

        let bare = host_of("http://localhost/");
        assert_eq!(bare, Some(String::from("localhost")));

        assert!(host_of("not a url").is_none());
    }

    /// Regression: the old impl rsplit_once(':')'d the IPv6 address,
    /// returning "[:" instead of "[::1]". A real IPv6 source would
    /// silently get a wrong rate-limit bucket key.
    #[test]
    fn host_of_keeps_bracketed_ipv6_literal_intact() {
        let cases = [
            ("https://[::1]/", "[::1]"),
            ("https://[::1]:8080/", "[::1]"),
            ("https://[2001:db8::1]/foo", "[2001:db8::1]"),
            ("https://[2001:db8::1]:443/foo", "[2001:db8::1]"),
        ];
        for &(url, want) in &cases {
            assert_eq!(host_of(url).as_deref(), Some(want), "url: {}", url);
        }
    }

    /// Path and query are dropped; input without `://` gives `None`.
    #[test]
    fn origin_of_returns_scheme_and_host() {
        let origin = origin_of("https://example.com/some/path?q=1");
        assert_eq!(origin, Some(String::from("https://example.com")));

        assert!(origin_of("garbage").is_none());
    }

    /// Subdomains collapse to the dotted two-label parent.
    #[test]
    fn registrable_domain_strips_subdomain() {
        let cases = [
            ("https://www.target-site.com/manga/foo/", ".target-site.com"),
            ("https://m.example.org", ".example.org"),
        ];
        for &(url, want) in &cases {
            assert_eq!(
                registrable_domain(url).as_deref(),
                Some(want),
                "url: {}",
                url
            );
        }
    }

    /// A host that already has exactly two labels only gains the dot.
    #[test]
    fn registrable_domain_keeps_two_label_host() {
        let domain = registrable_domain("https://example.com/");
        assert_eq!(domain, Some(String::from(".example.com")));
    }

    /// A `:port` suffix does not end up in the cookie domain.
    #[test]
    fn registrable_domain_handles_port() {
        let domain = registrable_domain("http://www.foo.bar:8080/x");
        assert_eq!(domain, Some(String::from(".foo.bar")));
    }

    /// Single-label hosts are returned without a leading dot.
    #[test]
    fn registrable_domain_bare_hostname_no_leading_dot() {
        let domain = registrable_domain("http://localhost:5173");
        assert_eq!(domain, Some(String::from("localhost")));
    }

    /// Input without `://` gives `None`.
    #[test]
    fn registrable_domain_returns_none_for_garbage() {
        assert_eq!(registrable_domain("not a url"), None);
    }

    /// Symmetric with host_of's IPv6 fix. The cookie-domain code
    /// won't accept an IP as a `Domain` value, but the function
    /// should at least return a sensible representation rather
    /// than the truncated `"[:"` the old port-stripper produced.
    #[test]
    fn registrable_domain_keeps_bracketed_ipv6_literal_intact() {
        let cases = [
            ("https://[::1]/", "[::1]"),
            ("https://[::1]:8080/", "[::1]"),
            ("https://[2001:db8::1]/foo", "[2001:db8::1]"),
        ];
        for &(url, want) in &cases {
            assert_eq!(
                registrable_domain(url).as_deref(),
                Some(want),
                "url: {}",
                url
            );
        }
    }

    /// Regression: passing socks5h:// to Chromium yields
    /// ERR_NO_SUPPORTED_PROXIES at navigation time.
    #[test]
    fn chromium_proxy_arg_strips_socks5h_to_socks5() {
        let cases = [
            ("socks5h://127.0.0.1:9050", "socks5://127.0.0.1:9050"),
            ("socks5h://tor:9050", "socks5://tor:9050"),
        ];
        for &(proxy, want) in &cases {
            assert_eq!(chromium_proxy_arg(proxy), want, "proxy: {}", proxy);
        }
    }

    /// Plain `socks5://` is already what Chromium wants.
    #[test]
    fn chromium_proxy_arg_passes_socks5_unchanged() {
        let proxy = "socks5://127.0.0.1:9050";
        assert_eq!(chromium_proxy_arg(proxy), proxy);
    }

    /// Non-SOCKS proxies are not touched.
    #[test]
    fn chromium_proxy_arg_passes_non_socks_unchanged() {
        let proxy = "http://proxy.example:8080";
        assert_eq!(chromium_proxy_arg(proxy), proxy);
    }
}
|