Files
Mangalord/backend/src/crawler/url_utils.rs
MechaCat02 4e20350645
All checks were successful
deploy / test-backend (push) Successful in 19m30s
deploy / test-frontend (push) Successful in 9m42s
deploy / build-and-push (push) Successful in 8m10s
deploy / deploy (push) Successful in 15s
fix(crawler): translate socks5h:// → socks5:// for Chromium --proxy-server
Chromium doesn't know the socks5h scheme (curl/reqwest convention)
and bails navigations with ERR_NO_SUPPORTED_PROXIES. It does, however,
send destination hostnames over SOCKS5 by default, so stripping the
`h` is a pure scheme rename — remote-DNS behaviour is preserved.

reqwest keeps the user's original CRAWLER_PROXY string (`socks5h://...`
remains valid and meaningful for it).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-31 20:56:45 +02:00

245 lines
8.6 KiB
Rust

//! Centralised URL helpers for the crawler subsystem.
//!
//! Three near-identical hand-rolled URL parsers used to live in
//! `crawler::session`, `crawler::rate_limit`, and `crawler::pipeline`
//! respectively, each with subtly different edge-case behaviour
//! around port handling and IPv6 literals. They're consolidated here
//! so the divergence can't drift again.
//!
//! The hand-rolled implementations are kept intentionally — they
//! preserve the exact semantics every existing test pins. A future
//! refactor can switch to `reqwest::Url` if it can be done without
//! changing those semantics.
/// Extracts the lowercased host (no port, no userinfo) from an absolute URL.
///
/// Returns `None` when `url` has no `://` separator or when the host
/// component is empty — such inputs would never have reached the
/// network layer anyway. Used by the per-host rate limiter as its
/// bucket key, so two URLs pointing at the same host must always map
/// to the same string.
///
/// Parsing rules:
///
/// * The authority ends at the first `/`, `?` or `#`, so a URL with a
///   query or fragment but no path (`https://example.com?page=2`)
///   still yields `example.com` instead of leaking the query into the
///   key.
/// * Anything up to and including the last `@` in the authority is
///   userinfo and is dropped (`https://user:pw@example.com/` yields
///   `example.com`, not `user`).
/// * IPv6 literals are kept in their `[::1]` bracketed form so the
///   `rsplit_once(':')` port-stripping logic doesn't split inside the
///   address (e.g. `https://[::1]/foo` used to return `"[:"` because
///   the rightmost `:` is inside the literal). Buckets keyed by
///   `[::1]` vs `::1` are still unique per host; the brackets are
///   cosmetic.
pub fn host_of(url: &str) -> Option<String> {
    let (_, after_scheme) = url.split_once("://")?;
    // The authority component is terminated by the start of the path,
    // the query, or the fragment — whichever comes first.
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()?;
    // Drop optional `user[:password]@` credentials. Splitting on the
    // last `@` keeps a password that itself contains `@` out of the host.
    let host_with_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, host_part)| host_part);
    let host = if host_with_port.starts_with('[') {
        // IPv6 literal: keep through the closing bracket. There may
        // be a trailing `:port` after `]`; strip only that. An
        // unterminated literal is passed through untouched.
        match host_with_port.rfind(']') {
            Some(end) => &host_with_port[..=end],
            None => host_with_port,
        }
    } else {
        // Hostnames and IPv4 literals: trailing `:port` (if any) is
        // after the last `:`.
        host_with_port
            .rsplit_once(':')
            .map_or(host_with_port, |(h, _)| h)
    };
    (!host.is_empty()).then(|| host.to_ascii_lowercase())
}
/// Returns the `scheme://authority` prefix of an absolute URL.
///
/// The path, query and fragment are removed; the authority is kept
/// verbatim, so a port (and any userinfo) present in the input is
/// preserved and no case folding is applied. Used by the metadata
/// pass to seed `sources.base_url` from `CRAWLER_START_URL`.
///
/// Returns `None` when `url` has no `://` separator or when the
/// authority is empty (e.g. `https:///path`), matching how
/// [`host_of`] treats inputs without a `scheme://host` shape.
pub fn origin_of(url: &str) -> Option<String> {
    let (scheme, rest) = url.split_once("://")?;
    // The authority stops at the first `/`, `?` or `#`; cutting only
    // at `/` would leak the query or fragment of a path-less URL
    // (`https://example.com?q=1`) into the origin.
    let authority = rest
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()?;
    (!authority.is_empty()).then(|| format!("{scheme}://{authority}"))
}
/// Approximate registrable-domain calculation: take the last two
/// dot-labels of the host and prefix them with `.`. Used to set a
/// parent-domain cookie so the catalog's `www.` / `m.` redirects
/// don't drop the cookie mid-crawl.
///
/// Returns `None` when `url` has no `://` separator or when the host
/// component is empty.
///
/// Caveat: wrong for multi-part TLDs (`.co.uk`, `.com.br`). The
/// operator can override via `CRAWLER_COOKIE_DOMAIN`; pulling in the
/// Public Suffix List for one knob isn't worth it yet.
///
/// Special cases:
///
/// * Bare hostnames (e.g. `localhost`) return the host as-is, with no
///   leading dot — setting `.localhost` as a cookie domain is invalid.
/// * IP literals have no registrable domain and are returned
///   unchanged: IPv6 in its bracketed form (`[::1]`), IPv4 as the
///   dotted quad (`127.0.0.1`). Applying the "last two labels" rule
///   to an IPv4 address would otherwise produce nonsense such as
///   `.0.1`.
/// * The host is located the same way as in [`host_of`]: the
///   authority ends at the first `/`, `?` or `#`, userinfo before the
///   last `@` is dropped, and a trailing `:port` is stripped.
pub fn registrable_domain(url: &str) -> Option<String> {
    let (_, after_scheme) = url.split_once("://")?;
    // The authority component is terminated by the start of the path,
    // the query, or the fragment — whichever comes first.
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()?;
    // Drop optional `user[:password]@` credentials.
    let host_with_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, host_part)| host_part);
    let host_str = if host_with_port.starts_with('[') {
        // IPv6 literal: keep through the closing bracket; an optional
        // `:port` follows `]`.
        match host_with_port.rfind(']') {
            Some(end) => &host_with_port[..=end],
            None => host_with_port,
        }
    } else {
        host_with_port
            .rsplit_once(':')
            .map_or(host_with_port, |(h, _)| h)
    };
    let host = host_str.to_ascii_lowercase();
    if host.is_empty() {
        return None;
    }
    // IP literals are not domain names: hand them back untouched
    // instead of slicing them on `.` (which would also corrupt
    // IPv4-mapped IPv6 literals such as `[::ffff:192.168.0.1]`).
    if host.starts_with('[') || host.parse::<std::net::Ipv4Addr>().is_ok() {
        return Some(host);
    }
    // Empty labels (leading/trailing/double dots) are ignored when
    // counting, so `example.com.` still resolves to `.example.com`.
    let labels: Vec<&str> = host.split('.').filter(|l| !l.is_empty()).collect();
    if labels.len() < 2 {
        return Some(host);
    }
    let registrable = &labels[labels.len() - 2..];
    Some(format!(".{}", registrable.join(".")))
}
/// Rewrites a proxy URL into a form Chromium's `--proxy-server=` flag
/// accepts.
///
/// reqwest understands two SOCKS5 spellings: `socks5://` (hostnames
/// resolved locally) and `socks5h://` (hostnames resolved by the SOCKS
/// server — important when the proxy is TOR and the host's resolver
/// must not see the target hostname). Chromium does **not** recognise
/// `socks5h` and fails navigations with `ERR_NO_SUPPORTED_PROXIES`.
/// Because Chromium already sends destination hostnames over SOCKS5
/// by default, dropping the `h` is a pure scheme rename: remote-DNS
/// behaviour is preserved.
///
/// Only a leading, lowercase `socks5h://` is rewritten; every other
/// input (including `socks5://` and non-SOCKS schemes) is returned
/// unchanged.
pub fn chromium_proxy_arg(proxy: &str) -> String {
    match proxy.strip_prefix("socks5h://") {
        // Re-attach whatever followed the scheme under Chromium's spelling.
        Some(endpoint) => format!("socks5://{endpoint}"),
        None => proxy.to_owned(),
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests pinning the exact semantics of the URL helpers.
    //! Each test walks a small table of `(input, expected)` pairs.

    use super::*;

    #[test]
    fn host_of_strips_port_and_lowercases() {
        let cases = [
            ("https://CDN.Example.com:443/x", Some("cdn.example.com")),
            ("http://localhost/", Some("localhost")),
            ("not a url", None),
        ];
        for (url, expected) in cases {
            assert_eq!(host_of(url).as_deref(), expected);
        }
    }

    #[test]
    fn host_of_keeps_bracketed_ipv6_literal_intact() {
        // Regression: the previous implementation applied
        // rsplit_once(':') to the IPv6 address itself and produced
        // "[:" instead of "[::1]", so a real IPv6 source would
        // silently end up under a wrong rate-limit bucket key.
        let cases = [
            ("https://[::1]/", "[::1]"),
            ("https://[::1]:8080/", "[::1]"),
            ("https://[2001:db8::1]/foo", "[2001:db8::1]"),
            ("https://[2001:db8::1]:443/foo", "[2001:db8::1]"),
        ];
        for (url, expected) in cases {
            assert_eq!(host_of(url).as_deref(), Some(expected));
        }
    }

    #[test]
    fn origin_of_returns_scheme_and_host() {
        let origin = origin_of("https://example.com/some/path?q=1");
        assert_eq!(origin.as_deref(), Some("https://example.com"));
        assert!(origin_of("garbage").is_none());
    }

    #[test]
    fn registrable_domain_strips_subdomain() {
        let cases = [
            ("https://www.target-site.com/manga/foo/", ".target-site.com"),
            ("https://m.example.org", ".example.org"),
        ];
        for (url, expected) in cases {
            assert_eq!(registrable_domain(url).as_deref(), Some(expected));
        }
    }

    #[test]
    fn registrable_domain_keeps_two_label_host() {
        let domain = registrable_domain("https://example.com/");
        assert_eq!(domain.as_deref(), Some(".example.com"));
    }

    #[test]
    fn registrable_domain_handles_port() {
        let domain = registrable_domain("http://www.foo.bar:8080/x");
        assert_eq!(domain.as_deref(), Some(".foo.bar"));
    }

    #[test]
    fn registrable_domain_bare_hostname_no_leading_dot() {
        let domain = registrable_domain("http://localhost:5173");
        assert_eq!(domain.as_deref(), Some("localhost"));
    }

    #[test]
    fn registrable_domain_returns_none_for_garbage() {
        assert_eq!(registrable_domain("not a url"), None);
    }

    #[test]
    fn registrable_domain_keeps_bracketed_ipv6_literal_intact() {
        // Mirrors the IPv6 regression test for host_of. An IP is not
        // accepted as a cookie `Domain` value by the cookie-domain
        // code, but the function should still hand back a sensible
        // representation rather than the truncated `"[:"` the old
        // port-stripper produced.
        let cases = [
            ("https://[::1]/", "[::1]"),
            ("https://[::1]:8080/", "[::1]"),
            ("https://[2001:db8::1]/foo", "[2001:db8::1]"),
        ];
        for (url, expected) in cases {
            assert_eq!(registrable_domain(url).as_deref(), Some(expected));
        }
    }

    #[test]
    fn chromium_proxy_arg_strips_socks5h_to_socks5() {
        // Regression: handing socks5h:// to Chromium fails with
        // ERR_NO_SUPPORTED_PROXIES at navigation time.
        let cases = [
            ("socks5h://127.0.0.1:9050", "socks5://127.0.0.1:9050"),
            ("socks5h://tor:9050", "socks5://tor:9050"),
        ];
        for (proxy, expected) in cases {
            assert_eq!(chromium_proxy_arg(proxy), expected);
        }
    }

    #[test]
    fn chromium_proxy_arg_passes_socks5_unchanged() {
        let proxy = "socks5://127.0.0.1:9050";
        assert_eq!(chromium_proxy_arg(proxy), proxy);
    }

    #[test]
    fn chromium_proxy_arg_passes_non_socks_unchanged() {
        let proxy = "http://proxy.example:8080";
        assert_eq!(chromium_proxy_arg(proxy), proxy);
    }
}