//! Centralised URL helpers for the crawler subsystem.
//!
//! Three near-identical hand-rolled URL parsers used to live in
//! `crawler::session`, `crawler::rate_limit`, and `crawler::pipeline`
//! respectively, each with subtly different edge-case behaviour
//! around port handling and IPv6 literals. They're consolidated here
//! so the divergence can't drift again.
//!
//! The hand-rolled implementations are kept intentionally — they
//! preserve the exact semantics every existing test pins. A future
//! refactor can switch to `reqwest::Url` if it can be done without
//! changing those semantics.

/// Splits `url` at its first `://` into `(scheme, authority)`.
///
/// The authority is everything after `://` up to (not including) the
/// first `/`, `?` or `#`, or the end of the input. Returns `None` when
/// the input contains no `://`. The authority may be empty
/// (e.g. `https:///x`); callers decide what to do with that.
fn split_scheme_and_authority(url: &str) -> Option<(&str, &str)> {
    let (scheme, rest) = url.split_once("://")?;
    // A query or fragment can follow the host directly with no path in
    // between (`https://example.com?q=1`), so `/` alone is not enough.
    let end = rest
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(rest.len());
    Some((scheme, &rest[..end]))
}

/// Removes a trailing `:port` from an authority, if there is one.
///
/// A `userinfo@` prefix is not recognised or stripped.
fn strip_port(authority: &str) -> &str {
    if authority.starts_with('[') {
        // IPv6 literal: keep through the closing bracket. There may
        // be a trailing `:port` after `]`; strip only that. Splitting
        // on the last `:` instead would cut inside the address.
        match authority.rfind(']') {
            Some(close) => &authority[..=close],
            None => authority,
        }
    } else {
        // Hostnames and IPv4 literals: trailing `:port` (if any) is
        // after the last `:`.
        authority
            .rsplit_once(':')
            .map_or(authority, |(host, _port)| host)
    }
}

/// Lowercased host (no port). Returns `None` for inputs without a
/// `scheme://host` shape — those would never have reached the network
/// layer anyway. Used by the per-host rate limiter as its bucket key.
///
/// The host ends at the first `/`, `?` or `#` after `://`.
///
/// IPv6 literals are kept in their `[::1]` bracketed form so the
/// `rsplit_once(':')` port-stripping logic doesn't split inside the
/// address (e.g. `https://[::1]/foo` used to return `"[:"` because
/// the rightmost `:` is inside the literal). Buckets keyed by
/// `[::1]` vs `::1` are still uniquely-per-host; the brackets are
/// cosmetic.
pub fn host_of(url: &str) -> Option<String> {
    let (_scheme, authority) = split_scheme_and_authority(url)?;
    let host = strip_port(authority);
    (!host.is_empty()).then(|| host.to_ascii_lowercase())
}

/// `scheme://host[:port]` — the path, query and fragment are dropped;
/// the port (if any) is kept and nothing is case-normalised. Used by
/// the metadata pass to seed `sources.base_url` from
/// `CRAWLER_START_URL`.
///
/// Returns `None` when the input contains no `://`.
pub fn origin_of(url: &str) -> Option<String> {
    let (scheme, authority) = split_scheme_and_authority(url)?;
    Some(format!("{scheme}://{authority}"))
}

/// Approximate registrable-domain calculation: take the last two
/// dot-labels of the host, prefix with `.`. Used to set a parent-
/// domain cookie so the catalog's `www.` / `m.` redirects don't drop
/// the cookie mid-crawl.
///
/// Caveat: wrong for multi-part TLDs (`.co.uk`, `.com.br`). The
/// operator can override via `CRAWLER_COOKIE_DOMAIN`; pulling in the
/// Public Suffix List for one knob isn't worth it yet.
///
/// Bare hostnames (e.g. `localhost`) return the host as-is, with no
/// leading dot — setting `.localhost` as a cookie domain is invalid.
///
/// IP literals have no parent domain and are returned unchanged:
/// IPv6 stays bracketed (`[::1]`), IPv4 stays dotted
/// (`192.168.1.10`, not `.1.10`). Host extraction is delegated to
/// [`host_of`] so port and IPv6 handling cannot diverge between the
/// two functions.
///
/// Returns `None` when the input has no `://` or an empty host.
pub fn registrable_domain(url: &str) -> Option<String> {
    let host = host_of(url)?;

    // Dot-label arithmetic is meaningless on an address: it would turn
    // `192.168.1.10` into `.1.10` and `[::ffff:192.168.1.1]` into `.1.1]`.
    if host.starts_with('[') || host.parse::<std::net::Ipv4Addr>().is_ok() {
        return Some(host);
    }

    // Empty labels (leading/trailing/double dots) are ignored.
    let labels: Vec<&str> = host.split('.').filter(|l| !l.is_empty()).collect();
    if labels.len() < 2 {
        return Some(host);
    }
    let registrable = &labels[labels.len() - 2..];
    Some(format!(".{}", registrable.join(".")))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn host_of_strips_port_and_lowercases() {
        assert_eq!(
            host_of("https://CDN.Example.com:443/x").as_deref(),
            Some("cdn.example.com")
        );
        assert_eq!(host_of("http://localhost/").as_deref(), Some("localhost"));
        assert_eq!(host_of("not a url"), None);
    }

    #[test]
    fn host_of_keeps_bracketed_ipv6_literal_intact() {
        // Regression: the old impl rsplit_once(':')'d the IPv6 address,
        // returning "[:" instead of "[::1]". A real IPv6 source would
        // silently get a wrong rate-limit bucket key.
        assert_eq!(host_of("https://[::1]/").as_deref(), Some("[::1]"));
        assert_eq!(host_of("https://[::1]:8080/").as_deref(), Some("[::1]"));
        assert_eq!(
            host_of("https://[2001:db8::1]/foo").as_deref(),
            Some("[2001:db8::1]")
        );
        assert_eq!(
            host_of("https://[2001:db8::1]:443/foo").as_deref(),
            Some("[2001:db8::1]")
        );
    }

    #[test]
    fn host_of_stops_at_query_and_fragment() {
        // A URL may carry a query or fragment with no path at all.
        assert_eq!(
            host_of("https://example.com?q=1").as_deref(),
            Some("example.com")
        );
        assert_eq!(
            host_of("https://example.com:8443#top").as_deref(),
            Some("example.com")
        );
    }

    #[test]
    fn origin_of_returns_scheme_and_host() {
        assert_eq!(
            origin_of("https://example.com/some/path?q=1").as_deref(),
            Some("https://example.com")
        );
        assert_eq!(origin_of("garbage"), None);
    }

    #[test]
    fn origin_of_keeps_port_and_drops_query_without_path() {
        assert_eq!(
            origin_of("https://example.com:8080?q=1").as_deref(),
            Some("https://example.com:8080")
        );
        assert_eq!(
            origin_of("https://example.com#frag").as_deref(),
            Some("https://example.com")
        );
    }

    #[test]
    fn registrable_domain_strips_subdomain() {
        assert_eq!(
            registrable_domain("https://www.target-site.com/manga/foo/").as_deref(),
            Some(".target-site.com")
        );
        assert_eq!(
            registrable_domain("https://m.example.org").as_deref(),
            Some(".example.org")
        );
    }

    #[test]
    fn registrable_domain_keeps_two_label_host() {
        assert_eq!(
            registrable_domain("https://example.com/").as_deref(),
            Some(".example.com")
        );
    }

    #[test]
    fn registrable_domain_handles_port() {
        assert_eq!(
            registrable_domain("http://www.foo.bar:8080/x").as_deref(),
            Some(".foo.bar")
        );
    }

    #[test]
    fn registrable_domain_bare_hostname_no_leading_dot() {
        assert_eq!(
            registrable_domain("http://localhost:5173").as_deref(),
            Some("localhost")
        );
    }

    #[test]
    fn registrable_domain_returns_none_for_garbage() {
        assert!(registrable_domain("not a url").is_none());
    }

    #[test]
    fn registrable_domain_keeps_bracketed_ipv6_literal_intact() {
        // Symmetric with host_of's IPv6 fix. The cookie-domain code
        // won't accept an IP as a `Domain` value, but the function
        // should at least return a sensible representation rather
        // than the truncated `"[:"` the old port-stripper produced.
        assert_eq!(
            registrable_domain("https://[::1]/").as_deref(),
            Some("[::1]")
        );
        assert_eq!(
            registrable_domain("https://[::1]:8080/").as_deref(),
            Some("[::1]")
        );
        assert_eq!(
            registrable_domain("https://[2001:db8::1]/foo").as_deref(),
            Some("[2001:db8::1]")
        );
    }

    #[test]
    fn registrable_domain_returns_ip_literals_unchanged() {
        // Regression: dot-label slicing produced ".1.10" for IPv4 and
        // ".1.1]" for an IPv6 literal with an embedded dotted quad.
        assert_eq!(
            registrable_domain("http://192.168.1.10:8080/x").as_deref(),
            Some("192.168.1.10")
        );
        assert_eq!(
            registrable_domain("http://[::ffff:192.168.1.1]/").as_deref(),
            Some("[::ffff:192.168.1.1]")
        );
    }
}