chromiumoxide/handler/
network_utils.rs

1use std::borrow::Cow;
2
/// Removes a leading `blob:` wrapper and then a leading `filesystem:` wrapper,
/// so the inner URL (e.g. `blob:https://example.com/x`) can be parsed normally.
#[inline]
fn strip_special_schemes(url: &str) -> &str {
    let url = url.strip_prefix("blob:").unwrap_or(url);
    url.strip_prefix("filesystem:").unwrap_or(url)
}

/// Splits a URL into `(host_without_port, rest)`, where `rest` starts at the
/// first `/`, `?` or `#` after the authority (or is empty when there is none).
///
/// Handles `blob:` / `filesystem:` wrappers, protocol-relative URLs (`//host/..`),
/// userinfo (`user:pass@host`), bracketed IPv6 literals (returned without the
/// brackets) and ports.
///
/// Returns `None` when:
/// - the input has no authority marker (`scheme://` or a leading `//`),
/// - the first `://` only appears after a `/`, `?` or `#`, i.e. it belongs to a
///   path, query or fragment rather than to the URL's own scheme,
/// - the authority or the host is empty, or an IPv6 literal is missing its `]`.
#[inline]
pub fn host_and_rest(url: &str) -> Option<(&str, &str)> {
    fn is_authority_delimiter(c: char) -> bool {
        matches!(c, '/' | '?' | '#')
    }

    let url = strip_special_schemes(url);

    // Check the protocol-relative form first: such a URL may legitimately carry
    // another absolute URL in its path or query, whose `://` must not be mistaken
    // for this URL's scheme separator.
    let host_start = if url.starts_with("//") {
        2
    } else {
        let sep = url.find("://")?;
        // A real scheme separator precedes every path/query/fragment delimiter.
        if url[..sep].contains(is_authority_delimiter) {
            return None;
        }
        sep + 3
    };

    // The authority runs up to the first `/`, `?` or `#`; everything after is `rest`.
    let after_scheme = &url[host_start..];
    let authority_len = after_scheme
        .find(is_authority_delimiter)
        .unwrap_or(after_scheme.len());
    let (authority, rest) = after_scheme.split_at(authority_len);
    if authority.is_empty() {
        return None;
    }

    // Drop userinfo if present: `user:pass@host` -> `host`.
    let authority = authority.rsplit('@').next().unwrap_or(authority);

    // IPv6 literal: `[::1]:8080` -> `::1`.
    if let Some(bracketed) = authority.strip_prefix('[') {
        let close = bracketed.find(']')?;
        let host = &bracketed[..close];
        // Same rule as for hostnames below: an empty host is not a host.
        if host.is_empty() {
            return None;
        }
        return Some((host, rest));
    }

    // IPv4 / hostname: `host:port` -> `host`.
    let host = authority.split(':').next().unwrap_or(authority);
    if host.is_empty() {
        return None;
    }

    Some((host, rest))
}
59
/// Returns `true` when `a` and `b` have the same length and are byte-for-byte
/// equal once ASCII letters are case-folded. Non-ASCII bytes must match exactly.
#[inline]
fn eq_ignore_ascii_case(a: &str, b: &str) -> bool {
    a.as_bytes().eq_ignore_ascii_case(b.as_bytes())
}
68
/// Returns `true` when the trailing `suf.len()` bytes of `hay` equal `suf`
/// under ASCII case folding. A suffix longer than `hay` never matches; an
/// empty suffix always does.
#[inline]
fn ends_with_ignore_ascii_case(hay: &str, suf: &str) -> bool {
    let (hay, suf) = (hay.as_bytes(), suf.as_bytes());
    match hay.len().checked_sub(suf.len()) {
        // Compare on bytes so the tail may start anywhere, not only on a char boundary.
        Some(tail_start) => hay[tail_start..].eq_ignore_ascii_case(suf),
        None => false,
    }
}
80
81/// Host matches base if host == base OR host ends with ".{base}" (case-insensitive),
82/// with a required dot boundary to prevent "evil-logrocket.com" matching "logrocket.com".
83#[inline]
84pub fn host_is_subdomain_of(host: &str, base: &str) -> bool {
85    let host = host.trim_end_matches('.');
86    let base = base.trim_end_matches('.');
87
88    if base.is_empty() {
89        return false;
90    }
91
92    if eq_ignore_ascii_case(host, base) {
93        return true;
94    }
95
96    if host.len() <= base.len() {
97        return false;
98    }
99
100    let dot_pos = host.len() - base.len() - 1;
101    host.as_bytes().get(dot_pos) == Some(&b'.') && ends_with_ignore_ascii_case(host, base)
102}
103
104/// Common subdomain labels.
105static COMMON_SUBDOMAIN_LABELS: phf::Set<&'static str> = phf::phf_set! {
106    "www","m","amp","api","cdn","static","assets","img","images","media","files",
107    "login","auth","sso","id","account","accounts",
108    "app","apps","dashboard","admin","portal","console",
109    "status","support","help","docs","blog",
110    "dev","staging","stage","test","qa","uat","beta","alpha","preview","demo","sandbox",
111    "uploads","download","storage","origin","edge","cache",
112    "mail","email","smtp","mx","webmail",
113    "graphql","rpc","ws",
114};
115
116#[inline]
117/// Common sub domains.
118fn is_common_subdomain_label(lbl: &str) -> bool {
119    if lbl.is_empty() {
120        return false;
121    }
122    let lower = lbl.to_ascii_lowercase(); // alloc
123    COMMON_SUBDOMAIN_LABELS.contains(lower.as_str())
124}
125
126#[inline]
127pub fn base_domain_from_url<'a>(main_url: &'a str) -> Option<&'a str> {
128    let (host, _) = host_and_rest(main_url)?;
129    Some(base_domain_from_host(host))
130}
131
132/// Given a base domain (already computed) and a URL, returns the “relative” path
133/// for same-site/subdomain URLs, otherwise returns the original URL.
134#[inline]
135pub fn rel_for_ignore_script<'a>(base_domain: &str, url: &'a str) -> Cow<'a, str> {
136    if url.starts_with('/') {
137        return Cow::Borrowed(url);
138    }
139
140    let base = base_domain.trim_end_matches('.');
141    if base.is_empty() {
142        return Cow::Borrowed(url);
143    }
144
145    if let Some((host, rest)) = host_and_rest(url) {
146        if host_is_subdomain_of(host, base) {
147            // Convert same-site absolute URL into a path-like string.
148            if rest.starts_with('/') {
149                return Cow::Borrowed(rest);
150            }
151            // e.g. "https://x.com?y" or "https://x.com#y"
152            return Cow::Borrowed("/");
153        }
154    }
155
156    Cow::Borrowed(url)
157}
158
/// Returns `true` when `sld` is, ASCII case-insensitively, one of a fixed list
/// of second-level labels (`co`, `ac`, `com`, `gov`, `gouv`, ...).
/// `base_domain_from_host` treats these as part of the suffix when they sit
/// under a two-letter TLD.
#[inline]
fn is_common_cc_sld(sld: &str) -> bool {
    const COMMON_CC_SLDS: &[&str] = &[
        // two letters
        "co", "ac", "go", "or", "ne", "ed", "gr", "lg", "ad",
        // three letters: globally common
        "com", "net", "org", "gov", "edu", "mil", "nic", "sch",
        // three letters: MX / some LATAM
        "gob",
        // four letters (seen in some places)
        "gouv",
    ];

    COMMON_CC_SLDS
        .iter()
        .any(|known| sld.eq_ignore_ascii_case(known))
}
206
207#[inline]
208/// Get the base “site” domain from a host.
209///
210/// - Normal sites: `staging.logrocket.com` -> `logrocket.com`
211/// - ccTLD-ish: `a.b.example.co.uk` -> `example.co.uk` (existing heuristic)
212/// - Multi-tenant SaaS: `logrocket.chilipiper.com` -> `logrocket.chilipiper.com`
213///   (keeps one extra label when it looks like a tenant, not `www`/`cdn`/etc.)
214pub fn base_domain_from_host(host: &str) -> &str {
215    let mut h = host.trim_end_matches('.');
216    if let Some(x) = h.strip_prefix("www.") {
217        h = x;
218    }
219    if let Some(x) = h.strip_prefix("m.") {
220        h = x;
221    }
222
223    // Find last two dots
224    let last_dot = match h.rfind('.') {
225        Some(p) => p,
226        None => return h,
227    };
228    let prev_dot = match h[..last_dot].rfind('.') {
229        Some(p) => p,
230        None => return h, // only 1 dot
231    };
232
233    let tld = &h[last_dot + 1..];
234    let sld = &h[prev_dot + 1..last_dot];
235
236    let mut base = &h[prev_dot + 1..]; // "example.com" or "co.uk"
237
238    if tld.len() == 2 && is_common_cc_sld(sld) {
239        if let Some(prev2_dot) = h[..prev_dot].rfind('.') {
240            base = &h[prev2_dot + 1..]; // "example.co.uk"
241        }
242    }
243
244    if h.len() > base.len() + 1 {
245        let base_start = h.len() - base.len();
246        let boundary = base_start - 1;
247        if h.as_bytes().get(boundary) == Some(&b'.') {
248            let left_part = &h[..boundary];
249            // label immediately to the left of base
250            let (lbl_start, lbl) = match left_part.rfind('.') {
251                Some(p) => (p + 1, &left_part[p + 1..]),
252                None => (0, left_part),
253            };
254
255            if !lbl.is_empty() && !is_common_subdomain_label(lbl) {
256                // return "tenant.base" => slice starting at lbl_start
257                return &h[lbl_start..];
258            }
259        }
260    }
261
262    base
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// The base itself and subdomains at any depth match, ignoring ASCII case.
    #[test]
    fn test_domain_match_basic_and_subdomains() {
        let base = "logrocket.com";

        for host in &[
            "logrocket.com",
            "staging.logrocket.com",
            "a.b.c.logrocket.com",
        ] {
            assert!(host_is_subdomain_of(host, base), "{} should match", host);
        }

        // case-insensitive
        assert!(host_is_subdomain_of("StAgInG.LoGrOcKeT.CoM", "LOGROCKET.COM"));
    }

    /// Look-alike hosts without a dot boundary in front of the base are rejected.
    #[test]
    fn test_domain_match_no_false_positives() {
        let base = "logrocket.com";

        // must be dot-boundary
        for host in &[
            "evil-logrocket.com",
            "logrocket.com.evil.com",
            "staginglogrocket.com",
            "logrocket.co",
        ] {
            assert!(!host_is_subdomain_of(host, base), "{} must not match", host);
        }
    }

    /// Userinfo and port are dropped; IPv6 literals lose their brackets.
    #[test]
    fn test_host_and_rest_handles_userinfo_port_ipv6() {
        let cases = [
            (
                "https://user:pass@staging.logrocket.com:8443/a.js?x=1#y",
                "staging.logrocket.com",
                "/a.js?x=1#y",
            ),
            ("http://[::1]:8080/path", "::1", "/path"),
        ];

        for &(url, want_host, want_rest) in &cases {
            let (h, rest) = host_and_rest(url).unwrap();
            assert_eq!(h, want_host);
            assert_eq!(rest, want_rest);
        }
    }

    /// Same-site URLs collapse to their path; other inputs pass through.
    #[test]
    fn test_rel_for_ignore_script_logrocket_example() {
        let base = "logrocket.com";

        let cases = [
            ("https://logrocket.com/careers", "/careers"),
            (
                "https://staging.logrocket.com/LogRocket.min.js",
                "/LogRocket.min.js",
            ),
            // Different site stays absolute
            ("https://cdn.other.com/app.js", "https://cdn.other.com/app.js"),
            // Root-relative stays as-is
            ("/static/app.js", "/static/app.js"),
        ];

        for &(input, expected) in &cases {
            assert_eq!(rel_for_ignore_script(base, input).as_ref(), expected);
        }
    }

    /// A same-site URL with a query but no path becomes "/".
    #[test]
    fn test_rel_for_ignore_script_query_only_same_site() {
        let rel = rel_for_ignore_script("example.com", "https://sub.example.com?x=1");
        assert_eq!(rel.as_ref(), "/");
    }

    /// `blob:` wrappers are looked through before host matching.
    #[test]
    fn test_rel_for_ignore_script_special_schemes() {
        let rel = rel_for_ignore_script("example.com", "blob:https://example.com/path/to/blob");
        assert_eq!(rel.as_ref(), "/path/to/blob");
    }

    /// A tenant label is kept as part of the base domain.
    #[test]
    fn test_base_domain_tenant_subdomain() {
        let base = base_domain_from_host("logrocket.chilipiper.com");
        assert_eq!(base, "logrocket.chilipiper.com");

        // same tenant (subdomain) becomes relative
        let same_tenant = "https://assets.logrocket.chilipiper.com/a.js";
        assert_eq!(rel_for_ignore_script(base, same_tenant).as_ref(), "/a.js");

        // different tenant must NOT match
        let other_tenant = "https://othertenant.chilipiper.com/a.js";
        assert_eq!(
            rel_for_ignore_script(base, other_tenant).as_ref(),
            other_tenant
        );
    }
}