Skip to main content

chromiumoxide/handler/
network_utils.rs

1use std::borrow::Cow;
2
3use memchr::{memchr, memchr3, memrchr};
4
/// Removes a leading `blob:` wrapper and then a leading `filesystem:` wrapper
/// from `url`, in that order, returning the remainder unchanged otherwise.
#[inline]
fn strip_special_schemes(url: &str) -> &str {
    // Order matters: `blob:` is tried first, `filesystem:` second, each at most once.
    ["blob:", "filesystem:"]
        .iter()
        .fold(url, |rest, prefix| rest.strip_prefix(*prefix).unwrap_or(rest))
}
10
11/// Returns (host_without_port, rest_starting_at_/ ? # or empty)
12/// Robust: handles protocol-relative, userinfo, IPv6 literals, ports.
13#[inline]
14pub fn host_and_rest(url: &str) -> Option<(&str, &str)> {
15    let url = strip_special_schemes(url);
16    let bytes = url.as_bytes();
17
18    let host_start = if let Some(pos) = memchr(b':', bytes) {
19        if bytes.get(pos + 1) == Some(&b'/') && bytes.get(pos + 2) == Some(&b'/') {
20            pos + 3
21        } else if bytes.starts_with(b"//") {
22            2
23        } else {
24            return None;
25        }
26    } else if bytes.starts_with(b"//") {
27        2
28    } else {
29        return None;
30    };
31
32    // End of authority: find first of / ? # after host_start in one SIMD pass.
33    let rest_start =
34        memchr3(b'/', b'?', b'#', &bytes[host_start..]).map_or(url.len(), |i| host_start + i);
35
36    let authority = &url[host_start..rest_start];
37    if authority.is_empty() {
38        return None;
39    }
40
41    // Drop userinfo if present: user:pass@host
42    let authority = match memrchr(b'@', authority.as_bytes()) {
43        Some(pos) => &authority[pos + 1..],
44        None => authority,
45    };
46
47    let ab = authority.as_bytes();
48
49    // IPv6: [::1]:8080
50    if ab.first() == Some(&b'[') {
51        let close = memchr(b']', ab)?;
52        let host = &authority[1..close];
53        return Some((host, &url[rest_start..]));
54    }
55
56    // IPv4/hostname: host:port
57    let host_end = memchr(b':', ab).unwrap_or(ab.len());
58    let host = &authority[..host_end];
59    if host.is_empty() {
60        return None;
61    }
62
63    Some((host, &url[rest_start..]))
64}
65
/// Compares two strings for equality, treating ASCII letters case-insensitively.
/// Non-ASCII bytes must match exactly.
#[inline]
fn eq_ignore_ascii_case(a: &str, b: &str) -> bool {
    let (left, right) = (a.as_bytes(), b.as_bytes());
    left.eq_ignore_ascii_case(right)
}
70
/// Returns `true` when `hay` ends with `suf`, treating ASCII letters
/// case-insensitively. Non-ASCII bytes must match exactly, and an empty `suf`
/// matches every `hay`.
///
/// The comparison is done on bytes rather than on a `str` slice: the cut point
/// `hay.len() - suf.len()` may fall inside a multi-byte character, where
/// slicing a `str` would panic.
#[inline]
pub fn ends_with_ignore_ascii_case(hay: &str, suf: &str) -> bool {
    let (hay, suf) = (hay.as_bytes(), suf.as_bytes());
    match hay.len().checked_sub(suf.len()) {
        Some(start) => hay[start..].eq_ignore_ascii_case(suf),
        // `suf` is longer than `hay`.
        None => false,
    }
}
78
79#[inline]
80pub fn base_domain_from_any(s: &str) -> &str {
81    if let Some((h, _)) = host_and_rest(s) {
82        base_domain_from_host(h)
83    } else {
84        base_domain_from_host(s)
85    }
86}
87
88#[inline]
89pub fn first_label(host: &str) -> &str {
90    let h = host.trim_end_matches('.');
91    match memchr(b'.', h.as_bytes()) {
92        Some(i) => &h[..i],
93        None => h,
94    }
95}
96
97#[inline]
98pub fn host_contains_label_icase(host: &str, label: &str) -> bool {
99    let host = host.trim_end_matches('.');
100    let label = label.trim_matches('.');
101
102    if host.is_empty() || label.is_empty() {
103        return false;
104    }
105
106    let hb = host.as_bytes();
107    let lb = label.as_bytes();
108
109    // Use memchr to jump between dots instead of scanning byte-by-byte.
110    let mut start = 0usize;
111
112    // Skip leading dots.
113    while start < hb.len() && hb[start] == b'.' {
114        start += 1;
115    }
116
117    while start < hb.len() {
118        let end = memchr(b'.', &hb[start..]).map_or(hb.len(), |i| start + i);
119
120        if end - start == lb.len() && hb[start..end].eq_ignore_ascii_case(lb) {
121            return true;
122        }
123
124        // Skip past the dot and any consecutive dots.
125        start = end + 1;
126        while start < hb.len() && hb[start] == b'.' {
127            start += 1;
128        }
129    }
130
131    false
132}
133
134/// Host matches base if host == base OR host ends with ".{base}" (case-insensitive),
135/// with a required dot boundary to prevent "evil-mainr.com" matching "mainr.com".
136#[inline]
137pub fn host_is_subdomain_of(host: &str, base: &str) -> bool {
138    let host = host.trim_end_matches('.');
139    let base = base.trim_end_matches('.');
140
141    if base.is_empty() {
142        return false;
143    }
144
145    if eq_ignore_ascii_case(host, base) {
146        return true;
147    }
148
149    if host.len() <= base.len() {
150        return false;
151    }
152
153    let dot_pos = host.len() - base.len() - 1;
154    host.as_bytes().get(dot_pos) == Some(&b'.') && ends_with_ignore_ascii_case(host, base)
155}
156
157/// Common subdomain labels.
158static COMMON_SUBDOMAIN_LABELS: phf::Set<&'static str> = phf::phf_set! {
159    "www","m","amp","api","cdn","static","assets","img","images","media","files",
160    "login","auth","sso","id","account","accounts",
161    "app","apps","dashboard","admin","portal","console",
162    "status","support","help","docs","blog",
163    "dev","staging","stage","test","qa","uat","beta","alpha","preview","demo","sandbox",
164    "uploads","download","storage","origin","edge","cache",
165    "mail","email","smtp","mx","webmail",
166    "graphql","rpc","ws",
167};
168
169#[inline]
170/// Common sub domains.
171fn is_common_subdomain_label(lbl: &str) -> bool {
172    if lbl.is_empty() {
173        return false;
174    }
175    let lower = lbl.to_ascii_lowercase();
176    COMMON_SUBDOMAIN_LABELS.contains(lower.as_str())
177}
178
179#[inline]
180pub fn base_domain_from_url(main_url: &str) -> Option<&str> {
181    let (host, _) = host_and_rest(main_url)?;
182    Some(base_domain_from_host(host))
183}
184
185/// Given a base domain (already computed) and a URL, returns the “relative” path
186/// for same-site/subdomain URLs, otherwise returns the original URL.
187#[inline]
188pub fn rel_for_ignore_script<'a>(main_host_or_base: &str, url: &'a str) -> Cow<'a, str> {
189    if url.starts_with('/') {
190        return Cow::Borrowed(url);
191    }
192
193    let base = base_domain_from_host(main_host_or_base.trim_end_matches('.'));
194    let base = base.trim_end_matches('.');
195    if base.is_empty() {
196        return Cow::Borrowed(url);
197    }
198
199    let brand = first_label(base);
200
201    if let Some((host, rest)) = host_and_rest(url) {
202        if host_is_subdomain_of(host, base) || host_contains_label_icase(host, brand) {
203            if rest.starts_with('/') {
204                return Cow::Borrowed(rest);
205            }
206            return Cow::Borrowed("/");
207        }
208    }
209
210    Cow::Borrowed(url)
211}
212
/// Reports whether `sld` is one of the second-level labels commonly placed
/// directly under a country-code TLD (`co` in `co.uk`, `com` in `com.au`, ...).
/// The comparison ignores ASCII case.
#[inline]
fn is_common_cc_sld(sld: &str) -> bool {
    const COMMON_CC_SLDS: &[&str] = &[
        // two letters
        "co", "ac", "go", "or", "ne", "ed", "gr", "lg", "ad",
        // three letters: globally common
        "com", "net", "org", "gov", "edu", "mil", "nic", "sch",
        // three letters: MX / some LATAM
        "gob",
        // four letters (seen in some places)
        "gouv",
    ];

    COMMON_CC_SLDS
        .iter()
        .any(|known| sld.eq_ignore_ascii_case(known))
}
260
261#[inline]
262/// Get the base “site” domain from a host.
263///
264/// - Normal sites: `staging.mainr.com` -> `mainr.com`
265/// - ccTLD-ish: `a.b.example.co.uk` -> `example.co.uk` (existing heuristic)
266/// - Multi-tenant SaaS: `mainr.chilipiper.com` -> `mainr.chilipiper.com`
267///   (keeps one extra label when it looks like a tenant, not `www`/`cdn`/etc.)
268pub fn base_domain_from_host(host: &str) -> &str {
269    let mut h = host.trim_end_matches('.');
270    if let Some(x) = h.strip_prefix("www.") {
271        h = x;
272    }
273    if let Some(x) = h.strip_prefix("m.") {
274        h = x;
275    }
276
277    // Find last two dots using SIMD-accelerated reverse search.
278    let hb = h.as_bytes();
279    let last_dot = match memrchr(b'.', hb) {
280        Some(p) => p,
281        None => return h,
282    };
283    let prev_dot = match memrchr(b'.', &hb[..last_dot]) {
284        Some(p) => p,
285        None => return h, // only 1 dot
286    };
287
288    let tld = &h[last_dot + 1..];
289    let sld = &h[prev_dot + 1..last_dot];
290
291    let mut base = &h[prev_dot + 1..]; // "example.com" or "co.uk"
292
293    if tld.len() == 2 && is_common_cc_sld(sld) {
294        if let Some(prev2_dot) = memrchr(b'.', &hb[..prev_dot]) {
295            base = &h[prev2_dot + 1..]; // "example.co.uk"
296        }
297    }
298
299    if h.len() > base.len() + 1 {
300        let base_start = h.len() - base.len();
301        let boundary = base_start - 1;
302        if hb.get(boundary) == Some(&b'.') {
303            let left_part = &h[..boundary];
304            // label immediately to the left of base
305            let (lbl_start, lbl) = match memrchr(b'.', left_part.as_bytes()) {
306                Some(p) => (p + 1, &left_part[p + 1..]),
307                None => (0, left_part),
308            };
309
310            if !lbl.is_empty() && !is_common_subdomain_label(lbl) {
311                // return "tenant.base" => slice starting at lbl_start
312                return &h[lbl_start..];
313            }
314        }
315    }
316
317    base
318}
319
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `host_and_rest(url)` yields exactly `(host, rest)`.
    fn assert_split(url: &str, host: &str, rest: &str) {
        assert_eq!(host_and_rest(url), Some((host, rest)), "url: {}", url);
    }

    /// Asserts only the host half of `host_and_rest(url)`.
    fn assert_host(url: &str, host: &str) {
        assert_eq!(
            host_and_rest(url).map(|(h, _)| h),
            Some(host),
            "url: {}",
            url
        );
    }

    /// Asserts the outcome of `rel_for_ignore_script(base, url)`.
    fn assert_rel(base: &str, url: &str, expected: &str) {
        assert_eq!(
            rel_for_ignore_script(base, url).as_ref(),
            expected,
            "base: {}, url: {}",
            base,
            url
        );
    }

    #[test]
    fn test_domain_match_basic_and_subdomains() {
        for host in &["mainr.com", "staging.mainr.com", "a.b.c.mainr.com"] {
            assert!(host_is_subdomain_of(host, "mainr.com"), "host: {}", host);
        }

        // ASCII case must not matter on either side.
        assert!(host_is_subdomain_of("StAgInG.mainr.CoM", "mainr.COM"));
    }

    #[test]
    fn test_domain_match_no_false_positives() {
        // Every candidate lacks a proper dot boundary in front of the base.
        for host in &[
            "evil-mainr.com",
            "mainr.com.evil.com",
            "stagingmainr.com",
            "mainr.co",
        ] {
            assert!(!host_is_subdomain_of(host, "mainr.com"), "host: {}", host);
        }
    }

    #[test]
    fn test_host_and_rest_handles_userinfo_port_ipv6() {
        assert_split(
            "https://user:pass@staging.mainr.com:8443/a.js?x=1#y",
            "staging.mainr.com",
            "/a.js?x=1#y",
        );
        assert_split("http://[::1]:8080/path", "::1", "/path");
    }

    #[test]
    fn test_rel_for_ignore_script_mainr_example() {
        let base = "mainr.com";

        assert_rel(base, "https://mainr.com/careers", "/careers");
        assert_rel(base, "https://staging.mainr.com/mainr.min.js", "/mainr.min.js");

        // A different site keeps its absolute form.
        assert_rel(base, "https://cdn.other.com/app.js", "https://cdn.other.com/app.js");

        // A root-relative path is passed through untouched.
        assert_rel(base, "/static/app.js", "/static/app.js");
    }

    #[test]
    fn test_rel_for_ignore_script_query_only_same_site() {
        assert_rel("example.com", "https://sub.example.com?x=1", "/");
    }

    #[test]
    fn test_rel_for_ignore_script_special_schemes() {
        assert_rel(
            "example.com",
            "blob:https://example.com/path/to/blob",
            "/path/to/blob",
        );
    }

    #[test]
    fn test_base_domain_tenant_subdomain() {
        let base = base_domain_from_host("mainr.chilipiper.com");
        assert_eq!(base, "mainr.chilipiper.com");

        // A subdomain of the same tenant becomes relative.
        assert_rel(base, "https://assets.mainr.chilipiper.com/a.js", "/a.js");

        // Another tenant on the same vendor must NOT match.
        let other = "https://othertenant.chilipiper.com/a.js";
        assert_rel(base, other, other);
    }

    #[test]
    fn test_brand_label_allows_vendor_subdomain() {
        let base = "mainr.com";

        assert_rel(
            base,
            "https://mainr.chilipiper.com/concierge-js/cjs/concierge.js",
            "/concierge-js/cjs/concierge.js",
        );

        // Important: the brand is matched as a whole label, not as a substring.
        let bad = "https://evil-mainr.com/x.js";
        assert_rel(base, bad, bad);
    }

    #[test]
    fn test_allows_vendor_host_when_brand_label_matches_main_site() {
        // The main page host is www.mainr.com rather than a precomputed base.
        assert_rel(
            "www.mainr.com",
            "https://mainr.chilipiper.com/concierge-js/cjs/concierge.js",
            "/concierge-js/cjs/concierge.js",
        );
    }

    // --- Additional edge-case tests for SIMD-accelerated paths ---

    #[test]
    fn test_host_and_rest_edge_cases() {
        let full_splits: &[(&str, &str, &str)] = &[
            // Protocol-relative URL
            ("//example.com/path", "example.com", "/path"),
            // No path, query, or fragment
            ("https://example.com", "example.com", ""),
            // Query only (no path)
            ("https://example.com?q=1", "example.com", "?q=1"),
            // Fragment only (no path)
            ("https://example.com#frag", "example.com", "#frag"),
            // Port only, no path
            ("https://example.com:8080", "example.com", ""),
            // IPv6 without port
            ("http://[::1]/path", "::1", "/path"),
        ];
        for &(url, host, rest) in full_splits {
            assert_split(url, host, rest);
        }

        // blob: + filesystem: schemes
        assert_host("filesystem:https://example.com/path", "example.com");
        // Userinfo with port
        assert_host("https://user@example.com:443/x", "example.com");

        // No scheme, empty input and empty authority all yield None.
        for url in &["example.com/path", "", "http:///path"] {
            assert!(host_and_rest(url).is_none(), "url: {:?}", url);
        }
    }

    #[test]
    fn test_host_contains_label_icase_edge_cases() {
        let matching: &[(&str, &str)] = &[
            // Basic match
            ("www.example.com", "example"),
            ("www.example.com", "EXAMPLE"),
            ("www.example.com", "www"),
            ("www.example.com", "com"),
            // Exact single-label host
            ("localhost", "localhost"),
            ("LOCALHOST", "localhost"),
            // Trailing dots
            ("example.com.", "com"),
            ("example.com.", "example"),
        ];
        for &(host, label) in matching {
            assert!(
                host_contains_label_icase(host, label),
                "host: {:?}, label: {:?}",
                host,
                label
            );
        }

        let not_matching: &[(&str, &str)] = &[
            // No partial matches
            ("www.example.com", "exam"),
            ("www.example.com", "ample"),
            // Empty inputs
            ("", "example"),
            ("example.com", ""),
        ];
        for &(host, label) in not_matching {
            assert!(
                !host_contains_label_icase(host, label),
                "host: {:?}, label: {:?}",
                host,
                label
            );
        }
    }

    #[test]
    fn test_first_label_edge_cases() {
        let cases: &[(&str, &str)] = &[
            ("www.example.com", "www"),
            ("example.com", "example"),
            ("localhost", "localhost"),
            ("example.com.", "example"),
        ];
        for &(host, expected) in cases {
            assert_eq!(first_label(host), expected, "host: {}", host);
        }
    }

    #[test]
    fn test_base_domain_from_host_edge_cases() {
        let cases: &[(&str, &str)] = &[
            // Simple two-label
            ("example.com", "example.com"),
            // Strip www/m
            ("www.example.com", "example.com"),
            ("m.example.com", "example.com"),
            // ccTLD
            ("example.co.uk", "example.co.uk"),
            ("www.example.co.uk", "example.co.uk"),
            // Single label
            ("localhost", "localhost"),
            // Trailing dot
            ("example.com.", "example.com"),
        ];
        for &(host, expected) in cases {
            assert_eq!(base_domain_from_host(host), expected, "host: {}", host);
        }
    }

    #[test]
    fn test_host_is_subdomain_of_edge_cases() {
        // Trailing dots
        assert!(host_is_subdomain_of("example.com.", "example.com."));
        assert!(host_is_subdomain_of("sub.example.com.", "example.com."));

        // Exact match
        assert!(host_is_subdomain_of("example.com", "example.com"));

        // Empty base
        assert!(!host_is_subdomain_of("example.com", ""));

        // Shorter host than base
        assert!(!host_is_subdomain_of("com", "example.com"));
    }
}