chromiumoxide/handler/
network_utils.rs

1use std::borrow::Cow;
2
/// Removes a leading `blob:` wrapper and then a leading `filesystem:` wrapper
/// from `url` (in that order, each at most once). Input without either prefix
/// is returned unchanged.
#[inline]
fn strip_special_schemes(url: &str) -> &str {
    // Order matters: `blob:` is peeled first, `filesystem:` second.
    ["blob:", "filesystem:"]
        .iter()
        .fold(url, |rest, scheme| rest.strip_prefix(*scheme).unwrap_or(rest))
}
8
9/// Returns (host_without_port, rest_starting_at_/ ? # or empty)
10/// Robust: handles protocol-relative, userinfo, IPv6 literals, ports.
11#[inline]
12pub fn host_and_rest(url: &str) -> Option<(&str, &str)> {
13    let url = strip_special_schemes(url);
14
15    let host_start = if let Some(pos) = url.find("://") {
16        pos + 3
17    } else if url.starts_with("//") {
18        2
19    } else {
20        return None;
21    };
22
23    // End of authority (first / ? # after host_start)
24    let mut rest_start = url.len();
25    if let Some(i) = url[host_start..].find('/') {
26        rest_start = host_start + i;
27    }
28    if let Some(i) = url[host_start..].find('?') {
29        rest_start = rest_start.min(host_start + i);
30    }
31    if let Some(i) = url[host_start..].find('#') {
32        rest_start = rest_start.min(host_start + i);
33    }
34
35    let authority = &url[host_start..rest_start];
36    if authority.is_empty() {
37        return None;
38    }
39
40    // Drop userinfo if present: user:pass@host
41    let authority = authority.rsplit('@').next().unwrap_or(authority);
42
43    // IPv6: [::1]:8080
44    if authority.as_bytes().first() == Some(&b'[') {
45        let close = authority.find(']')?;
46        let host = &authority[1..close];
47        return Some((host, &url[rest_start..]));
48    }
49
50    // IPv4/hostname: host:port
51    let host_end = authority.find(':').unwrap_or(authority.len());
52    let host = &authority[..host_end];
53    if host.is_empty() {
54        return None;
55    }
56
57    Some((host, &url[rest_start..]))
58}
59
/// Returns `true` when `a` and `b` are equal, treating ASCII letters
/// case-insensitively. Non-ASCII bytes must match exactly.
#[inline]
fn eq_ignore_ascii_case(a: &str, b: &str) -> bool {
    // The standard library already provides this exact length + per-byte
    // comparison; the method call resolves to `str::eq_ignore_ascii_case`.
    a.eq_ignore_ascii_case(b)
}
68
/// ASCII case-insensitive suffix test: does `hay` end with `suf`?
///
/// The comparison works on raw bytes, so multi-byte UTF-8 input cannot cause
/// a slicing panic. An empty `suf` always matches.
#[inline]
pub fn ends_with_ignore_ascii_case(hay: &str, suf: &str) -> bool {
    let (hay, suf) = (hay.as_bytes(), suf.as_bytes());
    match hay.len().checked_sub(suf.len()) {
        // `offset` is where a suffix of `suf.len()` bytes would begin.
        Some(offset) => hay[offset..].eq_ignore_ascii_case(suf),
        // Suffix longer than the haystack.
        None => false,
    }
}
80
81#[inline]
82pub fn base_domain_from_any(s: &str) -> &str {
83    if let Some((h, _)) = host_and_rest(s) {
84        base_domain_from_host(h)
85    } else {
86        base_domain_from_host(s)
87    }
88}
89
/// Returns the left-most dot-separated label of `host`, ignoring any
/// trailing dots (`"mainr.com."` -> `"mainr"`). A host without dots is
/// returned whole.
#[inline]
pub fn first_label(host: &str) -> &str {
    let trimmed = host.trim_end_matches('.');
    // `split` always yields at least one item, so the fallback is never hit.
    trimmed.split('.').next().unwrap_or(trimmed)
}
98
/// Returns `true` when one of the dot-separated labels of `host` equals
/// `label`, compared ASCII case-insensitively.
///
/// Trailing dots on `host` and leading/trailing dots on `label` are ignored.
/// Only whole labels match (`"evil-mainr.com"` does not contain `"mainr"`),
/// and an empty host or label never matches.
#[inline]
pub fn host_contains_label_icase(host: &str, label: &str) -> bool {
    let wanted = label.trim_matches('.');
    if wanted.is_empty() {
        return false;
    }

    // Empty segments (from an empty host or consecutive dots) can never equal
    // the non-empty `wanted`, so they need no special handling.
    host.trim_end_matches('.')
        .split('.')
        .any(|segment| segment.eq_ignore_ascii_case(wanted))
}
142
143/// Host matches base if host == base OR host ends with ".{base}" (case-insensitive),
144/// with a required dot boundary to prevent "evil-mainr.com" matching "mainr.com".
145#[inline]
146pub fn host_is_subdomain_of(host: &str, base: &str) -> bool {
147    let host = host.trim_end_matches('.');
148    let base = base.trim_end_matches('.');
149
150    if base.is_empty() {
151        return false;
152    }
153
154    if eq_ignore_ascii_case(host, base) {
155        return true;
156    }
157
158    if host.len() <= base.len() {
159        return false;
160    }
161
162    let dot_pos = host.len() - base.len() - 1;
163    host.as_bytes().get(dot_pos) == Some(&b'.') && ends_with_ignore_ascii_case(host, base)
164}
165
166/// Common subdomain labels.
167static COMMON_SUBDOMAIN_LABELS: phf::Set<&'static str> = phf::phf_set! {
168    "www","m","amp","api","cdn","static","assets","img","images","media","files",
169    "login","auth","sso","id","account","accounts",
170    "app","apps","dashboard","admin","portal","console",
171    "status","support","help","docs","blog",
172    "dev","staging","stage","test","qa","uat","beta","alpha","preview","demo","sandbox",
173    "uploads","download","storage","origin","edge","cache",
174    "mail","email","smtp","mx","webmail",
175    "graphql","rpc","ws",
176};
177
178#[inline]
179/// Common sub domains.
180fn is_common_subdomain_label(lbl: &str) -> bool {
181    if lbl.is_empty() {
182        return false;
183    }
184    let lower = lbl.to_ascii_lowercase(); // alloc
185    COMMON_SUBDOMAIN_LABELS.contains(lower.as_str())
186}
187
188#[inline]
189pub fn base_domain_from_url<'a>(main_url: &'a str) -> Option<&'a str> {
190    let (host, _) = host_and_rest(main_url)?;
191    Some(base_domain_from_host(host))
192}
193
194/// Given a base domain (already computed) and a URL, returns the “relative” path
195/// for same-site/subdomain URLs, otherwise returns the original URL.
196#[inline]
197pub fn rel_for_ignore_script<'a>(main_host_or_base: &str, url: &'a str) -> Cow<'a, str> {
198    if url.starts_with('/') {
199        return Cow::Borrowed(url);
200    }
201
202    let base = base_domain_from_host(main_host_or_base.trim_end_matches('.'));
203    let base = base.trim_end_matches('.');
204    if base.is_empty() {
205        return Cow::Borrowed(url);
206    }
207
208    let brand = first_label(base);
209
210    if let Some((host, rest)) = host_and_rest(url) {
211        if host_is_subdomain_of(host, base) || host_contains_label_icase(host, brand) {
212            if rest.starts_with('/') {
213                return Cow::Borrowed(rest);
214            }
215            return Cow::Borrowed("/");
216        }
217    }
218
219    Cow::Borrowed(url)
220}
221
/// Returns `true` when `sld` is one of a fixed list of second-level labels
/// such as `co`, `com` or `gouv`.
///
/// Matching is ASCII case-insensitive and exact: the whole of `sld` must
/// equal a listed label.
#[inline]
fn is_common_cc_sld(sld: &str) -> bool {
    // Dispatch on byte length first so only same-length candidates are tried.
    let candidates: &[&str] = match sld.len() {
        2 => &["co", "ac", "go", "or", "ne", "ed", "gr", "lg", "ad"],
        // globally common, plus `gob` (MX / some LATAM)
        3 => &["com", "net", "org", "gov", "edu", "mil", "nic", "sch", "gob"],
        // `gouv` (seen in some places)
        4 => &["gouv"],
        _ => return false,
    };
    candidates
        .iter()
        .any(|known| sld.eq_ignore_ascii_case(known))
}
269
270#[inline]
271/// Get the base “site” domain from a host.
272///
273/// - Normal sites: `staging.mainr.com` -> `mainr.com`
274/// - ccTLD-ish: `a.b.example.co.uk` -> `example.co.uk` (existing heuristic)
275/// - Multi-tenant SaaS: `mainr.chilipiper.com` -> `mainr.chilipiper.com`
276///   (keeps one extra label when it looks like a tenant, not `www`/`cdn`/etc.)
277pub fn base_domain_from_host(host: &str) -> &str {
278    let mut h = host.trim_end_matches('.');
279    if let Some(x) = h.strip_prefix("www.") {
280        h = x;
281    }
282    if let Some(x) = h.strip_prefix("m.") {
283        h = x;
284    }
285
286    // Find last two dots
287    let last_dot = match h.rfind('.') {
288        Some(p) => p,
289        None => return h,
290    };
291    let prev_dot = match h[..last_dot].rfind('.') {
292        Some(p) => p,
293        None => return h, // only 1 dot
294    };
295
296    let tld = &h[last_dot + 1..];
297    let sld = &h[prev_dot + 1..last_dot];
298
299    let mut base = &h[prev_dot + 1..]; // "example.com" or "co.uk"
300
301    if tld.len() == 2 && is_common_cc_sld(sld) {
302        if let Some(prev2_dot) = h[..prev_dot].rfind('.') {
303            base = &h[prev2_dot + 1..]; // "example.co.uk"
304        }
305    }
306
307    if h.len() > base.len() + 1 {
308        let base_start = h.len() - base.len();
309        let boundary = base_start - 1;
310        if h.as_bytes().get(boundary) == Some(&b'.') {
311            let left_part = &h[..boundary];
312            // label immediately to the left of base
313            let (lbl_start, lbl) = match left_part.rfind('.') {
314                Some(p) => (p + 1, &left_part[p + 1..]),
315                None => (0, left_part),
316            };
317
318            if !lbl.is_empty() && !is_common_subdomain_label(lbl) {
319                // return "tenant.base" => slice starting at lbl_start
320                return &h[lbl_start..];
321            }
322        }
323    }
324
325    base
326}
327
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `rel_for_ignore_script(base, url)` yields `expected` for
    /// every `(url, expected)` pair.
    fn assert_rel(base: &str, cases: &[(&str, &str)]) {
        for &(url, expected) in cases {
            assert_eq!(
                rel_for_ignore_script(base, url).as_ref(),
                expected,
                "base={:?} url={:?}",
                base,
                url
            );
        }
    }

    #[test]
    fn test_domain_match_basic_and_subdomains() {
        for &host in &["mainr.com", "staging.mainr.com", "a.b.c.mainr.com"] {
            assert!(host_is_subdomain_of(host, "mainr.com"), "host={:?}", host);
        }

        // case-insensitive
        assert!(host_is_subdomain_of("StAgInG.mainr.CoM", "mainr.COM"));
    }

    #[test]
    fn test_domain_match_no_false_positives() {
        // must be dot-boundary
        let rejected = [
            "evil-mainr.com",
            "mainr.com.evil.com",
            "stagingmainr.com",
            "mainr.co",
        ];
        for &host in &rejected {
            assert!(!host_is_subdomain_of(host, "mainr.com"), "host={:?}", host);
        }
    }

    #[test]
    fn test_host_and_rest_handles_userinfo_port_ipv6() {
        assert_eq!(
            host_and_rest("https://user:pass@staging.mainr.com:8443/a.js?x=1#y"),
            Some(("staging.mainr.com", "/a.js?x=1#y"))
        );
        assert_eq!(
            host_and_rest("http://[::1]:8080/path"),
            Some(("::1", "/path"))
        );
    }

    #[test]
    fn test_rel_for_ignore_script_mainr_example() {
        assert_rel(
            "mainr.com",
            &[
                ("https://mainr.com/careers", "/careers"),
                ("https://staging.mainr.com/mainr.min.js", "/mainr.min.js"),
                // Different site stays absolute
                ("https://cdn.other.com/app.js", "https://cdn.other.com/app.js"),
                // Root-relative stays as-is
                ("/static/app.js", "/static/app.js"),
            ],
        );
    }

    #[test]
    fn test_rel_for_ignore_script_query_only_same_site() {
        assert_rel("example.com", &[("https://sub.example.com?x=1", "/")]);
    }

    #[test]
    fn test_rel_for_ignore_script_special_schemes() {
        assert_rel(
            "example.com",
            &[("blob:https://example.com/path/to/blob", "/path/to/blob")],
        );
    }

    #[test]
    fn test_base_domain_tenant_subdomain() {
        let base = base_domain_from_host("mainr.chilipiper.com");
        assert_eq!(base, "mainr.chilipiper.com");

        assert_rel(
            base,
            &[
                // same tenant (subdomain) becomes relative
                ("https://assets.mainr.chilipiper.com/a.js", "/a.js"),
                // different tenant must NOT match
                (
                    "https://othertenant.chilipiper.com/a.js",
                    "https://othertenant.chilipiper.com/a.js",
                ),
            ],
        );
    }

    #[test]
    fn test_brand_label_allows_vendor_subdomain() {
        assert_rel(
            "mainr.com",
            &[
                (
                    "https://mainr.chilipiper.com/concierge-js/cjs/concierge.js",
                    "/concierge-js/cjs/concierge.js",
                ),
                // Important: not a substring match
                ("https://evil-mainr.com/x.js", "https://evil-mainr.com/x.js"),
            ],
        );
    }

    #[test]
    fn test_allows_vendor_host_when_brand_label_matches_main_site() {
        // main page host is www.mainr.com
        assert_rel(
            "www.mainr.com",
            &[(
                "https://mainr.chilipiper.com/concierge-js/cjs/concierge.js",
                "/concierge-js/cjs/concierge.js",
            )],
        );
    }
}