Skip to main content

chromiumoxide/handler/
network_utils.rs

1use std::borrow::Cow;
2
/// Removes a leading `blob:` wrapper and then a leading `filesystem:` wrapper,
/// if present, so that the inner URL can be parsed.
#[inline]
fn strip_special_schemes(url: &str) -> &str {
    let url = url.strip_prefix("blob:").unwrap_or(url);
    url.strip_prefix("filesystem:").unwrap_or(url)
}

/// Splits an absolute or protocol-relative URL into `(host, rest)`.
///
/// * `host` is the authority without userinfo and without port; for IPv6
///   literals the surrounding brackets are removed (`[::1]:8080` -> `::1`).
/// * `rest` is everything from the first `/`, `?` or `#` after the authority
///   to the end of the input (empty when there is none).
///
/// Returns `None` when the input is neither `scheme://...` nor `//...`, when
/// the authority or the host is empty, or when an IPv6 literal has no closing
/// bracket.
///
/// The `://` separator is only honoured when nothing before it is a path,
/// query or fragment delimiter. This keeps a URL embedded in a query string
/// (for example `//a.com/p?u=http://b.com/`) from being mistaken for the
/// scheme of the outer URL.
#[inline]
pub fn host_and_rest(url: &str) -> Option<(&str, &str)> {
    /// Characters that terminate the authority component.
    fn is_delim(c: char) -> bool {
        matches!(c, '/' | '?' | '#')
    }

    let url = strip_special_schemes(url);

    // Locate the start of the authority. Protocol-relative input is checked
    // first so that a later "://" in its path/query cannot take precedence.
    let host_start = if url.starts_with("//") {
        2
    } else {
        let pos = url.find("://")?;
        if url[..pos].contains(is_delim) {
            // The "://" belongs to the path/query/fragment of a relative
            // reference, not to a scheme.
            return None;
        }
        pos + 3
    };

    // End of authority: first '/', '?' or '#' after host_start.
    let rest_start = url[host_start..]
        .find(is_delim)
        .map_or(url.len(), |i| host_start + i);

    let authority = &url[host_start..rest_start];
    if authority.is_empty() {
        return None;
    }

    // Drop userinfo if present: user:pass@host
    let authority = authority.rsplit('@').next().unwrap_or(authority);

    // IPv6 literal: [::1]:8080
    if authority.starts_with('[') {
        let close = authority.find(']')?;
        let host = &authority[1..close];
        if host.is_empty() {
            return None;
        }
        return Some((host, &url[rest_start..]));
    }

    // IPv4/hostname: host:port
    let host_end = authority.find(':').unwrap_or(authority.len());
    let host = &authority[..host_end];
    if host.is_empty() {
        return None;
    }

    Some((host, &url[rest_start..]))
}
59
/// Returns `true` when `a` and `b` have the same length and are equal once
/// ASCII letters are compared without regard to case.
#[inline]
fn eq_ignore_ascii_case(a: &str, b: &str) -> bool {
    a.as_bytes().eq_ignore_ascii_case(b.as_bytes())
}
64
/// Returns `true` when `hay` ends with `suf`, comparing ASCII letters without
/// regard to case. An empty `suf` always matches.
///
/// The comparison is done on bytes rather than on a `&str` slice: the offset
/// `hay.len() - suf.len()` is not guaranteed to be a character boundary when
/// `hay` contains multi-byte characters, and slicing a `&str` there would
/// panic.
#[inline]
pub fn ends_with_ignore_ascii_case(hay: &str, suf: &str) -> bool {
    let hay = hay.as_bytes();
    let suf = suf.as_bytes();
    match hay.len().checked_sub(suf.len()) {
        Some(start) => hay[start..].eq_ignore_ascii_case(suf),
        // The suffix is longer than the haystack.
        None => false,
    }
}
72
73#[inline]
74pub fn base_domain_from_any(s: &str) -> &str {
75    if let Some((h, _)) = host_and_rest(s) {
76        base_domain_from_host(h)
77    } else {
78        base_domain_from_host(s)
79    }
80}
81
/// Returns the left-most dot-separated label of `host`, ignoring any trailing
/// dots. A host without dots is returned whole (minus trailing dots).
#[inline]
pub fn first_label(host: &str) -> &str {
    let trimmed = host.trim_end_matches('.');
    // `split` always yields at least one item, so the fallback is never hit.
    trimmed.split('.').next().unwrap_or(trimmed)
}
90
/// Returns `true` when one of the dot-separated labels of `host` equals
/// `label`, comparing ASCII letters without regard to case.
///
/// Trailing dots on `host` and leading/trailing dots on `label` are ignored.
/// Empty labels (from consecutive dots) are skipped. The match is on whole
/// labels only, never on substrings, and an empty `host` or `label` never
/// matches.
#[inline]
pub fn host_contains_label_icase(host: &str, label: &str) -> bool {
    let host = host.trim_end_matches('.');
    let wanted = label.trim_matches('.');

    if host.is_empty() || wanted.is_empty() {
        return false;
    }

    host.split('.')
        .filter(|segment| !segment.is_empty())
        .any(|segment| segment.eq_ignore_ascii_case(wanted))
}
125
126/// Host matches base if host == base OR host ends with ".{base}" (case-insensitive),
127/// with a required dot boundary to prevent "evil-mainr.com" matching "mainr.com".
128#[inline]
129pub fn host_is_subdomain_of(host: &str, base: &str) -> bool {
130    let host = host.trim_end_matches('.');
131    let base = base.trim_end_matches('.');
132
133    if base.is_empty() {
134        return false;
135    }
136
137    if eq_ignore_ascii_case(host, base) {
138        return true;
139    }
140
141    if host.len() <= base.len() {
142        return false;
143    }
144
145    let dot_pos = host.len() - base.len() - 1;
146    host.as_bytes().get(dot_pos) == Some(&b'.') && ends_with_ignore_ascii_case(host, base)
147}
148
149/// Common subdomain labels.
150static COMMON_SUBDOMAIN_LABELS: phf::Set<&'static str> = phf::phf_set! {
151    "www","m","amp","api","cdn","static","assets","img","images","media","files",
152    "login","auth","sso","id","account","accounts",
153    "app","apps","dashboard","admin","portal","console",
154    "status","support","help","docs","blog",
155    "dev","staging","stage","test","qa","uat","beta","alpha","preview","demo","sandbox",
156    "uploads","download","storage","origin","edge","cache",
157    "mail","email","smtp","mx","webmail",
158    "graphql","rpc","ws",
159};
160
161#[inline]
162/// Common sub domains.
163fn is_common_subdomain_label(lbl: &str) -> bool {
164    if lbl.is_empty() {
165        return false;
166    }
167    let lower = lbl.to_ascii_lowercase(); // alloc
168    COMMON_SUBDOMAIN_LABELS.contains(lower.as_str())
169}
170
171#[inline]
172pub fn base_domain_from_url(main_url: &str) -> Option<&str> {
173    let (host, _) = host_and_rest(main_url)?;
174    Some(base_domain_from_host(host))
175}
176
177/// Given a base domain (already computed) and a URL, returns the “relative” path
178/// for same-site/subdomain URLs, otherwise returns the original URL.
179#[inline]
180pub fn rel_for_ignore_script<'a>(main_host_or_base: &str, url: &'a str) -> Cow<'a, str> {
181    if url.starts_with('/') {
182        return Cow::Borrowed(url);
183    }
184
185    let base = base_domain_from_host(main_host_or_base.trim_end_matches('.'));
186    let base = base.trim_end_matches('.');
187    if base.is_empty() {
188        return Cow::Borrowed(url);
189    }
190
191    let brand = first_label(base);
192
193    if let Some((host, rest)) = host_and_rest(url) {
194        if host_is_subdomain_of(host, base) || host_contains_label_icase(host, brand) {
195            if rest.starts_with('/') {
196                return Cow::Borrowed(rest);
197            }
198            return Cow::Borrowed("/");
199        }
200    }
201
202    Cow::Borrowed(url)
203}
204
/// Returns `true` when `sld` is one of a fixed list of second-level labels
/// (`co`, `com`, `gov`, `gouv`, ...), compared without regard to ASCII case.
#[inline]
fn is_common_cc_sld(sld: &str) -> bool {
    const COMMON_SLDS: &[&str] = &[
        // two letters
        "co", "ac", "go", "or", "ne", "ed", "gr", "lg", "ad",
        // three letters, globally common
        "com", "net", "org", "gov", "edu", "mil", "nic", "sch",
        // MX / some LATAM
        "gob",
        // four letters
        "gouv",
    ];

    COMMON_SLDS
        .iter()
        .any(|known| sld.eq_ignore_ascii_case(known))
}
252
253#[inline]
254/// Get the base “site” domain from a host.
255///
256/// - Normal sites: `staging.mainr.com` -> `mainr.com`
257/// - ccTLD-ish: `a.b.example.co.uk` -> `example.co.uk` (existing heuristic)
258/// - Multi-tenant SaaS: `mainr.chilipiper.com` -> `mainr.chilipiper.com`
259///   (keeps one extra label when it looks like a tenant, not `www`/`cdn`/etc.)
260pub fn base_domain_from_host(host: &str) -> &str {
261    let mut h = host.trim_end_matches('.');
262    if let Some(x) = h.strip_prefix("www.") {
263        h = x;
264    }
265    if let Some(x) = h.strip_prefix("m.") {
266        h = x;
267    }
268
269    // Find last two dots
270    let last_dot = match h.rfind('.') {
271        Some(p) => p,
272        None => return h,
273    };
274    let prev_dot = match h[..last_dot].rfind('.') {
275        Some(p) => p,
276        None => return h, // only 1 dot
277    };
278
279    let tld = &h[last_dot + 1..];
280    let sld = &h[prev_dot + 1..last_dot];
281
282    let mut base = &h[prev_dot + 1..]; // "example.com" or "co.uk"
283
284    if tld.len() == 2 && is_common_cc_sld(sld) {
285        if let Some(prev2_dot) = h[..prev_dot].rfind('.') {
286            base = &h[prev2_dot + 1..]; // "example.co.uk"
287        }
288    }
289
290    if h.len() > base.len() + 1 {
291        let base_start = h.len() - base.len();
292        let boundary = base_start - 1;
293        if h.as_bytes().get(boundary) == Some(&b'.') {
294            let left_part = &h[..boundary];
295            // label immediately to the left of base
296            let (lbl_start, lbl) = match left_part.rfind('.') {
297                Some(p) => (p + 1, &left_part[p + 1..]),
298                None => (0, left_part),
299            };
300
301            if !lbl.is_empty() && !is_common_subdomain_label(lbl) {
302                // return "tenant.base" => slice starting at lbl_start
303                return &h[lbl_start..];
304            }
305        }
306    }
307
308    base
309}
310
#[cfg(test)]
mod tests {
    use super::*;

    /// Site used by most of the cases below.
    const MAINR: &str = "mainr.com";

    #[test]
    fn test_domain_match_basic_and_subdomains() {
        let hosts = ["mainr.com", "staging.mainr.com", "a.b.c.mainr.com"];
        for &host in hosts.iter() {
            assert!(host_is_subdomain_of(host, MAINR), "{} should match", host);
        }

        // Matching ignores ASCII case on both sides.
        assert!(host_is_subdomain_of("StAgInG.mainr.CoM", "mainr.COM"));
    }

    #[test]
    fn test_domain_match_no_false_positives() {
        // Each of these only resembles the base; none sits on a dot boundary.
        let impostors = [
            "evil-mainr.com",
            "mainr.com.evil.com",
            "stagingmainr.com",
            "mainr.co",
        ];
        for &host in impostors.iter() {
            assert!(!host_is_subdomain_of(host, MAINR), "{} must not match", host);
        }
    }

    #[test]
    fn test_host_and_rest_handles_userinfo_port_ipv6() {
        let cases = [
            (
                "https://user:pass@staging.mainr.com:8443/a.js?x=1#y",
                "staging.mainr.com",
                "/a.js?x=1#y",
            ),
            ("http://[::1]:8080/path", "::1", "/path"),
        ];
        for &(url, want_host, want_rest) in cases.iter() {
            let (host, rest) = host_and_rest(url).unwrap();
            assert_eq!(host, want_host);
            assert_eq!(rest, want_rest);
        }
    }

    #[test]
    fn test_rel_for_ignore_script_mainr_example() {
        let cases = [
            // the main page itself
            ("https://mainr.com/careers", "/careers"),
            // a script on a subdomain
            ("https://staging.mainr.com/mainr.min.js", "/mainr.min.js"),
            // a different site stays absolute
            ("https://cdn.other.com/app.js", "https://cdn.other.com/app.js"),
            // root-relative input stays as it is
            ("/static/app.js", "/static/app.js"),
        ];
        for &(url, want) in cases.iter() {
            assert_eq!(&*rel_for_ignore_script(MAINR, url), want);
        }
    }

    #[test]
    fn test_rel_for_ignore_script_query_only_same_site() {
        let got = rel_for_ignore_script("example.com", "https://sub.example.com?x=1");
        assert_eq!(&*got, "/");
    }

    #[test]
    fn test_rel_for_ignore_script_special_schemes() {
        let got = rel_for_ignore_script("example.com", "blob:https://example.com/path/to/blob");
        assert_eq!(&*got, "/path/to/blob");
    }

    #[test]
    fn test_base_domain_tenant_subdomain() {
        let base = base_domain_from_host("mainr.chilipiper.com");
        assert_eq!(base, "mainr.chilipiper.com");

        // A subdomain of the same tenant becomes relative.
        let same_tenant = "https://assets.mainr.chilipiper.com/a.js";
        assert_eq!(&*rel_for_ignore_script(base, same_tenant), "/a.js");

        // Another tenant on the same vendor must NOT match.
        let other_tenant = "https://othertenant.chilipiper.com/a.js";
        assert_eq!(&*rel_for_ignore_script(base, other_tenant), other_tenant);
    }

    #[test]
    fn test_brand_label_allows_vendor_subdomain() {
        let vendor = "https://mainr.chilipiper.com/concierge-js/cjs/concierge.js";
        assert_eq!(
            &*rel_for_ignore_script(MAINR, vendor),
            "/concierge-js/cjs/concierge.js"
        );

        // Important: the brand must be a whole label, not a substring.
        let lookalike = "https://evil-mainr.com/x.js";
        assert_eq!(&*rel_for_ignore_script(MAINR, lookalike), lookalike);
    }

    #[test]
    fn test_allows_vendor_host_when_brand_label_matches_main_site() {
        // Here the main page host is www.mainr.com rather than a base domain.
        let vendor = "https://mainr.chilipiper.com/concierge-js/cjs/concierge.js";
        assert_eq!(
            &*rel_for_ignore_script("www.mainr.com", vendor),
            "/concierge-js/cjs/concierge.js"
        );
    }
}