// chromiumoxide/handler/network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18    SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21    fetch::{
22        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24    },
25    network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
42lazy_static! {
43    /// General patterns for popular libraries and resources
44    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
45        "jquery",           // Covers jquery.min.js, jquery.js, etc.
46        "angular",
47        "react",            // Covers all React-related patterns
48        "vue",              // Covers all Vue-related patterns
49        "bootstrap",
50        "d3",
51        "lodash",
52        "ajax",
53        "application",
54        "app",              // Covers general app scripts like app.js
55        "main",
56        "index",
57        "bundle",
58        "vendor",
59        "runtime",
60        "polyfill",
61        "scripts",
62        "es2015.",
63        "es2020.",
64        "webpack",
65        "captcha",
66        "client",
67        "/cdn-cgi/challenge-platform/",
68        "/wp-content/js/",  // Covers Wordpress content
69        // Verified 3rd parties for request
70        "https://m.stripe.network/",
71        "https://challenges.cloudflare.com/",
72        "https://www.google.com/recaptcha/enterprise.js",
73        "https://www.google.com/recaptcha/api.js",
74        "https://google.com/recaptcha/api.js",
75        "https://captcha.px-cloud.net/",
76        "https://cdn.auth0.com/js/lock/",
77        "https://captcha.gtimg.com",
78        "https://cdn.auth0.com/client",
79        "https://js.stripe.com/",
80        "https://cdn.prod.website-files.com/", // webflow cdn scripts
81        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
82        "https://code.jquery.com/jquery-"
83    ];
84
85    /// Determine if a script should be rendered in the browser by name.
86    ///
87    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
88    /// but we keep it for compatibility and other call sites.
89    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
90
91    /// General patterns for popular libraries and resources
92    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
93        // Verified 3rd parties for request
94        "https://m.stripe.network/",
95        "https://challenges.cloudflare.com/",
96        "https://www.google.com/recaptcha/api.js",
97        "https://google.com/recaptcha/api.js",
98        "https://www.google.com/recaptcha/enterprise.js",
99        "https://js.stripe.com/",
100        "https://cdn.prod.website-files.com/", // webflow cdn scripts
101        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
102        "https://code.jquery.com/jquery-",
103        "https://ct.captcha-delivery.com/",
104        "https://geo.captcha-delivery.com/captcha/",
105        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
106        "https://ct.captcha-delivery.com/",
107        "https://cdn.auth0.com/client",
108        "https://captcha.px-cloud.net/",
109        "https://www.gstatic.com/recaptcha/",
110        "https://www.google.com/recaptcha/api2/",
111        "https://www.recaptcha.net/recaptcha/",
112        "https://js.hcaptcha.com/1/api.js",
113        "https://hcaptcha.com/1/api.js",
114        "https://js.datadome.co/tags.js",
115        "https://api-js.datadome.co/",
116        "https://client.perimeterx.net/",
117        "https://captcha.px-cdn.net/",
118        "https://captcha.px-cloud.net/",
119        "https://s.perimeterx.net/",
120        "https://client-api.arkoselabs.com/v2/",
121        "https://static.geetest.com/v4/gt4.js",
122        "https://static.geetest.com/",
123        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
124        "https://cdn.perfdrive.com/aperture/",
125        "https://assets.queue-it.net/",
126        "discourse-cdn.com/",
127        "/cdn-cgi/challenge-platform/",
128        "/_Incapsula_Resource"
129    ];
130
131    /// Determine if a script should be rendered in the browser by name.
132    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
133
134    /// path of a js framework
135    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
136        phf::phf_set! {
137            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
138            "_astro/", "_app/immutable"
139        }
140    };
141
142    /// Ignore the content types.
143    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
144        "application/pdf",
145        "application/zip",
146        "application/x-rar-compressed",
147        "application/x-tar",
148        "image/png",
149        "image/jpeg",
150        "image/gif",
151        "image/bmp",
152        "image/webp",
153        "image/svg+xml",
154        "video/mp4",
155        "video/x-msvideo",
156        "video/x-matroska",
157        "video/webm",
158        "audio/mpeg",
159        "audio/ogg",
160        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
161        "application/vnd.ms-excel",
162        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
163        "application/vnd.ms-powerpoint",
164        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
165        "application/x-7z-compressed",
166        "application/x-rpm",
167        "application/x-shockwave-flash",
168        "application/rtf",
169    };
170
171    /// Ignore the resources for visual content types.
172    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
173        "Image",
174        "Media",
175        "Font"
176    };
177
178    /// Ignore the resources for visual content types.
179    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
180        "CspViolationReport",
181        "Manifest",
182        "Other",
183        "Prefetch",
184        "Ping",
185    };
186
187    /// Case insenstive css matching
188    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
189
190    /// The command chain.
191    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
192        let enable = EnableParams::default();
193
194        if let Ok(c) = serde_json::to_value(&enable) {
195            vec![(enable.identifier(), c)]
196        } else {
197            vec![]
198        }
199    };
200
201    /// The command chain with https ignore.
202    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
203        let enable = EnableParams::default();
204        let mut v = vec![];
205        if let Ok(c) = serde_json::to_value(&enable) {
206            v.push((enable.identifier(), c));
207        }
208        let ignore = SetIgnoreCertificateErrorsParams::new(true);
209        if let Ok(ignored) = serde_json::to_value(&ignore) {
210            v.push((ignore.identifier(), ignored));
211        }
212
213        v
214    };
215
216    /// Enable the fetch intercept command
217    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
218        fetch::EnableParams::builder()
219        .handle_auth_requests(true)
220        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
221        .build()
222    };
223}
224
/// Returns `true` when `status` is one of the HTTP redirect codes handled here:
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    matches!(status, 301 | 302 | 303 | 307 | 308)
}
229
/// Precomputed site keyword used to allow "related" 3rd party resources
/// (e.g. "shopify", "stripe", "discourse", etc) by host/path heuristics.
#[derive(Debug, Clone)]
struct SiteKeyword {
    /// Lowercase ASCII bytes of the keyword (e.g. b"shopify").
    kw_lower: Box<[u8]>,
    /// b"/" + kw_lower + b"/" (e.g. b"/shopify/").
    kw_slash: Box<[u8]>,
}

impl SiteKeyword {
    /// Derive a keyword from a base domain.
    ///
    /// - Surrounding whitespace and leading/trailing dots are stripped.
    /// - The keyword is the first dot-separated label: "example.com" -> "example".
    /// - Returns `None` for empty input or a keyword shorter than 4 bytes
    ///   (too noisy to match on).
    #[inline]
    fn new_from_base_domain(base: &str) -> Option<Self> {
        let trimmed = base.trim().trim_matches('.');
        if trimmed.is_empty() {
            return None;
        }

        let kw = trimmed.split('.').next().unwrap_or(trimmed).trim();
        // Byte length; identical to the character count for ASCII host labels.
        if kw.len() < 4 {
            return None;
        }

        // Lowercase once up front so lookups never have to.
        let kw_lower = kw.as_bytes().to_ascii_lowercase();

        // Build "/kw/" once.
        let mut kw_slash = Vec::with_capacity(kw_lower.len() + 2);
        kw_slash.push(b'/');
        kw_slash.extend_from_slice(&kw_lower);
        kw_slash.push(b'/');

        Some(Self {
            kw_lower: kw_lower.into_boxed_slice(),
            kw_slash: kw_slash.into_boxed_slice(),
        })
    }
}
274
/// ASCII-only case-insensitive "contains" without allocations.
///
/// `needle_lower` must already be lowercase ASCII bytes: only the haystack is
/// lowercased during comparison. An empty needle always matches.
#[inline]
fn contains_ascii_ci(haystack: &[u8], needle_lower: &[u8]) -> bool {
    // `windows(0)` would panic, and an empty needle is contained in anything.
    if needle_lower.is_empty() {
        return true;
    }

    // Naive scan, fast enough for short needles (brand keywords). A haystack
    // shorter than the needle yields no windows, hence `false`.
    haystack.windows(needle_lower.len()).any(|window| {
        window
            .iter()
            .zip(needle_lower)
            .all(|(h, n)| h.to_ascii_lowercase() == *n)
    })
}
303
304#[derive(Debug)]
305/// The base network manager.
306pub struct NetworkManager {
307    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
308    ///
309    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
310    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
311    /// Consumers pull from this queue via `poll()`.
312    queued_events: VecDeque<NetworkEvent>,
313    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
314    ///
315    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
316    /// certificates (self-signed, expired, MITM proxies, etc.).
317    ignore_httpserrors: bool,
318    /// Active in-flight requests keyed by CDP `RequestId`.
319    ///
320    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
321    /// and final state used to emit `RequestFinished` / `RequestFailed`.
322    requests: HashMap<RequestId, HttpRequest>,
323    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
324    /// `Fetch.requestPaused` arrives later (or vice versa).
325    ///
326    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
327    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
328    // TODO put event in an Arc?
329    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
330    /// Extra HTTP headers to apply to subsequent network requests via CDP.
331    ///
332    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
333    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
334    extra_headers: std::collections::HashMap<String, String>,
335    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
336    ///
337    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
338    /// store the interception id here so it can be attached to the `HttpRequest` once the
339    /// network request is observed.
340    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
341    /// Whether the user has disabled the browser cache.
342    ///
343    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
344    /// `set_cache_enabled()`. Internally the field is stored as “disabled” to match the CDP API.
345    user_cache_disabled: bool,
346    /// Tracks which requests have already attempted authentication.
347    ///
348    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
349    /// authentication challenges (407/401). Once a request id is present here, subsequent
350    /// challenges for the same request are canceled.
351    attempted_authentications: HashSet<RequestId>,
352    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
353    ///
354    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
355    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
356    credentials: Option<Credentials>,
357    /// User-facing toggle indicating whether request interception is desired.
358    ///
359    /// This is the “intent” flag controlled by `set_request_interception()`. On its own it does
360    /// not guarantee interception is active; interception is actually enabled/disabled by
361    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
362    ///
363    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
364    /// enabled to satisfy auth challenges.
365    pub(crate) user_request_interception_enabled: bool,
366    /// Hard kill-switch to block all network traffic.
367    ///
368    /// When `true`, the manager immediately blocks requests (typically via
369    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
370    /// and short-circuits most decision logic. This is used for safety conditions such as
371    /// exceeding `max_bytes_allowed` or other runtime protections.
372    block_all: bool,
373    /// Tracks whether the Fetch interception protocol is currently enabled in CDP.
374    ///
375    /// This is the “actual state” flag that reflects whether we have sent `Fetch.enable` or
376    /// `Fetch.disable` to the browser. It is updated by `update_protocol_request_interception()`
377    /// when `user_request_interception_enabled` or `credentials` change.
378    pub(crate) protocol_request_interception_enabled: bool,
379    /// The network is offline.
380    offline: bool,
381    /// The page request timeout.
382    pub request_timeout: Duration,
383    // made_request: bool,
384    /// Ignore visuals (no pings, prefetching, and etc).
385    pub ignore_visuals: bool,
386    /// Block CSS stylesheets.
387    pub block_stylesheets: bool,
388    /// Block javascript that is not critical to rendering.
389    pub block_javascript: bool,
390    /// Block analytics from rendering
391    pub block_analytics: bool,
392    /// Only html from loading.
393    pub only_html: bool,
394    /// Is xml document?
395    pub xml_document: bool,
396    /// The custom intercept handle logic to run on the website.
397    pub intercept_manager: NetworkInterceptManager,
398    /// Track the amount of times the document reloaded.
399    pub document_reload_tracker: u8,
400    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
401    pub document_target_url: String,
402    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
403    pub document_target_domain: String,
404    /// Derived from document_target_domain (first label). Used to allow related 3p.
405    site_keyword: Option<SiteKeyword>,
406    /// The max bytes to receive.
407    pub max_bytes_allowed: Option<u64>,
408    #[cfg(feature = "_cache")]
409    /// The cache site_key to use.
410    pub cache_site_key: Option<String>,
411    /// The cache policy to use.
412    #[cfg(feature = "_cache")]
413    pub cache_policy: Option<BasicCachePolicy>,
414    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
415    whitelist_patterns: Vec<String>,
416    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
417    whitelist_matcher: Option<AhoCorasick>,
418    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
419    blacklist_patterns: Vec<String>,
420    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
421    blacklist_matcher: Option<AhoCorasick>,
422    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
423    blacklist_strict: bool,
424}
425
426impl NetworkManager {
427    /// A new network manager.
428    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
429        Self {
430            queued_events: Default::default(),
431            ignore_httpserrors,
432            requests: Default::default(),
433            requests_will_be_sent: Default::default(),
434            extra_headers: Default::default(),
435            request_id_to_interception_id: Default::default(),
436            user_cache_disabled: false,
437            attempted_authentications: Default::default(),
438            credentials: None,
439            block_all: false,
440            user_request_interception_enabled: false,
441            protocol_request_interception_enabled: false,
442            offline: false,
443            request_timeout,
444            ignore_visuals: false,
445            block_javascript: false,
446            block_stylesheets: false,
447            block_analytics: true,
448            only_html: false,
449            xml_document: false,
450            intercept_manager: NetworkInterceptManager::Unknown,
451            document_reload_tracker: 0,
452            document_target_url: String::new(),
453            document_target_domain: String::new(),
454            whitelist_patterns: Vec::new(),
455            whitelist_matcher: None,
456            blacklist_patterns: Vec::new(),
457            blacklist_matcher: None,
458            blacklist_strict: true,
459            max_bytes_allowed: None,
460            #[cfg(feature = "_cache")]
461            cache_site_key: None,
462            #[cfg(feature = "_cache")]
463            cache_policy: None,
464            site_keyword: None,
465        }
466    }
467
468    /// Replace the whitelist patterns (compiled once).
469    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
470    where
471        I: IntoIterator<Item = S>,
472        S: Into<String>,
473    {
474        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
475        self.rebuild_whitelist_matcher();
476    }
477
478    /// Replace the blacklist patterns (compiled once).
479    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
480    where
481        I: IntoIterator<Item = S>,
482        S: Into<String>,
483    {
484        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
485        self.rebuild_blacklist_matcher();
486    }
487
488    /// Add one pattern (cheap) and rebuild (call this sparingly).
489    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
490        self.blacklist_patterns.push(pattern.into());
491        self.rebuild_blacklist_matcher();
492    }
493
494    /// Add many patterns and rebuild once.
495    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
496    where
497        I: IntoIterator<Item = S>,
498        S: Into<String>,
499    {
500        self.blacklist_patterns
501            .extend(patterns.into_iter().map(Into::into));
502        self.rebuild_blacklist_matcher();
503    }
504
505    /// Clear blacklist entirely.
506    pub fn clear_blacklist(&mut self) {
507        self.blacklist_patterns.clear();
508        self.blacklist_matcher = None;
509    }
510
511    /// Control precedence: when true, blacklist always wins.
512    pub fn set_blacklist_strict(&mut self, strict: bool) {
513        self.blacklist_strict = strict;
514    }
515
516    #[inline]
517    fn rebuild_blacklist_matcher(&mut self) {
518        if self.blacklist_patterns.is_empty() {
519            self.blacklist_matcher = None;
520            return;
521        }
522
523        let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
524        self.blacklist_matcher = AhoCorasick::new(refs).ok();
525    }
526
527    #[inline]
528    fn is_blacklisted(&self, url: &str) -> bool {
529        self.blacklist_matcher
530            .as_ref()
531            .map(|m| m.is_match(url))
532            .unwrap_or(false)
533    }
534
535    /// Add one pattern (cheap) and rebuild (call this sparingly).
536    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
537        self.whitelist_patterns.push(pattern.into());
538        self.rebuild_whitelist_matcher();
539    }
540
541    /// Add many patterns and rebuild once.
542    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
543    where
544        I: IntoIterator<Item = S>,
545        S: Into<String>,
546    {
547        self.whitelist_patterns
548            .extend(patterns.into_iter().map(Into::into));
549        self.rebuild_whitelist_matcher();
550    }
551
552    #[inline]
553    fn rebuild_whitelist_matcher(&mut self) {
554        if self.whitelist_patterns.is_empty() {
555            self.whitelist_matcher = None;
556            return;
557        }
558
559        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
560
561        // If building fails (shouldn’t for simple patterns), just disable matcher.
562        self.whitelist_matcher = AhoCorasick::new(refs).ok();
563    }
564
565    #[inline]
566    fn is_whitelisted(&self, url: &str) -> bool {
567        self.whitelist_matcher
568            .as_ref()
569            .map(|m| m.is_match(url))
570            .unwrap_or(false)
571    }
572
573    /// Commands to init the chain with.
574    pub fn init_commands(&self) -> CommandChain {
575        let cmds = if self.ignore_httpserrors {
576            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
577        } else {
578            INIT_CHAIN.clone()
579        };
580        CommandChain::new(cmds, self.request_timeout)
581    }
582
583    /// Push the CDP request.
584    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
585        let method = cmd.identifier();
586        if let Ok(params) = serde_json::to_value(cmd) {
587            self.queued_events
588                .push_back(NetworkEvent::SendCdpRequest((method, params)));
589        }
590    }
591
592    /// The next event to handle.
593    pub fn poll(&mut self) -> Option<NetworkEvent> {
594        self.queued_events.pop_front()
595    }
596
597    /// Get the extra headers.
598    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
599        &self.extra_headers
600    }
601
602    /// Set extra HTTP headers.
603    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
604        self.extra_headers = headers;
605        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
606        self.extra_headers.remove("Proxy-Authorization");
607        if !self.extra_headers.is_empty() {
608            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
609                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
610            }
611        }
612    }
613
614    /// Fast “related 3p” allow:
615    /// - host contains site keyword (ASCII CI), OR
616    /// - path contains "/kw/" (ASCII CI)
617    ///
618    /// IMPORTANT: This is only a heuristic “allow” used when you're otherwise blocking.
619    #[inline]
620    fn is_related_3rd_party_by_keyword_fast(&self, url: &str) -> bool {
621        let Some(sk) = self.site_keyword.as_ref() else {
622            return false;
623        };
624
625        // Don't treat relative as 3p.
626        if url.starts_with('/') {
627            return false;
628        }
629
630        // Need host + rest
631        let Some((host, rest)) = host_and_rest(url) else {
632            return false;
633        };
634
635        // If same-site, ignore.
636        let base = self.document_target_domain.trim().trim_matches('.');
637        if !base.is_empty() {
638            // strict same-site/subdomain check
639            if host == base
640                || host.ends_with(base)
641                    && host
642                        .as_bytes()
643                        .get(host.len().saturating_sub(base.len() + 1))
644                        == Some(&b'.')
645            {
646                return false;
647            }
648        }
649
650        // 1) host contains keyword (case-insensitive ASCII)
651        if contains_ascii_ci(host.as_bytes(), &sk.kw_lower) {
652            return true;
653        }
654
655        // 2) path contains "/kw/" (case-insensitive ASCII)
656        contains_ascii_ci(rest.as_bytes(), &sk.kw_slash)
657    }
658
659    /// Url matches analytics / trackers / ignorable script noise.
660    #[inline]
661    pub(crate) fn ignore_script(
662        &self,
663        url: &str,
664        block_analytics: bool,
665        intercept_manager: NetworkInterceptManager,
666    ) -> bool {
667        // allow relative domains/scripts by default.
668        // we only treat non-relative (absolute) as "ignore candidates"
669        // unless we later determine otherwise.
670        let mut ignore_script = !url.starts_with('/');
671
672        // Full URL ignore trie (only when analytics blocking enabled)
673        if !ignore_script
674            && block_analytics
675            && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
676        {
677            ignore_script = true;
678        }
679
680        // Path-based parsing / ignore lists for absolute URLs
681        if !ignore_script {
682            if let Some(index) = url.find("//") {
683                let pos = index + 2;
684
685                // Ensure there is something after `//`
686                if pos < url.len() {
687                    // Find the first slash after the `//`
688                    if let Some(slash_index) = url[pos..].find('/') {
689                        let base_path_index = pos + slash_index + 1;
690
691                        if url.len() > base_path_index {
692                            let new_url: &str = &url[base_path_index..];
693
694                            // ignore assets we do not need for frameworks
695                            if intercept_manager == NetworkInterceptManager::Unknown {
696                                let hydration_file =
697                                    JS_FRAMEWORK_PATH.iter().any(|p| new_url.starts_with(p));
698
699                                // ignore astro hydration paths (only when it ends with .js)
700                                if hydration_file && new_url.ends_with(".js") {
701                                    ignore_script = true;
702                                }
703                            }
704
705                            if !ignore_script
706                                && URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(new_url)
707                            {
708                                ignore_script = true;
709                            }
710
711                            if !ignore_script
712                                && self.ignore_visuals
713                                && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(new_url)
714                            {
715                                ignore_script = true;
716                            }
717                        }
718                    }
719                }
720            }
721        }
722
723        // Fallback for filename/path-based analytics ignores (only when analytics blocking enabled)
724        if !ignore_script && block_analytics {
725            ignore_script = URL_IGNORE_TRIE_PATHS.contains_prefix(url);
726        }
727
728        ignore_script
729    }
730
731    /// STRICT relative normalization for “script ignore tries”:
732    #[inline]
733    fn rel_block_strict<'a>(&self, url: &'a str) -> std::borrow::Cow<'a, str> {
734        if url.starts_with('/') {
735            return std::borrow::Cow::Borrowed(url);
736        }
737
738        let base = self.document_target_domain.trim().trim_matches('.');
739        if base.is_empty() {
740            return std::borrow::Cow::Borrowed(url);
741        }
742
743        let Some((host, rest)) = host_and_rest(url) else {
744            return std::borrow::Cow::Borrowed(url);
745        };
746
747        let same_site = host == base
748            || (host.ends_with(base)
749                && host
750                    .as_bytes()
751                    .get(host.len().saturating_sub(base.len() + 1))
752                    == Some(&b'.'));
753
754        if same_site {
755            if rest.starts_with('/') {
756                std::borrow::Cow::Borrowed(rest)
757            } else {
758                std::borrow::Cow::Borrowed("/")
759            }
760        } else {
761            std::borrow::Cow::Borrowed(url)
762        }
763    }
764
765    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
766        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
767    }
768
769    pub fn set_block_all(&mut self, block_all: bool) {
770        self.block_all = block_all;
771    }
772
773    pub fn set_request_interception(&mut self, enabled: bool) {
774        self.user_request_interception_enabled = enabled;
775        self.update_protocol_request_interception();
776    }
777
778    pub fn set_cache_enabled(&mut self, enabled: bool) {
779        let run = self.user_cache_disabled != !enabled;
780        self.user_cache_disabled = !enabled;
781        if run {
782            self.update_protocol_cache_disabled();
783        }
784    }
785
    /// Enable fetch interception.
    ///
    /// Only flips the local `protocol_request_interception_enabled` flag; no CDP
    /// command is pushed from here.
    pub fn enable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
790
    /// Disable fetch interception.
    ///
    /// Only clears the local `protocol_request_interception_enabled` flag; no CDP
    /// command is pushed from here.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = false;
    }
795
    /// Set the cache site key.
    ///
    /// `on_fetch_request_paused` passes this key to the session cache lookup;
    /// with `None`, that lookup is skipped.
    #[cfg(feature = "_cache")]
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
801
    /// Set the cache policy.
    ///
    /// `on_fetch_request_paused` asks this policy whether a cached item may be
    /// served; with `None`, the session cache lookup is skipped.
    #[cfg(feature = "_cache")]
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
807
808    pub fn update_protocol_cache_disabled(&mut self) {
809        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
810    }
811
    /// Stores `credentials` for answering auth challenges and makes sure fetch
    /// interception is turned on.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        // Must run before the flag below is set: the update compares the desired
        // state against the current flag and only pushes the enable command when
        // they differ.
        self.update_protocol_request_interception();
        self.protocol_request_interception_enabled = true;
    }
817
818    fn update_protocol_request_interception(&mut self) {
819        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
820
821        if enabled == self.protocol_request_interception_enabled {
822            return;
823        }
824
825        if enabled {
826            self.push_cdp_request(ENABLE_FETCH.clone())
827        } else {
828            self.push_cdp_request(DisableParams::default())
829        }
830    }
831
832    /// Determine if the request should be skipped.
833    #[inline]
834    fn skip_xhr(
835        &self,
836        skip_networking: bool,
837        event: &EventRequestPaused,
838        network_event: bool,
839    ) -> bool {
840        // XHR check
841        if !skip_networking && network_event {
842            let request_url = event.request.url.as_str();
843
844            // check if part of ignore scripts.
845            let skip_analytics =
846                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
847
848            if skip_analytics {
849                true
850            } else if self.block_stylesheets || self.ignore_visuals {
851                let block_css = self.block_stylesheets;
852                let block_media = self.ignore_visuals;
853
854                let mut block_request = false;
855
856                if let Some(position) = request_url.rfind('.') {
857                    let hlen = request_url.len();
858                    let has_asset = hlen - position;
859
860                    if has_asset >= 3 {
861                        let next_position = position + 1;
862
863                        if block_media
864                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
865                                &request_url[next_position..].into(),
866                            )
867                        {
868                            block_request = true;
869                        } else if block_css {
870                            block_request =
871                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
872                                    .contains(&**CSS_EXTENSION)
873                        }
874                    }
875                }
876
877                if !block_request {
878                    block_request = ignore_script_xhr_media(request_url);
879                }
880
881                block_request
882            } else {
883                skip_networking
884            }
885        } else {
886            skip_networking
887        }
888    }
889
890    #[cfg(feature = "adblock")]
891    #[inline]
892    /// Detect if ad enabled.
893    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
894        if skip_networking {
895            true
896        } else {
897            self.detect_ad(event)
898        }
899    }
900
    /// When adblock feature is disabled, this is a no-op.
    ///
    /// Returns `skip_networking` unchanged; the event is ignored.
    #[cfg(not(feature = "adblock"))]
    #[inline]
    fn detect_ad_if_enabled(&mut self, _event: &EventRequestPaused, skip_networking: bool) -> bool {
        skip_networking
    }
907
908    #[inline]
909    /// Fail request
910    fn fail_request_blocked(
911        &mut self,
912        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
913    ) {
914        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
915            request_id.clone(),
916            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
917        );
918        self.push_cdp_request(params);
919    }
920
921    #[inline]
922    /// Fulfill request
923    fn fulfill_request_empty_200(
924        &mut self,
925        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
926    ) {
927        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
928            request_id.clone(),
929            200,
930        );
931        self.push_cdp_request(params);
932    }
933
934    #[cfg(feature = "_cache")]
935    #[inline]
936    /// Fulfill a paused Fetch request from cached bytes + header map.
937    ///
938    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
939    fn fulfill_request_from_cache(
940        &mut self,
941        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
942        body: &[u8],
943        headers: &std::collections::HashMap<String, String>,
944        status: i64,
945    ) {
946        use crate::cdp::browser_protocol::fetch::HeaderEntry;
947        use crate::handler::network::fetch::FulfillRequestParams;
948        use base64::Engine;
949
950        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
951
952        for (k, v) in headers.iter() {
953            resp_headers.push(HeaderEntry {
954                name: k.clone().into(),
955                value: v.clone().into(),
956            });
957        }
958
959        let mut params = FulfillRequestParams::new(request_id.clone(), status);
960
961        // TODO: have this already encoded prior.
962        params.body = Some(
963            base64::engine::general_purpose::STANDARD
964                .encode(body)
965                .into(),
966        );
967
968        params.response_headers = Some(resp_headers);
969
970        self.push_cdp_request(params);
971    }
972
973    #[inline]
974    /// Continue the request url.
975    fn continue_request_with_url(
976        &mut self,
977        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
978        url: Option<&str>,
979        intercept_response: bool,
980    ) {
981        let mut params = ContinueRequestParams::new(request_id.clone());
982        if let Some(url) = url {
983            params.url = Some(url.to_string());
984            params.intercept_response = Some(intercept_response);
985        }
986        self.push_cdp_request(params);
987    }
988
    /// On fetch request paused interception.
    ///
    /// Decides the fate of a paused `Fetch` request. When user-driven interception
    /// is active on both the user and protocol side, nothing is done here.
    /// Otherwise the request is resolved in one of these ways:
    /// - failed with `BlockedByClient` when `block_all` is set,
    /// - fulfilled with an empty `200` when any blocking rule matches,
    /// - fulfilled from the session cache (`_cache` feature) when the policy allows,
    /// - continued, with the URL overridden when a document redirect mask was
    ///   replaced.
    ///
    /// The blocking rules run in a fixed order and each later stage only runs if
    /// the request has not already been marked as skipped. The one exception is the
    /// strict blacklist, which is applied last and always wins.
    #[inline]
    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
        // The user handles paused requests themselves in this mode.
        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
            return;
        }

        let resource_type = &event.resource_type;

        // Hard kill-switch.
        if self.block_all {
            tracing::debug!(
                "Blocked (block_all): {:?} - {}",
                event.resource_type,
                event.request.url
            );
            return self.fail_request_blocked(&event.request_id);
        }

        // Attach interception_id to HttpRequest tracking (race-safe).
        // If the matching `requestWillBeSent` event was already seen, process it now;
        // otherwise remember the interception id until that event arrives.
        if let Some(network_id) = event.network_id.as_ref() {
            if let Some(request_will_be_sent) =
                self.requests_will_be_sent.remove(network_id.as_ref())
            {
                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
            } else {
                self.request_id_to_interception_id
                    .insert(network_id.clone(), event.request_id.clone().into());
            }
        }

        // Resource flags.
        let javascript_resource = *resource_type == ResourceType::Script;
        let document_resource = *resource_type == ResourceType::Document;
        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);

        // Start with static / cheap skip checks.
        // NOTE: `self.block_all` is always false here because of the early return above.
        let mut skip_networking =
            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());

        // Short-circuit if we've reloaded this document too many times.
        if !skip_networking {
            skip_networking = self.document_reload_tracker >= 3;
        }

        // Handle document redirect masking / tracking and xml detection.
        let (current_url_cow, had_replacer) =
            self.handle_document_replacement_and_tracking(event, document_resource);

        let current_url: &str = current_url_cow.as_ref();

        // Dynamic blacklist (still supported). If strict, blacklist always wins at the end.
        let blacklisted = self.is_blacklisted(current_url);
        if !self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        // ---------------------------------------------------------------------
        // 1) Base blocking: visuals + stylesheets + (optionally) scripts
        //
        //   ALLOW when it matches:
        //   - ALLOWED_MATCHER (first-party-ish patterns, but applied after strict relative normalize)
        //   - ALLOWED_MATCHER_3RD_PARTY (explicit 3p allow)
        //   - dynamic whitelist (per-run)
        //   - site-keyword allow (related 3p heuristic)
        // ---------------------------------------------------------------------
        if !skip_networking {
            // Allow XSL for sitemap XML.
            if self.xml_document && current_url.ends_with(".xsl") {
                skip_networking = false;
            } else {
                // Visuals + stylesheet blocks.
                skip_networking = (self.ignore_visuals
                    && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
                    || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet);

                // Script allowlist gate.
                if !skip_networking && javascript_resource && self.block_javascript {
                    // 1) dynamic whitelist always allows
                    let mut allowed = self.is_whitelisted(current_url);

                    // 2) explicit 3rd-party allow patterns require full URL match
                    if !allowed && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url) {
                        allowed = true;
                    }

                    // 3) first-party-ish allow patterns: apply strict relative normalization
                    // so absolute same-site URLs become "/path" (stable matching)
                    if !allowed {
                        let rel = self.rel_block_strict(current_url);
                        if ALLOWED_MATCHER.is_match(rel.as_ref()) {
                            allowed = true;
                        }
                    }

                    // 4) site keyword allow (related 3p heuristic)
                    if !allowed && self.is_related_3rd_party_by_keyword_fast(current_url) {
                        allowed = true;
                    }

                    if !allowed {
                        skip_networking = true;
                    }
                }
            }
        }

        // Ad blocking (only active when feature = "adblock").
        skip_networking = self.detect_ad_if_enabled(event, skip_networking);

        // Ignore embedded scripts when only_html or ignore_visuals is set.
        if !skip_networking
            && (self.only_html || self.ignore_visuals)
            && (javascript_resource || document_resource)
        {
            skip_networking = ignore_script_embedded(current_url);
        }

        // Analytics / ignore tries (strict-relative) for scripts.
        if !skip_networking && javascript_resource {
            let rel = self.rel_block_strict(current_url);
            if self.ignore_script(rel.as_ref(), self.block_analytics, self.intercept_manager) {
                skip_networking = true;
            }
        }

        // XHR / data resources.
        skip_networking = self.skip_xhr(skip_networking, event, network_resource);

        // Custom interception layer.
        if !skip_networking && (javascript_resource || network_resource || document_resource) {
            skip_networking = self.intercept_manager.intercept_detection(
                current_url,
                self.ignore_visuals,
                network_resource,
            );
        }

        // Custom website block list.
        if !skip_networking && (javascript_resource || network_resource) {
            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
        }

        // If strict mode, blacklist always wins at the end (cannot be unblocked).
        if self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        // Final decision.
        if skip_networking {
            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
            // Blocked requests are answered with an empty 200 rather than failed.
            self.fulfill_request_empty_200(&event.request_id);
        } else {
            #[cfg(feature = "_cache")]
            {
                // Serve from the session cache only when both a policy and a site key are set.
                if let (Some(policy), Some(cache_site_key)) =
                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
                {
                    // Cache entries are keyed by "<METHOD>:<URL>".
                    let cache_key = format!("{}:{}", event.request.method, &current_url);

                    if let Some((res, cache_policy)) =
                        crate::cache::remote::get_session_cache_item(cache_site_key, &cache_key)
                    {
                        if policy.allows_cached(&cache_policy) {
                            tracing::debug!("Remote Cached: {:?} - {}", resource_type, &cache_key);
                            return self.fulfill_request_from_cache(
                                &event.request_id,
                                &res.body,
                                &res.headers,
                                res.status as i64,
                            );
                        }
                    }
                }
            }

            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
            // Only override the URL when the redirect mask was replaced; otherwise
            // continue the request untouched.
            self.continue_request_with_url(
                &event.request_id,
                if had_replacer {
                    Some(current_url)
                } else {
                    None
                },
                !had_replacer,
            );
        }
    }
1177
    /// Does the network manager have a target domain?
    ///
    /// Implemented as a check that the tracked document target URL is non-empty.
    pub fn has_target_domain(&self) -> bool {
        !self.document_target_url.is_empty()
    }
1182
1183    /// Set the target page url for tracking.
1184    pub fn set_page_url(&mut self, page_target_url: String) {
1185        let host_base = host_and_rest(&page_target_url)
1186            .map(|(h, _)| base_domain_from_host(h))
1187            .unwrap_or("");
1188
1189        self.document_target_domain = host_base.to_string();
1190        self.document_target_url = page_target_url;
1191        self.site_keyword = SiteKeyword::new_from_base_domain(&self.document_target_domain);
1192    }
1193
1194    /// Clear the initial target domain on every navigation.
1195    pub fn clear_target_domain(&mut self) {
1196        self.document_reload_tracker = 0;
1197        self.document_target_url = Default::default();
1198        self.document_target_domain = Default::default();
1199        self.site_keyword = None;
1200    }
1201
1202    /// Handles:
1203    /// - document reload tracking (`document_reload_tracker`)
1204    /// - redirect masking / replacement
1205    /// - xml document detection (`xml_document`)
1206    /// - `document_target_url` updates
1207    ///
1208    /// Returns (current_url, had_replacer).
1209    #[inline]
1210    fn handle_document_replacement_and_tracking<'a>(
1211        &mut self,
1212        event: &'a EventRequestPaused,
1213        document_resource: bool,
1214    ) -> (Cow<'a, str>, bool) {
1215        let mut replacer: Option<String> = None;
1216        let current_url = event.request.url.as_str();
1217
1218        if document_resource {
1219            if self.document_target_url == current_url {
1220                self.document_reload_tracker += 1;
1221            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1222            {
1223                let (http_document_replacement, mut https_document_replacement) =
1224                    if self.document_target_url.starts_with("http://") {
1225                        (
1226                            self.document_target_url.replacen("http://", "http//", 1),
1227                            self.document_target_url.replacen("http://", "https://", 1),
1228                        )
1229                    } else {
1230                        (
1231                            self.document_target_url.replacen("https://", "https//", 1),
1232                            self.document_target_url.replacen("https://", "http://", 1),
1233                        )
1234                    };
1235
1236                // Track trailing slash to restore later.
1237                let trailing = https_document_replacement.ends_with('/');
1238                if trailing {
1239                    https_document_replacement.pop();
1240                }
1241                if https_document_replacement.ends_with('/') {
1242                    https_document_replacement.pop();
1243                }
1244
1245                let redirect_mask = format!(
1246                    "{}{}",
1247                    https_document_replacement, http_document_replacement
1248                );
1249
1250                if current_url == redirect_mask {
1251                    replacer = Some(if trailing {
1252                        format!("{}/", https_document_replacement)
1253                    } else {
1254                        https_document_replacement
1255                    });
1256                }
1257            }
1258
1259            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1260                self.xml_document = true;
1261            }
1262
1263            // Track last seen document URL.
1264            self.document_target_url = event.request.url.clone();
1265            self.document_target_domain = host_and_rest(&self.document_target_url)
1266                .map(|(h, _)| base_domain_from_host(h).to_string())
1267                .unwrap_or_default();
1268        }
1269
1270        let current_url_cow = match replacer {
1271            Some(r) => Cow::Owned(r),
1272            None => Cow::Borrowed(event.request.url.as_str()),
1273        };
1274
1275        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1276        (current_url_cow, had_replacer)
1277    }
1278
1279    /// Perform a page intercept for chrome
1280    #[cfg(feature = "adblock")]
1281    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1282        use adblock::{
1283            lists::{FilterSet, ParseOptions, RuleTypes},
1284            Engine,
1285        };
1286
1287        lazy_static::lazy_static! {
1288            static ref AD_ENGINE: Engine = {
1289                let mut filter_set = FilterSet::new(false);
1290                let mut rules = ParseOptions::default();
1291                rules.rule_types = RuleTypes::All;
1292
1293                filter_set.add_filters(
1294                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1295                    rules,
1296                );
1297
1298                Engine::from_filter_set(filter_set, true)
1299            };
1300        };
1301
1302        let blockable = ResourceType::Image == event.resource_type
1303            || event.resource_type == ResourceType::Media
1304            || event.resource_type == ResourceType::Stylesheet
1305            || event.resource_type == ResourceType::Document
1306            || event.resource_type == ResourceType::Fetch
1307            || event.resource_type == ResourceType::Xhr;
1308
1309        let u = &event.request.url;
1310
1311        let block_request = blockable
1312            // set it to example.com for 3rd party handling is_same_site
1313        && {
1314            let request = adblock::request::Request::preparsed(
1315                 &u,
1316                 "example.com",
1317                 "example.com",
1318                 &event.resource_type.as_ref().to_lowercase(),
1319                 !event.request.is_same_site.unwrap_or_default());
1320
1321            AD_ENGINE.check_network_request(&request).matched
1322        };
1323
1324        block_request
1325    }
1326
1327    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1328        let response = if self
1329            .attempted_authentications
1330            .contains(event.request_id.as_ref())
1331        {
1332            AuthChallengeResponseResponse::CancelAuth
1333        } else if self.credentials.is_some() {
1334            self.attempted_authentications
1335                .insert(event.request_id.clone().into());
1336            AuthChallengeResponseResponse::ProvideCredentials
1337        } else {
1338            AuthChallengeResponseResponse::Default
1339        };
1340
1341        let mut auth = AuthChallengeResponse::new(response);
1342        if let Some(creds) = self.credentials.clone() {
1343            auth.username = Some(creds.username);
1344            auth.password = Some(creds.password);
1345        }
1346        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1347    }
1348
1349    /// Set the page offline network emulation condition.
1350    pub fn set_offline_mode(&mut self, value: bool) {
1351        if self.offline == value {
1352            return;
1353        }
1354        self.offline = value;
1355        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1356            .offline(self.offline)
1357            .latency(0)
1358            .download_throughput(-1.)
1359            .upload_throughput(-1.)
1360            .build()
1361        {
1362            self.push_cdp_request(network);
1363        }
1364    }
1365
1366    /// Request interception doesn't happen for data URLs with Network Service.
1367    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1368        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1369            if let Some(interception_id) = self
1370                .request_id_to_interception_id
1371                .remove(event.request_id.as_ref())
1372            {
1373                self.on_request(event, Some(interception_id));
1374            } else {
1375                // TODO remove the clone for event
1376                self.requests_will_be_sent
1377                    .insert(event.request_id.clone(), event.clone());
1378            }
1379        } else {
1380            self.on_request(event, None);
1381        }
1382    }
1383
1384    /// The request was served from the cache.
1385    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1386        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1387            request.from_memory_cache = true;
1388        }
1389    }
1390
1391    /// On network response received.
1392    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1393        let mut request_failed = false;
1394
1395        // Track how many bytes we actually deducted from this target.
1396        let mut deducted: u64 = 0;
1397
1398        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1399            let before = *max_bytes;
1400
1401            // encoded_data_length -> saturating cast to u64
1402            let received_bytes: u64 = event.response.encoded_data_length as u64;
1403
1404            // Safe parse of Content-Length
1405            let content_length: Option<u64> = event
1406                .response
1407                .headers
1408                .inner()
1409                .get("content-length")
1410                .and_then(|v| v.as_str())
1411                .and_then(|s| s.trim().parse::<u64>().ok());
1412
1413            // Deduct what we actually received
1414            *max_bytes = max_bytes.saturating_sub(received_bytes);
1415
1416            // If the declared size can't fit, zero out now
1417            if let Some(cl) = content_length {
1418                if cl > *max_bytes {
1419                    *max_bytes = 0;
1420                }
1421            }
1422
1423            request_failed = *max_bytes == 0;
1424
1425            // Compute exact delta deducted on this event
1426            deducted = before.saturating_sub(*max_bytes);
1427        }
1428
1429        // Bubble up the deduction (even if request continues)
1430        if deducted > 0 {
1431            self.queued_events
1432                .push_back(NetworkEvent::BytesConsumed(deducted));
1433        }
1434
1435        // block all network request moving forward.
1436        if request_failed && self.max_bytes_allowed.is_some() {
1437            self.set_block_all(true);
1438        }
1439
1440        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1441            request.set_response(event.response.clone());
1442            self.queued_events.push_back(if request_failed {
1443                NetworkEvent::RequestFailed(request)
1444            } else {
1445                NetworkEvent::RequestFinished(request)
1446            });
1447        }
1448    }
1449
1450    /// On network loading finished.
1451    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1452        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1453            if let Some(interception_id) = request.interception_id.as_ref() {
1454                self.attempted_authentications
1455                    .remove(interception_id.as_ref());
1456            }
1457            self.queued_events
1458                .push_back(NetworkEvent::RequestFinished(request));
1459        }
1460    }
1461
1462    /// On network loading failed.
1463    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1464        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1465            request.failure_text = Some(event.error_text.clone());
1466            if let Some(interception_id) = request.interception_id.as_ref() {
1467                self.attempted_authentications
1468                    .remove(interception_id.as_ref());
1469            }
1470            self.queued_events
1471                .push_back(NetworkEvent::RequestFailed(request));
1472        }
1473    }
1474
1475    /// On request will be sent.
1476    fn on_request(
1477        &mut self,
1478        event: &EventRequestWillBeSent,
1479        interception_id: Option<InterceptionId>,
1480    ) {
1481        let mut redirect_chain = Vec::new();
1482        let mut redirect_location = None;
1483
1484        if let Some(redirect_resp) = &event.redirect_response {
1485            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1486                if is_redirect_status(redirect_resp.status) {
1487                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1488                        if redirect_resp.url != location {
1489                            let fixed_location = location.replace(&redirect_resp.url, "");
1490
1491                            if !fixed_location.is_empty() {
1492                                request.response.as_mut().map(|resp| {
1493                                    resp.headers.0["Location"] =
1494                                        serde_json::Value::String(fixed_location.clone());
1495                                });
1496                            }
1497
1498                            redirect_location = Some(fixed_location);
1499                        }
1500                    }
1501                }
1502
1503                self.handle_request_redirect(
1504                    &mut request,
1505                    if let Some(redirect_location) = redirect_location {
1506                        let mut redirect_resp = redirect_resp.clone();
1507
1508                        if !redirect_location.is_empty() {
1509                            redirect_resp.headers.0["Location"] =
1510                                serde_json::Value::String(redirect_location);
1511                        }
1512
1513                        redirect_resp
1514                    } else {
1515                        redirect_resp.clone()
1516                    },
1517                );
1518
1519                redirect_chain = std::mem::take(&mut request.redirect_chain);
1520                redirect_chain.push(request);
1521            }
1522        }
1523
1524        let request = HttpRequest::new(
1525            event.request_id.clone(),
1526            event.frame_id.clone(),
1527            interception_id,
1528            self.user_request_interception_enabled,
1529            redirect_chain,
1530        );
1531
1532        self.requests.insert(event.request_id.clone(), request);
1533        self.queued_events
1534            .push_back(NetworkEvent::Request(event.request_id.clone()));
1535    }
1536
1537    /// Handle request redirect.
1538    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1539        request.set_response(response);
1540        if let Some(interception_id) = request.interception_id.as_ref() {
1541            self.attempted_authentications
1542                .remove(interception_id.as_ref());
1543        }
1544    }
1545}
1546
/// Events queued by the network manager for its owner to consume.
#[derive(Debug)]
pub enum NetworkEvent {
    /// Send a CDP request: the method id together with a JSON value
    /// (presumably the serialized command parameters; confirm against `push_cdp_request`).
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request started being tracked; carries its request id.
    Request(RequestId),
    /// Response for the given request id.
    /// NOTE(review): not emitted by the handlers visible in this part of the file.
    Response(RequestId),
    /// Request failed (loading failed, or the byte budget ran out).
    RequestFailed(HttpRequest),
    /// Request finished.
    RequestFinished(HttpRequest),
    /// Bytes consumed: the number of bytes deducted from the byte budget by one response.
    BytesConsumed(u64),
}
1562
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// The Cloudflare challenge script must match the third-party allow
    /// matcher, while the Cloudflare Insights beacon must not.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );
    }

    /// Sanity checks that long-standing allow patterns still match.
    ///
    /// Kept separate from `test_allowed_matcher_3rd_party` so the two tests
    /// no longer repeat the same assertions.
    #[test]
    fn test_allowed_matcher_3rd_party_sanity() {
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// URLs containing a configured blacklist pattern are reported as
    /// blacklisted; URLs matching none of the patterns are not.
    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    /// With strict mode enabled, a URL matched by both lists is reported by
    /// both matchers and the strict flag is set.
    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        // Same pattern in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        // In strict mode, it should still be considered blocked at decision time.
        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
        assert!(nm.blacklist_strict);
    }

    /// With strict mode disabled, a URL matched by both lists is still
    /// reported by both matchers and the strict flag is cleared.
    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        // Same pattern in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs