chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18    SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21    fetch::{
22        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24    },
25    network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
lazy_static! {
    /// Substring patterns for popular JavaScript libraries, common bundle names and a few
    /// verified third-party script URLs. Compiled into [`ALLOWED_MATCHER`].
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",  // Covers WordPress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/enterprise.js",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://captcha.px-cloud.net/",
        "https://cdn.auth0.com/js/lock/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];

    /// Aho-Corasick matcher built from `JS_FRAMEWORK_ALLOW`; matches a script URL by substring.
    ///
    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
    /// but we keep it for compatibility and other call sites.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// URL patterns for verified third-party scripts (captcha / challenge providers, payment
    /// scripts and a few CDNs). Compiled into [`ALLOWED_MATCHER_3RD_PARTY`].
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://www.google.com/recaptcha/enterprise.js",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/captcha/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
        // NOTE(review): duplicate of the "ct.captcha-delivery.com" entry a few lines above.
        "https://ct.captcha-delivery.com/",
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.gstatic.com/recaptcha/",
        "https://www.google.com/recaptcha/api2/",
        "https://www.recaptcha.net/recaptcha/",
        // NOTE(review): already covered by the shorter "recaptcha.net/recaptcha/" prefix above.
        "https://www.recaptcha.net/recaptcha/api2/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        // NOTE(review): duplicate of the "captcha.px-cloud.net" entry earlier in this list.
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://client-api.arkoselabs.com/v2/",
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource",
        "discourse-cdn.com/"
    ];

    /// Aho-Corasick matcher built from `JS_FRAMEWORK_ALLOW_3RD_PARTY`; matches a script URL
    /// by substring.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path fragments that identify a JS framework build output.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
            "_astro/", "_app/immutable"
        }
    };

    /// Content types (MIME types) to ignore: archives, images, video, audio and office
    /// documents.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names to ignore for visual content (images, media, fonts).
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names to ignore for auxiliary networking traffic (CSP reports,
    /// manifests, prefetches, pings and other uncategorized requests).
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Manifest",
        "Other",
        "Prefetch",
        "Ping",
    };

    /// Case-insensitive `css` extension used for stylesheet matching.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// The default init command chain: the network `EnableParams` command.
    ///
    /// Falls back to an empty chain if the params cannot be serialized.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// The init command chain that additionally ignores certificate errors.
    ///
    /// Despite the `HTTP_ERRORS` name, the extra command is
    /// `SetIgnoreCertificateErrorsParams::new(true)`. Any command that fails to serialize is
    /// left out of the chain.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// The command that enables fetch interception: auth requests are handled and every URL
    /// (`"*"`) is paused at the request stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
        .build()
    };
}
222
/// Returns `true` when `status` is one of the HTTP redirect codes
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_STATUSES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_STATUSES.contains(&status)
}
227
#[derive(Debug)]
/// The base network manager.
///
/// Holds the per-target network state (in-flight requests, interception and cache flags,
/// blocking options) and a queue of `NetworkEvent`s for the consumer to drain via `poll()`.
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    // TODO put event in an Arc?
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; `update_protocol_request_interception()` reconciles
    /// this flag with `credentials` to decide whether `Fetch.enable` / `Fetch.disable` is sent.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is considered enabled in CDP.
    ///
    /// `update_protocol_request_interception()` compares the desired state (derived from
    /// `user_request_interception_enabled` and `credentials`) against this flag to decide
    /// whether a `Fetch.enable` / `Fetch.disable` command needs to be queued. The flag can
    /// also be toggled explicitly through `enable_request_intercept()` and
    /// `disable_request_intercept()`, and `authenticate()` forces it to `true`.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Whether the network is offline.
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    // made_request: bool,
    /// Ignore visuals (no pings, prefetching, etc.).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics from rendering.
    pub block_analytics: bool,
    /// Only allow HTML to load.
    // NOTE(review): presumably every non-document resource is blocked when set; confirm
    // against the interception logic.
    pub only_html: bool,
    /// Whether the document is an XML document.
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Tracks the number of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    #[cfg(feature = "_cache")]
    /// The cache site_key to use.
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for `whitelist_patterns` (rebuilt when patterns change).
    whitelist_matcher: Option<AhoCorasick>,
}
344
345impl NetworkManager {
346    /// A new network manager.
347    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
348        Self {
349            queued_events: Default::default(),
350            ignore_httpserrors,
351            requests: Default::default(),
352            requests_will_be_sent: Default::default(),
353            extra_headers: Default::default(),
354            request_id_to_interception_id: Default::default(),
355            user_cache_disabled: false,
356            attempted_authentications: Default::default(),
357            credentials: None,
358            block_all: false,
359            user_request_interception_enabled: false,
360            protocol_request_interception_enabled: false,
361            offline: false,
362            request_timeout,
363            ignore_visuals: false,
364            block_javascript: false,
365            block_stylesheets: false,
366            block_analytics: true,
367            only_html: false,
368            xml_document: false,
369            intercept_manager: NetworkInterceptManager::Unknown,
370            document_reload_tracker: 0,
371            document_target_url: String::new(),
372            document_target_domain: String::new(),
373            whitelist_patterns: Vec::new(),
374            whitelist_matcher: None,
375            max_bytes_allowed: None,
376            #[cfg(feature = "_cache")]
377            cache_site_key: None,
378            #[cfg(feature = "_cache")]
379            cache_policy: None,
380        }
381    }
382
383    /// Replace the whitelist patterns (compiled once).
384    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
385    where
386        I: IntoIterator<Item = S>,
387        S: Into<String>,
388    {
389        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
390        self.rebuild_whitelist_matcher();
391    }
392
393    /// Add one pattern (cheap) and rebuild (call this sparingly).
394    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
395        self.whitelist_patterns.push(pattern.into());
396        self.rebuild_whitelist_matcher();
397    }
398
399    /// Add many patterns and rebuild once.
400    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
401    where
402        I: IntoIterator<Item = S>,
403        S: Into<String>,
404    {
405        self.whitelist_patterns
406            .extend(patterns.into_iter().map(Into::into));
407        self.rebuild_whitelist_matcher();
408    }
409
410    #[inline]
411    fn rebuild_whitelist_matcher(&mut self) {
412        if self.whitelist_patterns.is_empty() {
413            self.whitelist_matcher = None;
414            return;
415        }
416
417        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
418
419        // If building fails (shouldn’t for simple patterns), just disable matcher.
420        self.whitelist_matcher = AhoCorasick::new(refs).ok();
421    }
422
423    #[inline]
424    fn is_whitelisted(&self, url: &str) -> bool {
425        self.whitelist_matcher
426            .as_ref()
427            .map(|m| m.is_match(url))
428            .unwrap_or(false)
429    }
430
431    /// Commands to init the chain with.
432    pub fn init_commands(&self) -> CommandChain {
433        let cmds = if self.ignore_httpserrors {
434            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
435        } else {
436            INIT_CHAIN.clone()
437        };
438        CommandChain::new(cmds, self.request_timeout)
439    }
440
441    /// Push the CDP request.
442    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
443        let method = cmd.identifier();
444        if let Ok(params) = serde_json::to_value(cmd) {
445            self.queued_events
446                .push_back(NetworkEvent::SendCdpRequest((method, params)));
447        }
448    }
449
    /// Pop the next event to handle.
    ///
    /// Returns the oldest queued `NetworkEvent`, or `None` when the queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
454
    /// Get the extra headers currently stored on the manager (as last set through
    /// `set_extra_headers`).
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
459
460    /// Set extra HTTP headers.
461    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
462        self.extra_headers = headers;
463        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
464        self.extra_headers.remove("Proxy-Authorization");
465        if !self.extra_headers.is_empty() {
466            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
467                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
468            }
469        }
470    }
471
    /// Queue a `SetBypassServiceWorkerParams` command with the given flag.
    ///
    /// NOTE(review): despite the method name, `bypass` is forwarded unchanged as the
    /// *bypass* flag, so `true` presumably asks the browser to bypass service workers rather
    /// than enable them. Confirm callers pass the intended polarity.
    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
    }
475
    /// Set the `block_all` flag. Only the local flag is updated; no CDP command is queued.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
479
    /// Record whether the user wants request interception, then reconcile the protocol state
    /// through `update_protocol_request_interception()`.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        self.update_protocol_request_interception();
    }
484
485    pub fn set_cache_enabled(&mut self, enabled: bool) {
486        let run = self.user_cache_disabled != !enabled;
487        self.user_cache_disabled = !enabled;
488        if run {
489            self.update_protocol_cache_disabled();
490        }
491    }
492
    /// Mark fetch interception as enabled.
    ///
    /// Only sets `protocol_request_interception_enabled` to `true`; this method does not
    /// queue any CDP command itself.
    pub fn enable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
497
    /// Mark fetch interception as disabled.
    ///
    /// Only sets `protocol_request_interception_enabled` to `false`; this method does not
    /// queue any CDP command itself.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = false;
    }
502
    /// Set the cache site key (`None` clears it).
    #[cfg(feature = "_cache")]
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
508
    /// Set the cache policy (`None` clears it).
    #[cfg(feature = "_cache")]
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
514
    /// Queue a `SetCacheDisabledParams` command carrying the current `user_cache_disabled`
    /// value.
    pub fn update_protocol_cache_disabled(&mut self) {
        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
    }
518
    /// Store `credentials` for answering auth challenges and make sure interception is on.
    ///
    /// After reconciling the protocol state through `update_protocol_request_interception()`,
    /// `protocol_request_interception_enabled` is forced to `true`.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        self.update_protocol_request_interception();
        self.protocol_request_interception_enabled = true;
    }
524
525    fn update_protocol_request_interception(&mut self) {
526        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
527
528        if enabled == self.protocol_request_interception_enabled {
529            return;
530        }
531
532        if enabled {
533            self.push_cdp_request(ENABLE_FETCH.clone())
534        } else {
535            self.push_cdp_request(DisableParams::default())
536        }
537    }
538
539    /// Extract the absolute URL path portion without any domain logic.
540    ///
541    /// Example:
542    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("js/app.js?x=y")
543    #[inline]
544    fn url_path_no_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
545        let idx = url.find("//")?;
546        let pos = idx + 2;
547        let slash = url[pos..].find('/')?;
548        let base_path_index = pos + slash + 1;
549        if base_path_index < url.len() {
550            Some(&url[base_path_index..])
551        } else {
552            None
553        }
554    }
555
556    /// Blocklist-only script blocking.
557    /// Returns true only when the URL matches an explicit blocklist condition.
558    ///
559    /// IMPORTANT:
560    /// - Scripts are ALLOW-BY-DEFAULT.
561    /// - We only block when explicit blocklist signals match.
562    /// - We do NOT call ignore_script() here because ignore_script() treats absolute URLs as
563    ///   "ignored by default", which is the opposite of what we want.
564    #[inline]
565    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
566        // If analytics blocking is off, skip all analytics tries.
567        let block_analytics = self.block_analytics;
568
569        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
570        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
571        {
572            return true;
573        }
574
575        // 2) Custom website block list (explicit).
576        if crate::handler::blockers::block_websites::block_website(url) {
577            return true;
578        }
579
580        // 3) Path-based explicit tries / fallbacks.
581        //
582        // We run these on:
583        // - path with leading slash ("/js/app.js")
584        // - path without leading slash ("js/app.js")
585        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
586        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
587            // Remove query/fragment so matching stays stable.
588            let p_slash = Self::strip_query_fragment(path_with_slash);
589            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
590
591            // Basename for filename-only lists.
592            let base = match p_slash.rsplit('/').next() {
593                Some(b) => b,
594                None => p_slash,
595            };
596
597            // ---- Filename fallback (VERY fast) ----
598            // This is the behavior your test expects: block "analytics.js" anywhere in the path.
599            // (You can add more filename-only fallbacks here if needed.)
600            if block_analytics && (base == "analytics.js" || p_noslash.ends_with("/analytics.js")) {
601                return true;
602            }
603
604            // ---- Trie checks ----
605            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
606            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
607                return true;
608            }
609            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
610                return true;
611            }
612            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
613                return true;
614            }
615
616            // Base-path ignore tries (framework noise / known ignorable script paths).
617            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
618            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
619                return true;
620            }
621
622            // Style path ignores only when visuals are ignored.
623            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
624                return true;
625            }
626        }
627
628        false
629    }
630
631    /// Extract the absolute URL path portion WITH the leading slash.
632    ///
633    /// Example:
634    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
635    #[inline]
636    fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
637        // find scheme separator
638        let idx = url.find("//")?;
639        let after_slashes = idx + 2;
640
641        // find first slash after host
642        let slash_rel = url[after_slashes..].find('/')?;
643        let slash_idx = after_slashes + slash_rel;
644
645        if slash_idx < url.len() {
646            Some(&url[slash_idx..])
647        } else {
648            None
649        }
650    }
651
652    /// Strip query string and fragment from a path-ish string.
653    ///
654    /// Example:
655    /// - "/a/b.js?x=1#y" -> "/a/b.js"
656    #[inline]
657    fn strip_query_fragment(s: &str) -> &str {
658        let q = s.find('?');
659        let h = s.find('#');
660
661        match (q, h) {
662            (None, None) => s,
663            (Some(i), None) => &s[..i],
664            (None, Some(i)) => &s[..i],
665            (Some(i), Some(j)) => &s[..i.min(j)],
666        }
667    }
668
669    /// Determine if the request should be skipped.
670    #[inline]
671    fn skip_xhr(
672        &self,
673        skip_networking: bool,
674        event: &EventRequestPaused,
675        network_event: bool,
676    ) -> bool {
677        // XHR check
678        if !skip_networking && network_event {
679            let request_url = event.request.url.as_str();
680
681            // check if part of ignore scripts.
682            let skip_analytics =
683                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
684
685            if skip_analytics {
686                true
687            } else if self.block_stylesheets || self.ignore_visuals {
688                let block_css = self.block_stylesheets;
689                let block_media = self.ignore_visuals;
690
691                let mut block_request = false;
692
693                if let Some(position) = request_url.rfind('.') {
694                    let hlen = request_url.len();
695                    let has_asset = hlen - position;
696
697                    if has_asset >= 3 {
698                        let next_position = position + 1;
699
700                        if block_media
701                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
702                                &request_url[next_position..].into(),
703                            )
704                        {
705                            block_request = true;
706                        } else if block_css {
707                            block_request =
708                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
709                                    .contains(&**CSS_EXTENSION)
710                        }
711                    }
712                }
713
714                if !block_request {
715                    block_request = ignore_script_xhr_media(request_url);
716                }
717
718                block_request
719            } else {
720                skip_networking
721            }
722        } else {
723            skip_networking
724        }
725    }
726
727    #[cfg(feature = "adblock")]
728    #[inline]
729    /// Detect if ad enabled.
730    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
731        if skip_networking {
732            true
733        } else {
734            self.detect_ad(event)
735        }
736    }
737
    /// When the adblock feature is disabled, this is a no-op: `skip_networking` is returned
    /// unchanged and the event is not inspected.
    #[cfg(not(feature = "adblock"))]
    #[inline]
    fn detect_ad_if_enabled(&mut self, _event: &EventRequestPaused, skip_networking: bool) -> bool {
        skip_networking
    }
744
745    #[inline]
746    /// Fail request
747    fn fail_request_blocked(
748        &mut self,
749        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
750    ) {
751        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
752            request_id.clone(),
753            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
754        );
755        self.push_cdp_request(params);
756    }
757
758    #[inline]
759    /// Fulfill request
760    fn fulfill_request_empty_200(
761        &mut self,
762        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
763    ) {
764        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
765            request_id.clone(),
766            200,
767        );
768        self.push_cdp_request(params);
769    }
770
771    #[cfg(feature = "_cache")]
772    #[inline]
773    /// Fulfill a paused Fetch request from cached bytes + header map.
774    ///
775    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
776    fn fulfill_request_from_cache(
777        &mut self,
778        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
779        body: &[u8],
780        headers: &std::collections::HashMap<String, String>,
781        status: i64,
782    ) {
783        use crate::cdp::browser_protocol::fetch::HeaderEntry;
784        use crate::handler::network::fetch::FulfillRequestParams;
785        use base64::Engine;
786
787        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
788
789        for (k, v) in headers.iter() {
790            resp_headers.push(HeaderEntry {
791                name: k.clone().into(),
792                value: v.clone().into(),
793            });
794        }
795
796        let mut params = FulfillRequestParams::new(request_id.clone(), status);
797
798        // TODO: have this already encoded prior.
799        params.body = Some(
800            base64::engine::general_purpose::STANDARD
801                .encode(body)
802                .into(),
803        );
804
805        params.response_headers = Some(resp_headers);
806
807        self.push_cdp_request(params);
808    }
809
810    #[inline]
811    /// Continue the request url.
812    fn continue_request_with_url(
813        &mut self,
814        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
815        url: Option<&str>,
816        intercept_response: bool,
817    ) {
818        let mut params = ContinueRequestParams::new(request_id.clone());
819        if let Some(url) = url {
820            params.url = Some(url.to_string());
821            params.intercept_response = Some(intercept_response);
822        }
823        self.push_cdp_request(params);
824    }
825
    /// On fetch request paused interception.
    ///
    /// Decides what happens to a paused Fetch request. The request is either:
    /// - failed with `BlockedByClient` when `block_all` is set,
    /// - fulfilled with an empty `200` when any blocking rule marks it as skipped,
    /// - fulfilled from the session cache (feature `_cache`) when the policy allows it,
    /// - or continued, with a replacement URL when one was produced.
    ///
    /// Nothing is sent when both user-level and protocol-level interception are enabled.
    #[inline]
    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
        // NOTE(review): presumably the user-level interception handler resolves the
        // request in this case — confirm against the caller.
        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
            return;
        }

        let resource_type = &event.resource_type;

        // Global kill switch: fail everything without evaluating any other rule.
        if self.block_all {
            tracing::debug!(
                "Blocked (block_all): {:?} - {}",
                event.resource_type,
                event.request.url
            );
            return self.fail_request_blocked(&event.request_id);
        }

        // Pair this paused request with its `requestWillBeSent` event: if that event
        // was already stored, process it now; otherwise remember the interception id
        // so `on_request_will_be_sent` can pick it up later.
        if let Some(network_id) = event.network_id.as_ref() {
            if let Some(request_will_be_sent) =
                self.requests_will_be_sent.remove(network_id.as_ref())
            {
                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
            } else {
                self.request_id_to_interception_id
                    .insert(network_id.clone(), event.request_id.clone().into());
            }
        }

        // From here on, we handle the full decision tree.
        let javascript_resource = *resource_type == ResourceType::Script;
        let document_resource = *resource_type == ResourceType::Document;
        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);

        // Start with static / cheap skip checks.
        let mut skip_networking =
            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());

        // Also short-circuit if we've reloaded this document too many times.
        if !skip_networking {
            skip_networking = self.document_reload_tracker >= 3;
        }

        // Handle document redirect / masking and track xml documents.
        // This call mutates tracking state, so it runs even when the request is
        // already marked as skipped.
        let (current_url_cow, had_replacer) =
            self.handle_document_replacement_and_tracking(event, document_resource);

        let current_url: &str = current_url_cow.as_ref();

        // Main initial check (visuals, stylesheets).
        //
        // IMPORTANT: Scripts are NOT blocked here anymore.
        // Scripts are allowed by default and only blocked via explicit blocklists
        // (adblock / block_websites / intercept_manager / URL tries).
        if !skip_networking {
            // Allow XSL for sitemap XML.
            if self.xml_document && current_url.ends_with(".xsl") {
                skip_networking = false;
            } else {
                skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
            }
        }

        // Ad blocking (only active when feature = "adblock").
        skip_networking = self.detect_ad_if_enabled(event, skip_networking);

        // Ignore embedded scripts when only_html or ignore_visuals is set.
        if !skip_networking
            && (self.only_html || self.ignore_visuals)
            && (javascript_resource || document_resource)
        {
            skip_networking = ignore_script_embedded(current_url);
        }

        // Script policy: allow-by-default.
        // Block only if explicit block list patterns match.
        if !skip_networking && javascript_resource {
            skip_networking = self.should_block_script_blocklist_only(current_url);
        }

        // XHR / data resources.
        skip_networking = self.skip_xhr(skip_networking, event, network_resource);

        // Custom interception layer.
        if !skip_networking && (javascript_resource || network_resource || document_resource) {
            skip_networking = self.intercept_manager.intercept_detection(
                current_url,
                self.ignore_visuals,
                network_resource,
            );
        }

        // Custom website block list.
        if !skip_networking && (javascript_resource || network_resource) {
            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
        }

        // The two allow-list checks below can only lift a block, never add one.

        // whitelist 3rd party
        // not required unless explicit blocking.
        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
        {
            skip_networking = false;
        }

        // check if the url is in the whitelist.
        if skip_networking && self.is_whitelisted(current_url) {
            skip_networking = false;
        }

        if skip_networking {
            // Blocked requests get an empty 200 rather than a failure.
            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
            self.fulfill_request_empty_200(&event.request_id);
        } else {
            #[cfg(feature = "_cache")]
            {
                if let (Some(policy), Some(cache_site_key)) =
                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
                {
                    // Session cache entries are keyed as "<METHOD>:<URL>".
                    let current_url = format!("{}:{}", event.request.method, &current_url);

                    if let Some((res, cache_policy)) =
                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
                    {
                        if policy.allows_cached(&cache_policy) {
                            tracing::debug!(
                                "Remote Cached: {:?} - {}",
                                resource_type,
                                &current_url
                            );
                            return self.fulfill_request_from_cache(
                                &event.request_id,
                                &res.body,
                                &res.headers,
                                res.status as i64,
                            );
                        }
                    }
                }
            }

            // No cached response was used: let the request continue, passing the
            // replacement URL only when one was produced.
            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
            self.continue_request_with_url(
                &event.request_id,
                if had_replacer {
                    Some(current_url)
                } else {
                    None
                },
                !had_replacer,
            );
        }
    }
979
980    /// Shared "visuals + basic blocking" logic.
981    ///
982    /// IMPORTANT: Scripts are NOT blocked here anymore.
983    /// Scripts are allowed by default and only blocked via explicit blocklists
984    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
985    #[inline]
986    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
987        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
988            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
989    }
990
    /// Does the network manager have a target domain?
    ///
    /// Note: this checks that `document_target_url` is non-empty;
    /// `document_target_domain` itself is not consulted.
    pub fn has_target_domain(&self) -> bool {
        !self.document_target_url.is_empty()
    }
995
996    /// Set the target page url for tracking.
997    pub fn set_page_url(&mut self, page_target_url: String) {
998        let host_base = host_and_rest(&page_target_url)
999            .map(|(h, _)| base_domain_from_host(h))
1000            .unwrap_or("");
1001
1002        self.document_target_domain = host_base.to_string();
1003        self.document_target_url = page_target_url;
1004    }
1005
1006    /// Clear the initial target domain on every navigation.
1007    pub fn clear_target_domain(&mut self) {
1008        self.document_reload_tracker = 0;
1009        self.document_target_url = Default::default();
1010        self.document_target_domain = Default::default();
1011    }
1012
1013    /// Handles:
1014    /// - document reload tracking (`document_reload_tracker`)
1015    /// - redirect masking / replacement
1016    /// - xml document detection (`xml_document`)
1017    /// - `document_target_url` updates
1018    ///
1019    /// Returns (current_url, had_replacer).
1020    #[inline]
1021    fn handle_document_replacement_and_tracking<'a>(
1022        &mut self,
1023        event: &'a EventRequestPaused,
1024        document_resource: bool,
1025    ) -> (Cow<'a, str>, bool) {
1026        let mut replacer: Option<String> = None;
1027        let current_url = event.request.url.as_str();
1028
1029        if document_resource {
1030            if self.document_target_url == current_url {
1031                self.document_reload_tracker += 1;
1032            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1033            {
1034                let (http_document_replacement, mut https_document_replacement) =
1035                    if self.document_target_url.starts_with("http://") {
1036                        (
1037                            self.document_target_url.replacen("http://", "http//", 1),
1038                            self.document_target_url.replacen("http://", "https://", 1),
1039                        )
1040                    } else {
1041                        (
1042                            self.document_target_url.replacen("https://", "https//", 1),
1043                            self.document_target_url.replacen("https://", "http://", 1),
1044                        )
1045                    };
1046
1047                // Track trailing slash to restore later.
1048                let trailing = https_document_replacement.ends_with('/');
1049                if trailing {
1050                    https_document_replacement.pop();
1051                }
1052                if https_document_replacement.ends_with('/') {
1053                    https_document_replacement.pop();
1054                }
1055
1056                let redirect_mask = format!(
1057                    "{}{}",
1058                    https_document_replacement, http_document_replacement
1059                );
1060
1061                if current_url == redirect_mask {
1062                    replacer = Some(if trailing {
1063                        format!("{}/", https_document_replacement)
1064                    } else {
1065                        https_document_replacement
1066                    });
1067                }
1068            }
1069
1070            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1071                self.xml_document = true;
1072            }
1073
1074            // Track last seen document URL.
1075            self.document_target_url = event.request.url.clone();
1076            self.document_target_domain = host_and_rest(&self.document_target_url)
1077                .map(|(h, _)| base_domain_from_host(h).to_string())
1078                .unwrap_or_default();
1079        }
1080
1081        let current_url_cow = match replacer {
1082            Some(r) => Cow::Owned(r),
1083            None => Cow::Borrowed(event.request.url.as_str()),
1084        };
1085
1086        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1087        (current_url_cow, had_replacer)
1088    }
1089
1090    /// Perform a page intercept for chrome
1091    #[cfg(feature = "adblock")]
1092    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1093        use adblock::{
1094            lists::{FilterSet, ParseOptions, RuleTypes},
1095            Engine,
1096        };
1097
1098        lazy_static::lazy_static! {
1099            static ref AD_ENGINE: Engine = {
1100                let mut filter_set = FilterSet::new(false);
1101                let mut rules = ParseOptions::default();
1102                rules.rule_types = RuleTypes::All;
1103
1104                filter_set.add_filters(
1105                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1106                    rules,
1107                );
1108
1109                Engine::from_filter_set(filter_set, true)
1110            };
1111        };
1112
1113        let blockable = ResourceType::Image == event.resource_type
1114            || event.resource_type == ResourceType::Media
1115            || event.resource_type == ResourceType::Stylesheet
1116            || event.resource_type == ResourceType::Document
1117            || event.resource_type == ResourceType::Fetch
1118            || event.resource_type == ResourceType::Xhr;
1119
1120        let u = &event.request.url;
1121
1122        let block_request = blockable
1123            // set it to example.com for 3rd party handling is_same_site
1124        && {
1125            let request = adblock::request::Request::preparsed(
1126                 &u,
1127                 "example.com",
1128                 "example.com",
1129                 &event.resource_type.as_ref().to_lowercase(),
1130                 !event.request.is_same_site.unwrap_or_default());
1131
1132            AD_ENGINE.check_network_request(&request).matched
1133        };
1134
1135        block_request
1136    }
1137
1138    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1139        let response = if self
1140            .attempted_authentications
1141            .contains(event.request_id.as_ref())
1142        {
1143            AuthChallengeResponseResponse::CancelAuth
1144        } else if self.credentials.is_some() {
1145            self.attempted_authentications
1146                .insert(event.request_id.clone().into());
1147            AuthChallengeResponseResponse::ProvideCredentials
1148        } else {
1149            AuthChallengeResponseResponse::Default
1150        };
1151
1152        let mut auth = AuthChallengeResponse::new(response);
1153        if let Some(creds) = self.credentials.clone() {
1154            auth.username = Some(creds.username);
1155            auth.password = Some(creds.password);
1156        }
1157        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1158    }
1159
1160    /// Set the page offline network emulation condition.
1161    pub fn set_offline_mode(&mut self, value: bool) {
1162        if self.offline == value {
1163            return;
1164        }
1165        self.offline = value;
1166        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1167            .offline(self.offline)
1168            .latency(0)
1169            .download_throughput(-1.)
1170            .upload_throughput(-1.)
1171            .build()
1172        {
1173            self.push_cdp_request(network);
1174        }
1175    }
1176
1177    /// Request interception doesn't happen for data URLs with Network Service.
1178    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1179        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1180            if let Some(interception_id) = self
1181                .request_id_to_interception_id
1182                .remove(event.request_id.as_ref())
1183            {
1184                self.on_request(event, Some(interception_id));
1185            } else {
1186                // TODO remove the clone for event
1187                self.requests_will_be_sent
1188                    .insert(event.request_id.clone(), event.clone());
1189            }
1190        } else {
1191            self.on_request(event, None);
1192        }
1193    }
1194
1195    /// The request was served from the cache.
1196    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1197        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1198            request.from_memory_cache = true;
1199        }
1200    }
1201
1202    /// On network response received.
1203    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1204        let mut request_failed = false;
1205
1206        // Track how many bytes we actually deducted from this target.
1207        let mut deducted: u64 = 0;
1208
1209        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1210            let before = *max_bytes;
1211
1212            // encoded_data_length -> saturating cast to u64
1213            let received_bytes: u64 = event.response.encoded_data_length as u64;
1214
1215            // Safe parse of Content-Length
1216            let content_length: Option<u64> = event
1217                .response
1218                .headers
1219                .inner()
1220                .get("content-length")
1221                .and_then(|v| v.as_str())
1222                .and_then(|s| s.trim().parse::<u64>().ok());
1223
1224            // Deduct what we actually received
1225            *max_bytes = max_bytes.saturating_sub(received_bytes);
1226
1227            // If the declared size can't fit, zero out now
1228            if let Some(cl) = content_length {
1229                if cl > *max_bytes {
1230                    *max_bytes = 0;
1231                }
1232            }
1233
1234            request_failed = *max_bytes == 0;
1235
1236            // Compute exact delta deducted on this event
1237            deducted = before.saturating_sub(*max_bytes);
1238        }
1239
1240        // Bubble up the deduction (even if request continues)
1241        if deducted > 0 {
1242            self.queued_events
1243                .push_back(NetworkEvent::BytesConsumed(deducted));
1244        }
1245
1246        // block all network request moving forward.
1247        if request_failed && self.max_bytes_allowed.is_some() {
1248            self.set_block_all(true);
1249        }
1250
1251        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1252            request.set_response(event.response.clone());
1253            self.queued_events.push_back(if request_failed {
1254                NetworkEvent::RequestFailed(request)
1255            } else {
1256                NetworkEvent::RequestFinished(request)
1257            });
1258        }
1259    }
1260
1261    /// On network loading finished.
1262    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1263        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1264            if let Some(interception_id) = request.interception_id.as_ref() {
1265                self.attempted_authentications
1266                    .remove(interception_id.as_ref());
1267            }
1268            self.queued_events
1269                .push_back(NetworkEvent::RequestFinished(request));
1270        }
1271    }
1272
1273    /// On network loading failed.
1274    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1275        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1276            request.failure_text = Some(event.error_text.clone());
1277            if let Some(interception_id) = request.interception_id.as_ref() {
1278                self.attempted_authentications
1279                    .remove(interception_id.as_ref());
1280            }
1281            self.queued_events
1282                .push_back(NetworkEvent::RequestFailed(request));
1283        }
1284    }
1285
1286    /// On request will be sent.
1287    fn on_request(
1288        &mut self,
1289        event: &EventRequestWillBeSent,
1290        interception_id: Option<InterceptionId>,
1291    ) {
1292        let mut redirect_chain = Vec::new();
1293        let mut redirect_location = None;
1294
1295        if let Some(redirect_resp) = &event.redirect_response {
1296            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1297                if is_redirect_status(redirect_resp.status) {
1298                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1299                        if redirect_resp.url != location {
1300                            let fixed_location = location.replace(&redirect_resp.url, "");
1301
1302                            if !fixed_location.is_empty() {
1303                                request.response.as_mut().map(|resp| {
1304                                    resp.headers.0["Location"] =
1305                                        serde_json::Value::String(fixed_location.clone());
1306                                });
1307                            }
1308
1309                            redirect_location = Some(fixed_location);
1310                        }
1311                    }
1312                }
1313
1314                self.handle_request_redirect(
1315                    &mut request,
1316                    if let Some(redirect_location) = redirect_location {
1317                        let mut redirect_resp = redirect_resp.clone();
1318
1319                        if !redirect_location.is_empty() {
1320                            redirect_resp.headers.0["Location"] =
1321                                serde_json::Value::String(redirect_location);
1322                        }
1323
1324                        redirect_resp
1325                    } else {
1326                        redirect_resp.clone()
1327                    },
1328                );
1329
1330                redirect_chain = std::mem::take(&mut request.redirect_chain);
1331                redirect_chain.push(request);
1332            }
1333        }
1334
1335        let request = HttpRequest::new(
1336            event.request_id.clone(),
1337            event.frame_id.clone(),
1338            interception_id,
1339            self.user_request_interception_enabled,
1340            redirect_chain,
1341        );
1342
1343        self.requests.insert(event.request_id.clone(), request);
1344        self.queued_events
1345            .push_back(NetworkEvent::Request(event.request_id.clone()));
1346    }
1347
1348    /// Handle request redirect.
1349    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1350        request.set_response(response);
1351        if let Some(interception_id) = request.interception_id.as_ref() {
1352            self.attempted_authentications
1353                .remove(interception_id.as_ref());
1354        }
1355    }
1356}
1357
/// Events emitted by the network manager.
///
/// The handlers in this file push these onto `queued_events`.
#[derive(Debug)]
pub enum NetworkEvent {
    /// Send a CDP request.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// Request.
    ///
    /// Queued when a new request is registered.
    Request(RequestId),
    /// Response
    Response(RequestId),
    /// Request failed.
    ///
    /// Queued when loading fails, or when a response arrives after the byte budget is exhausted.
    RequestFailed(HttpRequest),
    /// Request finished.
    RequestFinished(HttpRequest),
    /// Bytes consumed.
    ///
    /// Carries the number of bytes deducted from the byte budget by a response.
    BytesConsumed(u64),
}
1373
#[cfg(test)]
mod tests {
    //! Unit tests for the third-party allow-list matcher and the script blocklist policy.

    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// The allow-list must match known-good third-party scripts and nothing else.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// Scripts that match no blocklist entry are allowed by default.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // A random script that should not match your block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    /// Scripts that match the ignore trie / blocklist are blocked.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }
}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs