chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18    SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21    fetch::{
22        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24    },
25    network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
42lazy_static! {
    /// Substring patterns used to build [`ALLOWED_MATCHER`].
    ///
    /// The list mixes generic script-name fragments (framework names, common
    /// bundle/entry-point names) with full URL prefixes of verified third parties.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",  // Covers Wordpress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/enterprise.js",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://captcha.px-cloud.net/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];
84
    /// Multi-pattern matcher built from `JS_FRAMEWORK_ALLOW`, used to decide by
    /// name whether a script should be rendered in the browser.
    ///
    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
    /// but we keep it for compatibility and other call sites.
    ///
    /// Initialization panics with "matcher to build" if the automaton cannot be constructed.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
90
91    /// General patterns for popular libraries and resources
92    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
93        // Verified 3rd parties for request
94        "https://m.stripe.network/",
95        "https://challenges.cloudflare.com/",
96        "https://www.google.com/recaptcha/api.js",
97        "https://google.com/recaptcha/api.js",
98        "https://www.google.com/recaptcha/enterprise.js",
99        "https://js.stripe.com/",
100        "https://cdn.prod.website-files.com/", // webflow cdn scripts
101        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
102        "https://code.jquery.com/jquery-",
103        "https://ct.captcha-delivery.com/",
104        "https://geo.captcha-delivery.com/captcha/",
105        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
106        "https://ct.captcha-delivery.com/",
107        "https://cdn.auth0.com/client",
108        "https://captcha.px-cloud.net/",
109        "https://www.gstatic.com/recaptcha/",
110        "https://www.google.com/recaptcha/api2/",
111        "https://www.recaptcha.net/recaptcha/",
112        "https://js.hcaptcha.com/1/api.js",
113        "https://hcaptcha.com/1/api.js",
114        "https://js.datadome.co/tags.js",
115        "https://api-js.datadome.co/",
116        "https://client.perimeterx.net/",
117        "https://captcha.px-cdn.net/",
118        "https://captcha.px-cloud.net/",
119        "https://s.perimeterx.net/",
120        "https://client-api.arkoselabs.com/v2/",
121        "https://static.geetest.com/v4/gt4.js",
122        "https://static.geetest.com/",
123        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
124        "https://cdn.perfdrive.com/aperture/",
125        "https://assets.queue-it.net/",
126        "discourse-cdn.com/",
127        "/cdn-cgi/challenge-platform/",
128        "/_Incapsula_Resource"
129    ];
130
    /// Multi-pattern matcher built from `JS_FRAMEWORK_ALLOW_3RD_PARTY`, used to
    /// decide whether a third-party script should be rendered in the browser.
    ///
    /// Initialization panics with "matcher to build" if the automaton cannot be constructed.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
133
    /// Path fragments that identify JS-framework build output.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Allowed framework asset paths (Astro and SvelteKit-style build directories).
            // NOTE(review): the original comment referred to a `JS_FRAMEWORK_ASSETS` list that
            // is not visible in this part of the file — confirm it still exists.
            "_astro/", "_app/immutable"
        }
    };
141
    /// MIME content types to ignore: documents, archives, images, video and audio.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };
170
    /// Resource-type names treated as purely visual content (images, media, fonts).
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };
177
    /// Resource-type names for background networking that can be ignored
    /// (CSP violation reports, manifests, prefetches, pings and "Other").
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Manifest",
        "Other",
        "Prefetch",
        "Ping",
    };
186
    /// The `css` file extension as a case-insensitive string, for case-insensitive CSS matching.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
189
190    /// The command chain.
191    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
192        let enable = EnableParams::default();
193
194        if let Ok(c) = serde_json::to_value(&enable) {
195            vec![(enable.identifier(), c)]
196        } else {
197            vec![]
198        }
199    };
200
201    /// The command chain with https ignore.
202    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
203        let enable = EnableParams::default();
204        let mut v = vec![];
205        if let Ok(c) = serde_json::to_value(&enable) {
206            v.push((enable.identifier(), c));
207        }
208        let ignore = SetIgnoreCertificateErrorsParams::new(true);
209        if let Ok(ignored) = serde_json::to_value(&ignore) {
210            v.push((ignore.identifier(), ignored));
211        }
212
213        v
214    };
215
    /// Prebuilt `Fetch.enable` parameters used to turn on request interception.
    ///
    /// Auth challenges are handled (`handle_auth_requests(true)`) and every URL (`"*"`)
    /// is intercepted at the request stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
        .build()
    };
223}
224
/// Returns `true` when `status` is one of the HTTP redirect codes
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
229
/// The base network manager.
///
/// Tracks in-flight requests, queues CDP commands / lifecycle events for the caller to
/// drain via `poll()`, and holds the blocking configuration (analytics, visuals,
/// stylesheets, whitelist / blacklist patterns).
#[derive(Debug)]
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    // TODO put event in an Arc?
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; interception is actually enabled/disabled by
    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is currently enabled in CDP.
    ///
    /// This is the "actual state" flag that reflects whether we have sent `Fetch.enable` or
    /// `Fetch.disable` to the browser. It is updated by `update_protocol_request_interception()`
    /// when `user_request_interception_enabled` or `credentials` change.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Whether the network is flagged as offline.
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    /// Ignore visuals (no pings, prefetching, etc.).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics from rendering.
    pub block_analytics: bool,
    /// Only allow HTML to load.
    pub only_html: bool,
    /// Whether the document is an XML document.
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the number of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    /// The cache site_key to use.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
    whitelist_matcher: Option<AhoCorasick>,
    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
    blacklist_patterns: Vec<String>,
    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
    blacklist_matcher: Option<AhoCorasick>,
    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
    blacklist_strict: bool,
}
352
353impl NetworkManager {
354    /// A new network manager.
355    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
356        Self {
357            queued_events: Default::default(),
358            ignore_httpserrors,
359            requests: Default::default(),
360            requests_will_be_sent: Default::default(),
361            extra_headers: Default::default(),
362            request_id_to_interception_id: Default::default(),
363            user_cache_disabled: false,
364            attempted_authentications: Default::default(),
365            credentials: None,
366            block_all: false,
367            user_request_interception_enabled: false,
368            protocol_request_interception_enabled: false,
369            offline: false,
370            request_timeout,
371            ignore_visuals: false,
372            block_javascript: false,
373            block_stylesheets: false,
374            block_analytics: true,
375            only_html: false,
376            xml_document: false,
377            intercept_manager: NetworkInterceptManager::Unknown,
378            document_reload_tracker: 0,
379            document_target_url: String::new(),
380            document_target_domain: String::new(),
381            whitelist_patterns: Vec::new(),
382            whitelist_matcher: None,
383            blacklist_patterns: Vec::new(),
384            blacklist_matcher: None,
385            blacklist_strict: true,
386            max_bytes_allowed: None,
387            #[cfg(feature = "_cache")]
388            cache_site_key: None,
389            #[cfg(feature = "_cache")]
390            cache_policy: None,
391        }
392    }
393
394    /// Replace the whitelist patterns (compiled once).
395    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
396    where
397        I: IntoIterator<Item = S>,
398        S: Into<String>,
399    {
400        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
401        self.rebuild_whitelist_matcher();
402    }
403
404    /// Replace the blacklist patterns (compiled once).
405    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
406    where
407        I: IntoIterator<Item = S>,
408        S: Into<String>,
409    {
410        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
411        self.rebuild_blacklist_matcher();
412    }
413
    /// Append a single blacklist pattern and rebuild the matcher.
    ///
    /// Pushing the pattern is cheap, but the matcher is recompiled from all patterns on
    /// every call, so prefer `add_blacklist_patterns` when adding several at once.
    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
        self.blacklist_patterns.push(pattern.into());
        self.rebuild_blacklist_matcher();
    }
419
420    /// Add many patterns and rebuild once.
421    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
422    where
423        I: IntoIterator<Item = S>,
424        S: Into<String>,
425    {
426        self.blacklist_patterns
427            .extend(patterns.into_iter().map(Into::into));
428        self.rebuild_blacklist_matcher();
429    }
430
    /// Clear the blacklist entirely: removes every pattern and drops the compiled matcher,
    /// so `is_blacklisted` returns `false` for all URLs afterwards.
    pub fn clear_blacklist(&mut self) {
        self.blacklist_patterns.clear();
        self.blacklist_matcher = None;
    }
436
    /// Control precedence between the lists: when `strict` is `true`, the blacklist always wins.
    pub fn set_blacklist_strict(&mut self, strict: bool) {
        self.blacklist_strict = strict;
    }
441
442    #[inline]
443    fn rebuild_blacklist_matcher(&mut self) {
444        if self.blacklist_patterns.is_empty() {
445            self.blacklist_matcher = None;
446            return;
447        }
448
449        let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
450        self.blacklist_matcher = AhoCorasick::new(refs).ok();
451    }
452
453    #[inline]
454    fn is_blacklisted(&self, url: &str) -> bool {
455        self.blacklist_matcher
456            .as_ref()
457            .map(|m| m.is_match(url))
458            .unwrap_or(false)
459    }
460
    /// Append a single whitelist pattern and rebuild the matcher.
    ///
    /// Pushing the pattern is cheap, but the matcher is recompiled from all patterns on
    /// every call, so prefer `add_whitelist_patterns` when adding several at once.
    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
        self.whitelist_patterns.push(pattern.into());
        self.rebuild_whitelist_matcher();
    }
466
467    /// Add many patterns and rebuild once.
468    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
469    where
470        I: IntoIterator<Item = S>,
471        S: Into<String>,
472    {
473        self.whitelist_patterns
474            .extend(patterns.into_iter().map(Into::into));
475        self.rebuild_whitelist_matcher();
476    }
477
478    #[inline]
479    fn rebuild_whitelist_matcher(&mut self) {
480        if self.whitelist_patterns.is_empty() {
481            self.whitelist_matcher = None;
482            return;
483        }
484
485        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
486
487        // If building fails (shouldn’t for simple patterns), just disable matcher.
488        self.whitelist_matcher = AhoCorasick::new(refs).ok();
489    }
490
491    #[inline]
492    fn is_whitelisted(&self, url: &str) -> bool {
493        self.whitelist_matcher
494            .as_ref()
495            .map(|m| m.is_match(url))
496            .unwrap_or(false)
497    }
498
499    /// Commands to init the chain with.
500    pub fn init_commands(&self) -> CommandChain {
501        let cmds = if self.ignore_httpserrors {
502            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
503        } else {
504            INIT_CHAIN.clone()
505        };
506        CommandChain::new(cmds, self.request_timeout)
507    }
508
509    /// Push the CDP request.
510    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
511        let method = cmd.identifier();
512        if let Ok(params) = serde_json::to_value(cmd) {
513            self.queued_events
514                .push_back(NetworkEvent::SendCdpRequest((method, params)));
515        }
516    }
517
    /// Pop the next queued event in FIFO order, or `None` when the queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
522
    /// Borrow the extra HTTP headers currently stored on the manager.
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
527
528    /// Set extra HTTP headers.
529    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
530        self.extra_headers = headers;
531        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
532        self.extra_headers.remove("Proxy-Authorization");
533        if !self.extra_headers.is_empty() {
534            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
535                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
536            }
537        }
538    }
539
    /// Queue a `SetBypassServiceWorkerParams` command.
    ///
    /// NOTE: despite the method name, the argument is forwarded unchanged as the
    /// *bypass* flag, so passing `true` asks the browser to bypass service workers.
    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
    }
543
    /// Set the `block_all` kill-switch flag. Only the flag is stored; no CDP command is queued.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
547
    /// Record whether the user wants request interception, then reconcile the
    /// protocol-level interception state with that intent.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        self.update_protocol_request_interception();
    }
552
553    pub fn set_cache_enabled(&mut self, enabled: bool) {
554        let run = self.user_cache_disabled != !enabled;
555        self.user_cache_disabled = !enabled;
556        if run {
557            self.update_protocol_cache_disabled();
558        }
559    }
560
    /// Mark fetch interception as enabled at the protocol level.
    ///
    /// Only the local flag changes; this method does not queue any CDP command.
    pub fn enable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
565
    /// Mark fetch interception as disabled at the protocol level.
    ///
    /// Only the local flag changes; this method does not queue any CDP command.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = false;
    }
570
    /// Set the cache site key (`None` clears it). Available with the `_cache` feature.
    #[cfg(feature = "_cache")]
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
576
    /// Set the cache policy (`None` clears it). Available with the `_cache` feature.
    #[cfg(feature = "_cache")]
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
582
    /// Queue a `SetCacheDisabledParams` command carrying the current `user_cache_disabled` value.
    pub fn update_protocol_cache_disabled(&mut self) {
        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
    }
586
    /// Store credentials for answering auth challenges and make sure interception is on.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        // Must run before the flag below is set: the reconcile step is a no-op when the
        // protocol flag already matches the desired state.
        self.update_protocol_request_interception();
        // With credentials present interception is required, so record it as active.
        self.protocol_request_interception_enabled = true;
    }
592
593    fn update_protocol_request_interception(&mut self) {
594        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
595
596        if enabled == self.protocol_request_interception_enabled {
597            return;
598        }
599
600        if enabled {
601            self.push_cdp_request(ENABLE_FETCH.clone())
602        } else {
603            self.push_cdp_request(DisableParams::default())
604        }
605    }
606
607    /// Blocklist-only script blocking.
608    /// Returns true only when the URL matches an explicit blocklist condition.
609    #[inline]
610    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
611        // If analytics blocking is off, skip all analytics tries.
612        let block_analytics = self.block_analytics;
613
614        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
615        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
616        {
617            return true;
618        }
619
620        // 2) Custom website block list (explicit).
621        if crate::handler::blockers::block_websites::block_website(url) {
622            return true;
623        }
624
625        // 3) Path-based explicit tries / fallbacks.
626        //
627        // We run these on:
628        // - path with leading slash ("/js/app.js")
629        // - path without leading slash ("js/app.js")
630        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
631        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
632            // Remove query/fragment so matching stays stable.
633            let p_slash = Self::strip_query_fragment(path_with_slash);
634            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
635
636            // Basename for filename-only lists.
637            let base = match p_slash.rsplit('/').next() {
638                Some(b) => b,
639                None => p_slash,
640            };
641
642            // ---- Trie checks ----
643            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
644            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
645                return true;
646            }
647            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
648                return true;
649            }
650            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
651                return true;
652            }
653
654            // Base-path ignore tries (framework noise / known ignorable script paths).
655            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
656            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
657                return true;
658            }
659
660            // Style path ignores only when visuals are ignored.
661            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
662                return true;
663            }
664        }
665
666        false
667    }
668
669    /// Extract the absolute URL path portion WITH the leading slash.
670    ///
671    /// Example:
672    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
673    #[inline]
674    fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
675        // find scheme separator
676        let idx = url.find("//")?;
677        let after_slashes = idx + 2;
678
679        // find first slash after host
680        let slash_rel = url[after_slashes..].find('/')?;
681        let slash_idx = after_slashes + slash_rel;
682
683        if slash_idx < url.len() {
684            Some(&url[slash_idx..])
685        } else {
686            None
687        }
688    }
689
690    /// Strip query string and fragment from a path-ish string.
691    ///
692    /// Example:
693    /// - "/a/b.js?x=1#y" -> "/a/b.js"
694    #[inline]
695    fn strip_query_fragment(s: &str) -> &str {
696        let q = s.find('?');
697        let h = s.find('#');
698
699        match (q, h) {
700            (None, None) => s,
701            (Some(i), None) => &s[..i],
702            (None, Some(i)) => &s[..i],
703            (Some(i), Some(j)) => &s[..i.min(j)],
704        }
705    }
706
707    /// Determine if the request should be skipped.
708    #[inline]
709    fn skip_xhr(
710        &self,
711        skip_networking: bool,
712        event: &EventRequestPaused,
713        network_event: bool,
714    ) -> bool {
715        // XHR check
716        if !skip_networking && network_event {
717            let request_url = event.request.url.as_str();
718
719            // check if part of ignore scripts.
720            let skip_analytics =
721                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
722
723            if skip_analytics {
724                true
725            } else if self.block_stylesheets || self.ignore_visuals {
726                let block_css = self.block_stylesheets;
727                let block_media = self.ignore_visuals;
728
729                let mut block_request = false;
730
731                if let Some(position) = request_url.rfind('.') {
732                    let hlen = request_url.len();
733                    let has_asset = hlen - position;
734
735                    if has_asset >= 3 {
736                        let next_position = position + 1;
737
738                        if block_media
739                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
740                                &request_url[next_position..].into(),
741                            )
742                        {
743                            block_request = true;
744                        } else if block_css {
745                            block_request =
746                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
747                                    .contains(&**CSS_EXTENSION)
748                        }
749                    }
750                }
751
752                if !block_request {
753                    block_request = ignore_script_xhr_media(request_url);
754                }
755
756                block_request
757            } else {
758                skip_networking
759            }
760        } else {
761            skip_networking
762        }
763    }
764
765    #[cfg(feature = "adblock")]
766    #[inline]
767    /// Detect if ad enabled.
768    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
769        if skip_networking {
770            true
771        } else {
772            self.detect_ad(event)
773        }
774    }
775
776    /// When adblock feature is disabled, this is a no-op.
777    #[cfg(not(feature = "adblock"))]
778    #[inline]
779    fn detect_ad_if_enabled(&mut self, _event: &EventRequestPaused, skip_networking: bool) -> bool {
780        skip_networking
781    }
782
783    #[inline]
784    /// Fail request
785    fn fail_request_blocked(
786        &mut self,
787        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
788    ) {
789        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
790            request_id.clone(),
791            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
792        );
793        self.push_cdp_request(params);
794    }
795
796    #[inline]
797    /// Fulfill request
798    fn fulfill_request_empty_200(
799        &mut self,
800        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
801    ) {
802        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
803            request_id.clone(),
804            200,
805        );
806        self.push_cdp_request(params);
807    }
808
809    #[cfg(feature = "_cache")]
810    #[inline]
811    /// Fulfill a paused Fetch request from cached bytes + header map.
812    ///
813    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
814    fn fulfill_request_from_cache(
815        &mut self,
816        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
817        body: &[u8],
818        headers: &std::collections::HashMap<String, String>,
819        status: i64,
820    ) {
821        use crate::cdp::browser_protocol::fetch::HeaderEntry;
822        use crate::handler::network::fetch::FulfillRequestParams;
823        use base64::Engine;
824
825        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
826
827        for (k, v) in headers.iter() {
828            resp_headers.push(HeaderEntry {
829                name: k.clone().into(),
830                value: v.clone().into(),
831            });
832        }
833
834        let mut params = FulfillRequestParams::new(request_id.clone(), status);
835
836        // TODO: have this already encoded prior.
837        params.body = Some(
838            base64::engine::general_purpose::STANDARD
839                .encode(body)
840                .into(),
841        );
842
843        params.response_headers = Some(resp_headers);
844
845        self.push_cdp_request(params);
846    }
847
848    #[inline]
849    /// Continue the request url.
850    fn continue_request_with_url(
851        &mut self,
852        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
853        url: Option<&str>,
854        intercept_response: bool,
855    ) {
856        let mut params = ContinueRequestParams::new(request_id.clone());
857        if let Some(url) = url {
858            params.url = Some(url.to_string());
859            params.intercept_response = Some(intercept_response);
860        }
861        self.push_cdp_request(params);
862    }
863
864    /// On fetch request paused interception.
865    #[inline]
866    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
867        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
868            return;
869        }
870
871        let resource_type = &event.resource_type;
872
873        if self.block_all {
874            tracing::debug!(
875                "Blocked (block_all): {:?} - {}",
876                event.resource_type,
877                event.request.url
878            );
879            return self.fail_request_blocked(&event.request_id);
880        }
881
882        if let Some(network_id) = event.network_id.as_ref() {
883            if let Some(request_will_be_sent) =
884                self.requests_will_be_sent.remove(network_id.as_ref())
885            {
886                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
887            } else {
888                self.request_id_to_interception_id
889                    .insert(network_id.clone(), event.request_id.clone().into());
890            }
891        }
892
893        // From here on, we handle the full decision tree.
894        let javascript_resource = *resource_type == ResourceType::Script;
895        let document_resource = *resource_type == ResourceType::Document;
896        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
897
898        // Start with static / cheap skip checks.
899        let mut skip_networking =
900            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
901
902        // Also short-circuit if we've reloaded this document too many times.
903        if !skip_networking {
904            skip_networking = self.document_reload_tracker >= 3;
905        }
906
907        // Handle document redirect / masking and track xml documents.
908        let (current_url_cow, had_replacer) =
909            self.handle_document_replacement_and_tracking(event, document_resource);
910
911        let current_url: &str = current_url_cow.as_ref();
912
913        let blacklisted = self.is_blacklisted(current_url);
914
915        if !self.blacklist_strict && blacklisted {
916            skip_networking = true;
917        }
918
919        if !skip_networking {
920            // Allow XSL for sitemap XML.
921            if self.xml_document && current_url.ends_with(".xsl") {
922                skip_networking = false;
923            } else {
924                skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
925            }
926        }
927
928        // Ad blocking (only active when feature = "adblock").
929        skip_networking = self.detect_ad_if_enabled(event, skip_networking);
930
931        // Ignore embedded scripts when only_html or ignore_visuals is set.
932        if !skip_networking
933            && self.block_javascript
934            && (self.only_html || self.ignore_visuals)
935            && (javascript_resource || document_resource)
936        {
937            skip_networking = ignore_script_embedded(current_url);
938        }
939
940        // Script policy: allow-by-default.
941        // Block only if explicit block list patterns match.
942        if !skip_networking && javascript_resource {
943            skip_networking = self.should_block_script_blocklist_only(current_url);
944        }
945
946        // XHR / data resources.
947        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
948
949        // Custom interception layer.
950        if !skip_networking && (javascript_resource || network_resource || document_resource) {
951            skip_networking = self.intercept_manager.intercept_detection(
952                current_url,
953                self.ignore_visuals,
954                network_resource,
955            );
956        }
957
958        // Custom website block list.
959        if !skip_networking && (javascript_resource || network_resource) {
960            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
961        }
962
963        // whitelist 3rd party
964        // not required unless explicit blocking.
965        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
966        {
967            skip_networking = false;
968        }
969
970        // check if the url is in the whitelist.
971        if skip_networking && self.is_whitelisted(current_url) {
972            skip_networking = false;
973        }
974
975        if self.blacklist_strict && blacklisted {
976            skip_networking = true;
977        }
978
979        if skip_networking {
980            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
981            self.fulfill_request_empty_200(&event.request_id);
982        } else {
983            #[cfg(feature = "_cache")]
984            {
985                if let (Some(policy), Some(cache_site_key)) =
986                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
987                {
988                    let current_url = format!("{}:{}", event.request.method, &current_url);
989
990                    if let Some((res, cache_policy)) =
991                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
992                    {
993                        if policy.allows_cached(&cache_policy) {
994                            tracing::debug!(
995                                "Remote Cached: {:?} - {}",
996                                resource_type,
997                                &current_url
998                            );
999                            return self.fulfill_request_from_cache(
1000                                &event.request_id,
1001                                &res.body,
1002                                &res.headers,
1003                                res.status as i64,
1004                            );
1005                        }
1006                    }
1007                }
1008            }
1009
1010            // check our frame cache for the run.
1011            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
1012            self.continue_request_with_url(
1013                &event.request_id,
1014                if had_replacer {
1015                    Some(current_url)
1016                } else {
1017                    None
1018                },
1019                !had_replacer,
1020            );
1021        }
1022    }
1023
1024    /// Shared "visuals + basic blocking" logic.
1025    ///
1026    /// IMPORTANT: Scripts are NOT blocked here anymore.
1027    /// Scripts are allowed by default and only blocked via explicit blocklists
1028    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1029    #[inline]
1030    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1031        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1032            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1033    }
1034
1035    /// Does the network manager have a target domain?
1036    pub fn has_target_domain(&self) -> bool {
1037        !self.document_target_url.is_empty()
1038    }
1039
1040    /// Set the target page url for tracking.
1041    pub fn set_page_url(&mut self, page_target_url: String) {
1042        let host_base = host_and_rest(&page_target_url)
1043            .map(|(h, _)| base_domain_from_host(h))
1044            .unwrap_or("");
1045
1046        self.document_target_domain = host_base.to_string();
1047        self.document_target_url = page_target_url;
1048    }
1049
1050    /// Clear the initial target domain on every navigation.
1051    pub fn clear_target_domain(&mut self) {
1052        self.document_reload_tracker = 0;
1053        self.document_target_url = Default::default();
1054        self.document_target_domain = Default::default();
1055    }
1056
1057    /// Handles:
1058    /// - document reload tracking (`document_reload_tracker`)
1059    /// - redirect masking / replacement
1060    /// - xml document detection (`xml_document`)
1061    /// - `document_target_url` updates
1062    ///
1063    /// Returns (current_url, had_replacer).
1064    #[inline]
1065    fn handle_document_replacement_and_tracking<'a>(
1066        &mut self,
1067        event: &'a EventRequestPaused,
1068        document_resource: bool,
1069    ) -> (Cow<'a, str>, bool) {
1070        let mut replacer: Option<String> = None;
1071        let current_url = event.request.url.as_str();
1072
1073        if document_resource {
1074            if self.document_target_url == current_url {
1075                self.document_reload_tracker += 1;
1076            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1077            {
1078                let (http_document_replacement, mut https_document_replacement) =
1079                    if self.document_target_url.starts_with("http://") {
1080                        (
1081                            self.document_target_url.replacen("http://", "http//", 1),
1082                            self.document_target_url.replacen("http://", "https://", 1),
1083                        )
1084                    } else {
1085                        (
1086                            self.document_target_url.replacen("https://", "https//", 1),
1087                            self.document_target_url.replacen("https://", "http://", 1),
1088                        )
1089                    };
1090
1091                // Track trailing slash to restore later.
1092                let trailing = https_document_replacement.ends_with('/');
1093                if trailing {
1094                    https_document_replacement.pop();
1095                }
1096                if https_document_replacement.ends_with('/') {
1097                    https_document_replacement.pop();
1098                }
1099
1100                let redirect_mask = format!(
1101                    "{}{}",
1102                    https_document_replacement, http_document_replacement
1103                );
1104
1105                if current_url == redirect_mask {
1106                    replacer = Some(if trailing {
1107                        format!("{}/", https_document_replacement)
1108                    } else {
1109                        https_document_replacement
1110                    });
1111                }
1112            }
1113
1114            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1115                self.xml_document = true;
1116            }
1117
1118            // Track last seen document URL.
1119            self.document_target_url = event.request.url.clone();
1120            self.document_target_domain = host_and_rest(&self.document_target_url)
1121                .map(|(h, _)| base_domain_from_host(h).to_string())
1122                .unwrap_or_default();
1123        }
1124
1125        let current_url_cow = match replacer {
1126            Some(r) => Cow::Owned(r),
1127            None => Cow::Borrowed(event.request.url.as_str()),
1128        };
1129
1130        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1131        (current_url_cow, had_replacer)
1132    }
1133
1134    /// Perform a page intercept for chrome
1135    #[cfg(feature = "adblock")]
1136    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1137        use adblock::{
1138            lists::{FilterSet, ParseOptions, RuleTypes},
1139            Engine,
1140        };
1141
1142        lazy_static::lazy_static! {
1143            static ref AD_ENGINE: Engine = {
1144                let mut filter_set = FilterSet::new(false);
1145                let mut rules = ParseOptions::default();
1146                rules.rule_types = RuleTypes::All;
1147
1148                filter_set.add_filters(
1149                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1150                    rules,
1151                );
1152
1153                Engine::from_filter_set(filter_set, true)
1154            };
1155        };
1156
1157        let blockable = ResourceType::Image == event.resource_type
1158            || event.resource_type == ResourceType::Media
1159            || event.resource_type == ResourceType::Stylesheet
1160            || event.resource_type == ResourceType::Document
1161            || event.resource_type == ResourceType::Fetch
1162            || event.resource_type == ResourceType::Xhr;
1163
1164        let u = &event.request.url;
1165
1166        let block_request = blockable
1167            // set it to example.com for 3rd party handling is_same_site
1168        && {
1169            let request = adblock::request::Request::preparsed(
1170                 &u,
1171                 "example.com",
1172                 "example.com",
1173                 &event.resource_type.as_ref().to_lowercase(),
1174                 !event.request.is_same_site.unwrap_or_default());
1175
1176            AD_ENGINE.check_network_request(&request).matched
1177        };
1178
1179        block_request
1180    }
1181
1182    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1183        let response = if self
1184            .attempted_authentications
1185            .contains(event.request_id.as_ref())
1186        {
1187            AuthChallengeResponseResponse::CancelAuth
1188        } else if self.credentials.is_some() {
1189            self.attempted_authentications
1190                .insert(event.request_id.clone().into());
1191            AuthChallengeResponseResponse::ProvideCredentials
1192        } else {
1193            AuthChallengeResponseResponse::Default
1194        };
1195
1196        let mut auth = AuthChallengeResponse::new(response);
1197        if let Some(creds) = self.credentials.clone() {
1198            auth.username = Some(creds.username);
1199            auth.password = Some(creds.password);
1200        }
1201        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1202    }
1203
1204    /// Set the page offline network emulation condition.
1205    pub fn set_offline_mode(&mut self, value: bool) {
1206        if self.offline == value {
1207            return;
1208        }
1209        self.offline = value;
1210        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1211            .offline(self.offline)
1212            .latency(0)
1213            .download_throughput(-1.)
1214            .upload_throughput(-1.)
1215            .build()
1216        {
1217            self.push_cdp_request(network);
1218        }
1219    }
1220
1221    /// Request interception doesn't happen for data URLs with Network Service.
1222    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1223        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1224            if let Some(interception_id) = self
1225                .request_id_to_interception_id
1226                .remove(event.request_id.as_ref())
1227            {
1228                self.on_request(event, Some(interception_id));
1229            } else {
1230                // TODO remove the clone for event
1231                self.requests_will_be_sent
1232                    .insert(event.request_id.clone(), event.clone());
1233            }
1234        } else {
1235            self.on_request(event, None);
1236        }
1237    }
1238
1239    /// The request was served from the cache.
1240    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1241        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1242            request.from_memory_cache = true;
1243        }
1244    }
1245
1246    /// On network response received.
1247    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1248        let mut request_failed = false;
1249
1250        // Track how many bytes we actually deducted from this target.
1251        let mut deducted: u64 = 0;
1252
1253        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1254            let before = *max_bytes;
1255
1256            // encoded_data_length -> saturating cast to u64
1257            let received_bytes: u64 = event.response.encoded_data_length as u64;
1258
1259            // Safe parse of Content-Length
1260            let content_length: Option<u64> = event
1261                .response
1262                .headers
1263                .inner()
1264                .get("content-length")
1265                .and_then(|v| v.as_str())
1266                .and_then(|s| s.trim().parse::<u64>().ok());
1267
1268            // Deduct what we actually received
1269            *max_bytes = max_bytes.saturating_sub(received_bytes);
1270
1271            // If the declared size can't fit, zero out now
1272            if let Some(cl) = content_length {
1273                if cl > *max_bytes {
1274                    *max_bytes = 0;
1275                }
1276            }
1277
1278            request_failed = *max_bytes == 0;
1279
1280            // Compute exact delta deducted on this event
1281            deducted = before.saturating_sub(*max_bytes);
1282        }
1283
1284        // Bubble up the deduction (even if request continues)
1285        if deducted > 0 {
1286            self.queued_events
1287                .push_back(NetworkEvent::BytesConsumed(deducted));
1288        }
1289
1290        // block all network request moving forward.
1291        if request_failed && self.max_bytes_allowed.is_some() {
1292            self.set_block_all(true);
1293        }
1294
1295        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1296            request.set_response(event.response.clone());
1297            self.queued_events.push_back(if request_failed {
1298                NetworkEvent::RequestFailed(request)
1299            } else {
1300                NetworkEvent::RequestFinished(request)
1301            });
1302        }
1303    }
1304
1305    /// On network loading finished.
1306    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1307        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1308            if let Some(interception_id) = request.interception_id.as_ref() {
1309                self.attempted_authentications
1310                    .remove(interception_id.as_ref());
1311            }
1312            self.queued_events
1313                .push_back(NetworkEvent::RequestFinished(request));
1314        }
1315    }
1316
1317    /// On network loading failed.
1318    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1319        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1320            request.failure_text = Some(event.error_text.clone());
1321            if let Some(interception_id) = request.interception_id.as_ref() {
1322                self.attempted_authentications
1323                    .remove(interception_id.as_ref());
1324            }
1325            self.queued_events
1326                .push_back(NetworkEvent::RequestFailed(request));
1327        }
1328    }
1329
1330    /// On request will be sent.
1331    fn on_request(
1332        &mut self,
1333        event: &EventRequestWillBeSent,
1334        interception_id: Option<InterceptionId>,
1335    ) {
1336        let mut redirect_chain = Vec::new();
1337        let mut redirect_location = None;
1338
1339        if let Some(redirect_resp) = &event.redirect_response {
1340            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1341                if is_redirect_status(redirect_resp.status) {
1342                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1343                        if redirect_resp.url != location {
1344                            let fixed_location = location.replace(&redirect_resp.url, "");
1345
1346                            if !fixed_location.is_empty() {
1347                                request.response.as_mut().map(|resp| {
1348                                    resp.headers.0["Location"] =
1349                                        serde_json::Value::String(fixed_location.clone());
1350                                });
1351                            }
1352
1353                            redirect_location = Some(fixed_location);
1354                        }
1355                    }
1356                }
1357
1358                self.handle_request_redirect(
1359                    &mut request,
1360                    if let Some(redirect_location) = redirect_location {
1361                        let mut redirect_resp = redirect_resp.clone();
1362
1363                        if !redirect_location.is_empty() {
1364                            redirect_resp.headers.0["Location"] =
1365                                serde_json::Value::String(redirect_location);
1366                        }
1367
1368                        redirect_resp
1369                    } else {
1370                        redirect_resp.clone()
1371                    },
1372                );
1373
1374                redirect_chain = std::mem::take(&mut request.redirect_chain);
1375                redirect_chain.push(request);
1376            }
1377        }
1378
1379        let request = HttpRequest::new(
1380            event.request_id.clone(),
1381            event.frame_id.clone(),
1382            interception_id,
1383            self.user_request_interception_enabled,
1384            redirect_chain,
1385        );
1386
1387        self.requests.insert(event.request_id.clone(), request);
1388        self.queued_events
1389            .push_back(NetworkEvent::Request(event.request_id.clone()));
1390    }
1391
1392    /// Handle request redirect.
1393    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1394        request.set_response(response);
1395        if let Some(interception_id) = request.interception_id.as_ref() {
1396            self.attempted_authentications
1397                .remove(interception_id.as_ref());
1398        }
1399    }
1400}
1401
1402#[derive(Debug)]
1403pub enum NetworkEvent {
1404    /// Send a CDP request.
1405    SendCdpRequest((MethodId, serde_json::Value)),
1406    /// Request.
1407    Request(RequestId),
1408    /// Response
1409    Response(RequestId),
1410    /// Request failed.
1411    RequestFailed(HttpRequest),
1412    /// Request finished.
1413    RequestFinished(HttpRequest),
1414    /// Bytes consumed.
1415    BytesConsumed(u64),
1416}
1417
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// Page used by the script policy tests.
    const FORUM_PAGE: &str = "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085";

    /// Build a manager with `NetworkManager::new(false, 30s)` and `page_url`
    /// set as its page url, the setup shared by the tests below.
    fn manager_for(page_url: &str) -> NetworkManager {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(page_url.to_string());
        nm
    }

    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        // A random script that should not match your block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !manager_for(FORUM_PAGE).should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            manager_for(FORUM_PAGE).should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = manager_for("https://example.com/");

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = manager_for("https://example.com/");

        // Same URL in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        // In strict mode, it should still be considered blocked at decision time.
        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
        assert!(nm.blacklist_strict);
    }

    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = manager_for("https://example.com/");

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs