chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18    SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21    fetch::{
22        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24    },
25    network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
42lazy_static! {
    /// Patterns for script URLs that are generally allowed: name fragments of popular
    /// libraries / bundles, a few well-known paths, and verified third-party URL prefixes.
    ///
    /// The list is compiled into [`ALLOWED_MATCHER`].
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",  // Covers Wordpress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/",
        "https://google.com/recaptcha/api.js",
        "https://www.gstatic.com/recaptcha/",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://api.leminnow.com/captcha/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://client-api.arkoselabs.com/",
        "https://www.capy.me/puzzle/",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];
89
    /// Aho-Corasick matcher built from `JS_FRAMEWORK_ALLOW`; used to decide by URL/name
    /// whether a script should be rendered in the browser.
    ///
    /// Building panics (via `expect`) on first access if the automaton cannot be constructed.
    ///
    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
    /// but we keep it for compatibility and other call sites.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
95
    /// Patterns for verified third-party script origins (payment, CDN, captcha and
    /// bot-challenge providers) plus a few host / path fragments.
    ///
    /// The list is compiled into [`ALLOWED_MATCHER_3RD_PARTY`].
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.capy.me/puzzle/",
        "https://www.gstatic.com/recaptcha/",
        "https://google.com/recaptcha/",
        "https://www.google.com/recaptcha/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://newassets.hcaptcha.com/",
        // NOTE(review): exact duplicate of the px-cloud entry earlier in this list.
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://api.leminnow.com/captcha/",
        "https://client-api.arkoselabs.com/",
        // NOTE(review): already covered by the broader "https://static.geetest.com/" below.
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        // Scheme-less fragments: these match anywhere in the URL.
        "discourse-cdn.com/",
        "hcaptcha.com",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];
136
    /// Aho-Corasick matcher built from `JS_FRAMEWORK_ALLOW_3RD_PARTY`; used to decide by
    /// URL whether a third-party script should be rendered in the browser.
    ///
    /// Building panics (via `expect`) on first access if the automaton cannot be constructed.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
139
140    /// path of a js framework
141    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
142        phf::phf_set! {
143            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
144            "_astro/", "_app/immutable"
145        }
146    };
147
    /// MIME content types to ignore: documents, archives, images, audio and video.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };
176
    /// Resource type names treated as visual content (images, media, fonts) to ignore.
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };
183
    /// Resource type names for background networking (CSP violation reports, manifests,
    /// prefetches, pings and "Other") to ignore.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Manifest",
        "Other",
        "Prefetch",
        "Ping",
    };
192
    /// The `css` extension as a case-insensitive string, for case-insensitive CSS matching.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
195
196    /// The command chain.
197    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
198        let enable = EnableParams::default();
199
200        if let Ok(c) = serde_json::to_value(&enable) {
201            vec![(enable.identifier(), c)]
202        } else {
203            vec![]
204        }
205    };
206
207    /// The command chain with https ignore.
208    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
209        let enable = EnableParams::default();
210        let mut v = vec![];
211        if let Ok(c) = serde_json::to_value(&enable) {
212            v.push((enable.identifier(), c));
213        }
214        let ignore = SetIgnoreCertificateErrorsParams::new(true);
215        if let Ok(ignored) = serde_json::to_value(&ignore) {
216            v.push((ignore.identifier(), ignored));
217        }
218
219        v
220    };
221
222    /// Enable the fetch intercept command
223    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
224        fetch::EnableParams::builder()
225        .handle_auth_requests(true)
226        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
227        .build()
228    };
229}
230
/// Returns `true` when `status` is treated as an HTTP redirect:
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
235
#[derive(Debug)]
/// The base network manager.
///
/// Holds the per-target network state: queued outgoing events, in-flight requests,
/// interception / auth bookkeeping, and the blocking configuration.
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    // TODO put event in an Arc?
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; interception is actually enabled/disabled by
    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is considered enabled in CDP.
    ///
    /// `update_protocol_request_interception()` compares the desired state against this flag
    /// to decide whether a Fetch enable/disable command has to be queued; it does not write
    /// the flag itself. The flag is written by `authenticate()`, `enable_request_intercept()`
    /// and `disable_request_intercept()`.
    pub(crate) protocol_request_interception_enabled: bool,
    /// The network is offline.
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    // made_request: bool,
    /// Ignore visuals (no pings, prefetching, etc.).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics from rendering.
    pub block_analytics: bool,
    /// Only allow HTML to load.
    pub only_html: bool,
    /// Is xml document?
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the amount of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    #[cfg(feature = "_cache")]
    /// The cache site_key to use.
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
    whitelist_matcher: Option<AhoCorasick>,
    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
    blacklist_patterns: Vec<String>,
    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
    blacklist_matcher: Option<AhoCorasick>,
    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
    blacklist_strict: bool,
}
358
359impl NetworkManager {
360    /// A new network manager.
361    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
362        Self {
363            queued_events: Default::default(),
364            ignore_httpserrors,
365            requests: Default::default(),
366            requests_will_be_sent: Default::default(),
367            extra_headers: Default::default(),
368            request_id_to_interception_id: Default::default(),
369            user_cache_disabled: false,
370            attempted_authentications: Default::default(),
371            credentials: None,
372            block_all: false,
373            user_request_interception_enabled: false,
374            protocol_request_interception_enabled: false,
375            offline: false,
376            request_timeout,
377            ignore_visuals: false,
378            block_javascript: false,
379            block_stylesheets: false,
380            block_analytics: true,
381            only_html: false,
382            xml_document: false,
383            intercept_manager: NetworkInterceptManager::Unknown,
384            document_reload_tracker: 0,
385            document_target_url: String::new(),
386            document_target_domain: String::new(),
387            whitelist_patterns: Vec::new(),
388            whitelist_matcher: None,
389            blacklist_patterns: Vec::new(),
390            blacklist_matcher: None,
391            blacklist_strict: true,
392            max_bytes_allowed: None,
393            #[cfg(feature = "_cache")]
394            cache_site_key: None,
395            #[cfg(feature = "_cache")]
396            cache_policy: None,
397        }
398    }
399
400    /// Replace the whitelist patterns (compiled once).
401    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
402    where
403        I: IntoIterator<Item = S>,
404        S: Into<String>,
405    {
406        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
407        self.rebuild_whitelist_matcher();
408    }
409
410    /// Replace the blacklist patterns (compiled once).
411    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
412    where
413        I: IntoIterator<Item = S>,
414        S: Into<String>,
415    {
416        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
417        self.rebuild_blacklist_matcher();
418    }
419
    /// Append one blacklist pattern and rebuild the matcher.
    ///
    /// Pushing the pattern is cheap, but the matcher is rebuilt from the full pattern list
    /// on every call, so call this sparingly; use `add_blacklist_patterns` for batches.
    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
        self.blacklist_patterns.push(pattern.into());
        self.rebuild_blacklist_matcher();
    }
425
426    /// Add many patterns and rebuild once.
427    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
428    where
429        I: IntoIterator<Item = S>,
430        S: Into<String>,
431    {
432        self.blacklist_patterns
433            .extend(patterns.into_iter().map(Into::into));
434        self.rebuild_blacklist_matcher();
435    }
436
437    /// Clear blacklist entirely.
438    pub fn clear_blacklist(&mut self) {
439        self.blacklist_patterns.clear();
440        self.blacklist_matcher = None;
441    }
442
    /// Control precedence: when `strict` is true, the blacklist always wins.
    ///
    /// Only stores the flag; the patterns and compiled matcher are untouched.
    pub fn set_blacklist_strict(&mut self, strict: bool) {
        self.blacklist_strict = strict;
    }
447
448    #[inline]
449    fn rebuild_blacklist_matcher(&mut self) {
450        if self.blacklist_patterns.is_empty() {
451            self.blacklist_matcher = None;
452            return;
453        }
454
455        let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
456        self.blacklist_matcher = AhoCorasick::new(refs).ok();
457    }
458
459    #[inline]
460    fn is_blacklisted(&self, url: &str) -> bool {
461        self.blacklist_matcher
462            .as_ref()
463            .map(|m| m.is_match(url))
464            .unwrap_or(false)
465    }
466
    /// Append one whitelist pattern and rebuild the matcher.
    ///
    /// Pushing the pattern is cheap, but the matcher is rebuilt from the full pattern list
    /// on every call, so call this sparingly; use `add_whitelist_patterns` for batches.
    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
        self.whitelist_patterns.push(pattern.into());
        self.rebuild_whitelist_matcher();
    }
472
473    /// Add many patterns and rebuild once.
474    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
475    where
476        I: IntoIterator<Item = S>,
477        S: Into<String>,
478    {
479        self.whitelist_patterns
480            .extend(patterns.into_iter().map(Into::into));
481        self.rebuild_whitelist_matcher();
482    }
483
484    #[inline]
485    fn rebuild_whitelist_matcher(&mut self) {
486        if self.whitelist_patterns.is_empty() {
487            self.whitelist_matcher = None;
488            return;
489        }
490
491        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
492
493        // If building fails (shouldn’t for simple patterns), just disable matcher.
494        self.whitelist_matcher = AhoCorasick::new(refs).ok();
495    }
496
497    #[inline]
498    fn is_whitelisted(&self, url: &str) -> bool {
499        self.whitelist_matcher
500            .as_ref()
501            .map(|m| m.is_match(url))
502            .unwrap_or(false)
503    }
504
505    /// Commands to init the chain with.
506    pub fn init_commands(&self) -> CommandChain {
507        let cmds = if self.ignore_httpserrors {
508            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
509        } else {
510            INIT_CHAIN.clone()
511        };
512        CommandChain::new(cmds, self.request_timeout)
513    }
514
515    /// Push the CDP request.
516    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
517        let method = cmd.identifier();
518        if let Ok(params) = serde_json::to_value(cmd) {
519            self.queued_events
520                .push_back(NetworkEvent::SendCdpRequest((method, params)));
521        }
522    }
523
    /// Pop the next event to handle.
    ///
    /// Removes and returns the oldest queued `NetworkEvent`, or `None` when the queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
528
    /// Borrow the extra HTTP headers currently stored on the manager
    /// (as last set through `set_extra_headers`).
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
533
534    /// Set extra HTTP headers.
535    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
536        self.extra_headers = headers;
537        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
538        self.extra_headers.remove("Proxy-Authorization");
539        if !self.extra_headers.is_empty() {
540            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
541                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
542            }
543        }
544    }
545
    /// Queue a `SetBypassServiceWorkerParams` command carrying `bypass`.
    ///
    /// NOTE(review): despite the method name, the argument is forwarded as the *bypass*
    /// flag, not as an "enabled" flag - confirm callers pass the intended polarity.
    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
    }
549
    /// Set the `block_all` kill-switch flag. Only the flag is stored; no command is queued.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
553
    /// Record whether the user wants request interception, then reconcile the protocol
    /// state via `update_protocol_request_interception()`.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        self.update_protocol_request_interception();
    }
558
559    pub fn set_cache_enabled(&mut self, enabled: bool) {
560        let run = self.user_cache_disabled != !enabled;
561        self.user_cache_disabled = !enabled;
562        if run {
563            self.update_protocol_cache_disabled();
564        }
565    }
566
    /// Mark fetch interception as enabled.
    ///
    /// Only sets `protocol_request_interception_enabled`; no CDP command is queued here.
    pub fn enable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
571
    /// Mark fetch interception as disabled.
    ///
    /// Only clears `protocol_request_interception_enabled`; no CDP command is queued here.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = false;
    }
576
    /// Set (or clear, with `None`) the cache site key.
    #[cfg(feature = "_cache")]
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
582
    /// Set (or clear, with `None`) the cache policy.
    #[cfg(feature = "_cache")]
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
588
    /// Queue a `SetCacheDisabledParams` command reflecting the current `user_cache_disabled` value.
    pub fn update_protocol_cache_disabled(&mut self) {
        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
    }
592
    /// Store `credentials` for answering auth challenges and make sure interception is on.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        // Must run before the flag below is forced: the update compares the desired state
        // against the current flag to decide whether the enable command needs queuing.
        self.update_protocol_request_interception();
        self.protocol_request_interception_enabled = true;
    }
598
599    fn update_protocol_request_interception(&mut self) {
600        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
601
602        if enabled == self.protocol_request_interception_enabled {
603            return;
604        }
605
606        if enabled {
607            self.push_cdp_request(ENABLE_FETCH.clone())
608        } else {
609            self.push_cdp_request(DisableParams::default())
610        }
611    }
612
613    /// Blocklist-only script blocking.
614    /// Returns true only when the URL matches an explicit blocklist condition.
615    #[inline]
616    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
617        // If analytics blocking is off, skip all analytics tries.
618        let block_analytics = self.block_analytics;
619
620        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
621        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
622        {
623            return true;
624        }
625
626        // 2) Custom website block list (explicit).
627        if crate::handler::blockers::block_websites::block_website(url) {
628            return true;
629        }
630
631        // 3) Path-based explicit tries / fallbacks.
632        //
633        // We run these on:
634        // - path with leading slash ("/js/app.js")
635        // - path without leading slash ("js/app.js")
636        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
637        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
638            // Remove query/fragment so matching stays stable.
639            let p_slash = Self::strip_query_fragment(path_with_slash);
640            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
641
642            // Basename for filename-only lists.
643            let base = match p_slash.rsplit('/').next() {
644                Some(b) => b,
645                None => p_slash,
646            };
647
648            // ---- Trie checks ----
649            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
650            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
651                return true;
652            }
653            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
654                return true;
655            }
656            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
657                return true;
658            }
659
660            // Base-path ignore tries (framework noise / known ignorable script paths).
661            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
662            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
663                return true;
664            }
665
666            // Style path ignores only when visuals are ignored.
667            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
668                return true;
669            }
670        }
671
672        false
673    }
674
675    /// Extract the absolute URL path portion WITH the leading slash.
676    ///
677    /// Example:
678    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
679    #[inline]
680    fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
681        // find scheme separator
682        let idx = url.find("//")?;
683        let after_slashes = idx + 2;
684
685        // find first slash after host
686        let slash_rel = url[after_slashes..].find('/')?;
687        let slash_idx = after_slashes + slash_rel;
688
689        if slash_idx < url.len() {
690            Some(&url[slash_idx..])
691        } else {
692            None
693        }
694    }
695
696    /// Strip query string and fragment from a path-ish string.
697    ///
698    /// Example:
699    /// - "/a/b.js?x=1#y" -> "/a/b.js"
700    #[inline]
701    fn strip_query_fragment(s: &str) -> &str {
702        let q = s.find('?');
703        let h = s.find('#');
704
705        match (q, h) {
706            (None, None) => s,
707            (Some(i), None) => &s[..i],
708            (None, Some(i)) => &s[..i],
709            (Some(i), Some(j)) => &s[..i.min(j)],
710        }
711    }
712
713    /// Determine if the request should be skipped.
714    #[inline]
715    fn skip_xhr(
716        &self,
717        skip_networking: bool,
718        event: &EventRequestPaused,
719        network_event: bool,
720    ) -> bool {
721        // XHR check
722        if !skip_networking && network_event {
723            let request_url = event.request.url.as_str();
724
725            // check if part of ignore scripts.
726            let skip_analytics =
727                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
728
729            if skip_analytics {
730                true
731            } else if self.block_stylesheets || self.ignore_visuals {
732                let block_css = self.block_stylesheets;
733                let block_media = self.ignore_visuals;
734
735                let mut block_request = false;
736
737                if let Some(position) = request_url.rfind('.') {
738                    let hlen = request_url.len();
739                    let has_asset = hlen - position;
740
741                    if has_asset >= 3 {
742                        let next_position = position + 1;
743
744                        if block_media
745                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
746                                &request_url[next_position..].into(),
747                            )
748                        {
749                            block_request = true;
750                        } else if block_css {
751                            block_request =
752                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
753                                    .contains(&**CSS_EXTENSION)
754                        }
755                    }
756                }
757
758                if !block_request {
759                    block_request = ignore_script_xhr_media(request_url);
760                }
761
762                block_request
763            } else {
764                skip_networking
765            }
766        } else {
767            skip_networking
768        }
769    }
770
771    #[cfg(feature = "adblock")]
772    #[inline]
773    /// Detect if ad enabled.
774    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
775        if skip_networking {
776            true
777        } else {
778            block_ads(&event.request.url) || self.detect_ad(event)
779        }
780    }
781
782    /// When adblock feature is disabled, this is a no-op.
783    #[cfg(not(feature = "adblock"))]
784    #[inline]
785    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
786        use crate::handler::blockers::block_websites::block_ads;
787        if skip_networking {
788            true
789        } else {
790            block_ads(&event.request.url)
791        }
792    }
793
794    #[inline]
795    /// Fail request
796    fn fail_request_blocked(
797        &mut self,
798        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
799    ) {
800        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
801            request_id.clone(),
802            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
803        );
804        self.push_cdp_request(params);
805    }
806
807    #[inline]
808    /// Fulfill request
809    fn fulfill_request_empty_200(
810        &mut self,
811        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
812    ) {
813        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
814            request_id.clone(),
815            200,
816        );
817        self.push_cdp_request(params);
818    }
819
820    #[cfg(feature = "_cache")]
821    #[inline]
822    /// Fulfill a paused Fetch request from cached bytes + header map.
823    ///
824    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
825    fn fulfill_request_from_cache(
826        &mut self,
827        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
828        body: &[u8],
829        headers: &std::collections::HashMap<String, String>,
830        status: i64,
831    ) {
832        use crate::cdp::browser_protocol::fetch::HeaderEntry;
833        use crate::handler::network::fetch::FulfillRequestParams;
834        use base64::Engine;
835
836        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
837
838        for (k, v) in headers.iter() {
839            resp_headers.push(HeaderEntry {
840                name: k.clone().into(),
841                value: v.clone().into(),
842            });
843        }
844
845        let mut params = FulfillRequestParams::new(request_id.clone(), status);
846
847        // TODO: have this already encoded prior.
848        params.body = Some(
849            base64::engine::general_purpose::STANDARD
850                .encode(body)
851                .into(),
852        );
853
854        params.response_headers = Some(resp_headers);
855
856        self.push_cdp_request(params);
857    }
858
859    #[inline]
860    /// Continue the request url.
861    fn continue_request_with_url(
862        &mut self,
863        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
864        url: Option<&str>,
865        intercept_response: bool,
866    ) {
867        let mut params = ContinueRequestParams::new(request_id.clone());
868        if let Some(url) = url {
869            params.url = Some(url.to_string());
870            params.intercept_response = Some(intercept_response);
871        }
872        self.push_cdp_request(params);
873    }
874
875    /// On fetch request paused interception.
876    #[inline]
877    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
878        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
879            return;
880        }
881
882        let resource_type = &event.resource_type;
883
884        if self.block_all {
885            tracing::debug!(
886                "Blocked (block_all): {:?} - {}",
887                event.resource_type,
888                event.request.url
889            );
890            return self.fail_request_blocked(&event.request_id);
891        }
892
893        if let Some(network_id) = event.network_id.as_ref() {
894            if let Some(request_will_be_sent) =
895                self.requests_will_be_sent.remove(network_id.as_ref())
896            {
897                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
898            } else {
899                self.request_id_to_interception_id
900                    .insert(network_id.clone(), event.request_id.clone().into());
901            }
902        }
903
904        // From here on, we handle the full decision tree.
905        let javascript_resource = *resource_type == ResourceType::Script;
906        let document_resource = *resource_type == ResourceType::Document;
907        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
908
909        // Start with static / cheap skip checks.
910        let mut skip_networking =
911            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
912
913        // Also short-circuit if we've reloaded this document too many times.
914        if !skip_networking {
915            skip_networking = self.document_reload_tracker >= 3;
916        }
917
918        // Handle document redirect / masking and track xml documents.
919        let (current_url_cow, had_replacer) =
920            self.handle_document_replacement_and_tracking(event, document_resource);
921
922        let current_url: &str = current_url_cow.as_ref();
923
924        let blacklisted = self.is_blacklisted(current_url);
925
926        if !self.blacklist_strict && blacklisted {
927            skip_networking = true;
928        }
929
930        if !skip_networking {
931            // Allow XSL for sitemap XML.
932            if self.xml_document && current_url.ends_with(".xsl") {
933                skip_networking = false;
934            } else {
935                skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
936            }
937        }
938
939        skip_networking = self.detect_ad_if_enabled(event, skip_networking);
940
941        // Ignore embedded scripts when only_html or ignore_visuals is set.
942        if !skip_networking
943            && self.block_javascript
944            && (self.only_html || self.ignore_visuals)
945            && (javascript_resource || document_resource)
946        {
947            skip_networking = ignore_script_embedded(current_url);
948        }
949
950        // Script policy: allow-by-default.
951        // Block only if explicit block list patterns match.
952        if !skip_networking && javascript_resource {
953            skip_networking = self.should_block_script_blocklist_only(current_url);
954        }
955
956        // XHR / data resources.
957        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
958
959        // Custom interception layer.
960        if !skip_networking && (javascript_resource || network_resource || document_resource) {
961            skip_networking = self.intercept_manager.intercept_detection(
962                current_url,
963                self.ignore_visuals,
964                network_resource,
965            );
966        }
967
968        // Custom website block list.
969        if !skip_networking && (javascript_resource || network_resource) {
970            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
971        }
972
973        // whitelist 3rd party
974        // not required unless explicit blocking.
975        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
976        {
977            skip_networking = false;
978        }
979
980        // check if the url is in the whitelist.
981        if skip_networking && self.is_whitelisted(current_url) {
982            skip_networking = false;
983        }
984
985        if self.blacklist_strict && blacklisted {
986            skip_networking = true;
987        }
988
989        if skip_networking {
990            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
991            self.fulfill_request_empty_200(&event.request_id);
992        } else {
993            #[cfg(feature = "_cache")]
994            {
995                if let (Some(policy), Some(cache_site_key)) =
996                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
997                {
998                    let current_url = format!("{}:{}", event.request.method, &current_url);
999
1000                    if let Some((res, cache_policy)) =
1001                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
1002                    {
1003                        if policy.allows_cached(&cache_policy) {
1004                            tracing::debug!(
1005                                "Remote Cached: {:?} - {}",
1006                                resource_type,
1007                                &current_url
1008                            );
1009                            return self.fulfill_request_from_cache(
1010                                &event.request_id,
1011                                &res.body,
1012                                &res.headers,
1013                                res.status as i64,
1014                            );
1015                        }
1016                    }
1017                }
1018            }
1019
1020            // check our frame cache for the run.
1021            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
1022            self.continue_request_with_url(
1023                &event.request_id,
1024                if had_replacer {
1025                    Some(current_url)
1026                } else {
1027                    None
1028                },
1029                !had_replacer,
1030            );
1031        }
1032    }
1033
1034    /// Shared "visuals + basic blocking" logic.
1035    ///
1036    /// IMPORTANT: Scripts are NOT blocked here anymore.
1037    /// Scripts are allowed by default and only blocked via explicit blocklists
1038    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1039    #[inline]
1040    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1041        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1042            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1043    }
1044
1045    /// Does the network manager have a target domain?
1046    pub fn has_target_domain(&self) -> bool {
1047        !self.document_target_url.is_empty()
1048    }
1049
1050    /// Set the target page url for tracking.
1051    pub fn set_page_url(&mut self, page_target_url: String) {
1052        let host_base = host_and_rest(&page_target_url)
1053            .map(|(h, _)| base_domain_from_host(h))
1054            .unwrap_or("");
1055
1056        self.document_target_domain = host_base.to_string();
1057        self.document_target_url = page_target_url;
1058    }
1059
1060    /// Clear the initial target domain on every navigation.
1061    pub fn clear_target_domain(&mut self) {
1062        self.document_reload_tracker = 0;
1063        self.document_target_url = Default::default();
1064        self.document_target_domain = Default::default();
1065    }
1066
1067    /// Handles:
1068    /// - document reload tracking (`document_reload_tracker`)
1069    /// - redirect masking / replacement
1070    /// - xml document detection (`xml_document`)
1071    /// - `document_target_url` updates
1072    ///
1073    /// Returns (current_url, had_replacer).
1074    #[inline]
1075    fn handle_document_replacement_and_tracking<'a>(
1076        &mut self,
1077        event: &'a EventRequestPaused,
1078        document_resource: bool,
1079    ) -> (Cow<'a, str>, bool) {
1080        let mut replacer: Option<String> = None;
1081        let current_url = event.request.url.as_str();
1082
1083        if document_resource {
1084            if self.document_target_url == current_url {
1085                self.document_reload_tracker += 1;
1086            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1087            {
1088                let (http_document_replacement, mut https_document_replacement) =
1089                    if self.document_target_url.starts_with("http://") {
1090                        (
1091                            self.document_target_url.replacen("http://", "http//", 1),
1092                            self.document_target_url.replacen("http://", "https://", 1),
1093                        )
1094                    } else {
1095                        (
1096                            self.document_target_url.replacen("https://", "https//", 1),
1097                            self.document_target_url.replacen("https://", "http://", 1),
1098                        )
1099                    };
1100
1101                // Track trailing slash to restore later.
1102                let trailing = https_document_replacement.ends_with('/');
1103                if trailing {
1104                    https_document_replacement.pop();
1105                }
1106                if https_document_replacement.ends_with('/') {
1107                    https_document_replacement.pop();
1108                }
1109
1110                let redirect_mask = format!(
1111                    "{}{}",
1112                    https_document_replacement, http_document_replacement
1113                );
1114
1115                if current_url == redirect_mask {
1116                    replacer = Some(if trailing {
1117                        format!("{}/", https_document_replacement)
1118                    } else {
1119                        https_document_replacement
1120                    });
1121                }
1122            }
1123
1124            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1125                self.xml_document = true;
1126            }
1127
1128            // Track last seen document URL.
1129            self.document_target_url = event.request.url.clone();
1130            self.document_target_domain = host_and_rest(&self.document_target_url)
1131                .map(|(h, _)| base_domain_from_host(h).to_string())
1132                .unwrap_or_default();
1133        }
1134
1135        let current_url_cow = match replacer {
1136            Some(r) => Cow::Owned(r),
1137            None => Cow::Borrowed(event.request.url.as_str()),
1138        };
1139
1140        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1141        (current_url_cow, had_replacer)
1142    }
1143
1144    /// Perform a page intercept for chrome
1145    #[cfg(feature = "adblock")]
1146    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1147        use adblock::{
1148            lists::{FilterSet, ParseOptions, RuleTypes},
1149            Engine,
1150        };
1151
1152        lazy_static::lazy_static! {
1153            static ref AD_ENGINE: Engine = {
1154                let mut filter_set = FilterSet::new(false);
1155                let mut rules = ParseOptions::default();
1156                rules.rule_types = RuleTypes::All;
1157
1158                filter_set.add_filters(
1159                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1160                    rules,
1161                );
1162
1163                Engine::from_filter_set(filter_set, true)
1164            };
1165        };
1166
1167        let blockable = ResourceType::Image == event.resource_type
1168            || event.resource_type == ResourceType::Media
1169            || event.resource_type == ResourceType::Stylesheet
1170            || event.resource_type == ResourceType::Document
1171            || event.resource_type == ResourceType::Fetch
1172            || event.resource_type == ResourceType::Xhr;
1173
1174        let u = &event.request.url;
1175
1176        let block_request = blockable
1177            // set it to example.com for 3rd party handling is_same_site
1178        && {
1179            let request = adblock::request::Request::preparsed(
1180                 &u,
1181                 "example.com",
1182                 "example.com",
1183                 &event.resource_type.as_ref().to_lowercase(),
1184                 !event.request.is_same_site.unwrap_or_default());
1185
1186            AD_ENGINE.check_network_request(&request).matched
1187        };
1188
1189        block_request
1190    }
1191
1192    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1193        let response = if self
1194            .attempted_authentications
1195            .contains(event.request_id.as_ref())
1196        {
1197            AuthChallengeResponseResponse::CancelAuth
1198        } else if self.credentials.is_some() {
1199            self.attempted_authentications
1200                .insert(event.request_id.clone().into());
1201            AuthChallengeResponseResponse::ProvideCredentials
1202        } else {
1203            AuthChallengeResponseResponse::Default
1204        };
1205
1206        let mut auth = AuthChallengeResponse::new(response);
1207        if let Some(creds) = self.credentials.clone() {
1208            auth.username = Some(creds.username);
1209            auth.password = Some(creds.password);
1210        }
1211        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1212    }
1213
1214    /// Set the page offline network emulation condition.
1215    pub fn set_offline_mode(&mut self, value: bool) {
1216        if self.offline == value {
1217            return;
1218        }
1219        self.offline = value;
1220        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1221            .offline(self.offline)
1222            .latency(0)
1223            .download_throughput(-1.)
1224            .upload_throughput(-1.)
1225            .build()
1226        {
1227            self.push_cdp_request(network);
1228        }
1229    }
1230
1231    /// Request interception doesn't happen for data URLs with Network Service.
1232    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1233        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1234            if let Some(interception_id) = self
1235                .request_id_to_interception_id
1236                .remove(event.request_id.as_ref())
1237            {
1238                self.on_request(event, Some(interception_id));
1239            } else {
1240                // TODO remove the clone for event
1241                self.requests_will_be_sent
1242                    .insert(event.request_id.clone(), event.clone());
1243            }
1244        } else {
1245            self.on_request(event, None);
1246        }
1247    }
1248
1249    /// The request was served from the cache.
1250    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1251        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1252            request.from_memory_cache = true;
1253        }
1254    }
1255
1256    /// On network response received.
1257    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1258        let mut request_failed = false;
1259
1260        // Track how many bytes we actually deducted from this target.
1261        let mut deducted: u64 = 0;
1262
1263        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1264            let before = *max_bytes;
1265
1266            // encoded_data_length -> saturating cast to u64
1267            let received_bytes: u64 = event.response.encoded_data_length as u64;
1268
1269            // Safe parse of Content-Length
1270            let content_length: Option<u64> = event
1271                .response
1272                .headers
1273                .inner()
1274                .get("content-length")
1275                .and_then(|v| v.as_str())
1276                .and_then(|s| s.trim().parse::<u64>().ok());
1277
1278            // Deduct what we actually received
1279            *max_bytes = max_bytes.saturating_sub(received_bytes);
1280
1281            // If the declared size can't fit, zero out now
1282            if let Some(cl) = content_length {
1283                if cl > *max_bytes {
1284                    *max_bytes = 0;
1285                }
1286            }
1287
1288            request_failed = *max_bytes == 0;
1289
1290            // Compute exact delta deducted on this event
1291            deducted = before.saturating_sub(*max_bytes);
1292        }
1293
1294        // Bubble up the deduction (even if request continues)
1295        if deducted > 0 {
1296            self.queued_events
1297                .push_back(NetworkEvent::BytesConsumed(deducted));
1298        }
1299
1300        // block all network request moving forward.
1301        if request_failed && self.max_bytes_allowed.is_some() {
1302            self.set_block_all(true);
1303        }
1304
1305        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1306            request.set_response(event.response.clone());
1307            self.queued_events.push_back(if request_failed {
1308                NetworkEvent::RequestFailed(request)
1309            } else {
1310                NetworkEvent::RequestFinished(request)
1311            });
1312        }
1313    }
1314
1315    /// On network loading finished.
1316    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1317        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1318            if let Some(interception_id) = request.interception_id.as_ref() {
1319                self.attempted_authentications
1320                    .remove(interception_id.as_ref());
1321            }
1322            self.queued_events
1323                .push_back(NetworkEvent::RequestFinished(request));
1324        }
1325    }
1326
1327    /// On network loading failed.
1328    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1329        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1330            request.failure_text = Some(event.error_text.clone());
1331            if let Some(interception_id) = request.interception_id.as_ref() {
1332                self.attempted_authentications
1333                    .remove(interception_id.as_ref());
1334            }
1335            self.queued_events
1336                .push_back(NetworkEvent::RequestFailed(request));
1337        }
1338    }
1339
1340    /// On request will be sent.
1341    fn on_request(
1342        &mut self,
1343        event: &EventRequestWillBeSent,
1344        interception_id: Option<InterceptionId>,
1345    ) {
1346        let mut redirect_chain = Vec::new();
1347        let mut redirect_location = None;
1348
1349        if let Some(redirect_resp) = &event.redirect_response {
1350            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1351                if is_redirect_status(redirect_resp.status) {
1352                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1353                        if redirect_resp.url != location {
1354                            let fixed_location = location.replace(&redirect_resp.url, "");
1355
1356                            if !fixed_location.is_empty() {
1357                                request.response.as_mut().map(|resp| {
1358                                    resp.headers.0["Location"] =
1359                                        serde_json::Value::String(fixed_location.clone());
1360                                });
1361                            }
1362
1363                            redirect_location = Some(fixed_location);
1364                        }
1365                    }
1366                }
1367
1368                self.handle_request_redirect(
1369                    &mut request,
1370                    if let Some(redirect_location) = redirect_location {
1371                        let mut redirect_resp = redirect_resp.clone();
1372
1373                        if !redirect_location.is_empty() {
1374                            redirect_resp.headers.0["Location"] =
1375                                serde_json::Value::String(redirect_location);
1376                        }
1377
1378                        redirect_resp
1379                    } else {
1380                        redirect_resp.clone()
1381                    },
1382                );
1383
1384                redirect_chain = std::mem::take(&mut request.redirect_chain);
1385                redirect_chain.push(request);
1386            }
1387        }
1388
1389        let request = HttpRequest::new(
1390            event.request_id.clone(),
1391            event.frame_id.clone(),
1392            interception_id,
1393            self.user_request_interception_enabled,
1394            redirect_chain,
1395        );
1396
1397        self.requests.insert(event.request_id.clone(), request);
1398        self.queued_events
1399            .push_back(NetworkEvent::Request(event.request_id.clone()));
1400    }
1401
1402    /// Handle request redirect.
1403    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1404        request.set_response(response);
1405        if let Some(interception_id) = request.interception_id.as_ref() {
1406            self.attempted_authentications
1407                .remove(interception_id.as_ref());
1408        }
1409    }
1410}
1411
1412#[derive(Debug)]
1413pub enum NetworkEvent {
1414    /// Send a CDP request.
1415    SendCdpRequest((MethodId, serde_json::Value)),
1416    /// Request.
1417    Request(RequestId),
1418    /// Response
1419    Response(RequestId),
1420    /// Request failed.
1421    RequestFailed(HttpRequest),
1422    /// Request finished.
1423    RequestFinished(HttpRequest),
1424    /// Bytes consumed.
1425    BytesConsumed(u64),
1426}
1427
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// The third-party allow-list matches known-good scripts and nothing else.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// Scripts are allowed unless an explicit blocklist pattern matches.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // A random script that should not match your block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    /// A script matching the ignore trie / blocklist is blocked.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    /// Dynamically configured blacklist patterns match the expected URLs only.
    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    /// With strict mode on, a URL present in both lists matches both matchers.
    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        // Same URL in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        // In strict mode, it should still be considered blocked at decision time.
        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
        assert!(nm.blacklist_strict);
    }

    /// With strict mode off, a URL present in both lists still matches both matchers.
    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}