chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18    SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21    fetch::{
22        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24    },
25    network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
lazy_static! {
    /// URL substrings that feed `ALLOWED_MATCHER`: popular JS library names, generic
    /// bundle/entry-point names, and a list of verified third-party script hosts.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",  // Covers Wordpress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/",
        "https://google.com/recaptcha/api.js",
        "https://www.gstatic.com/recaptcha/",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://api.leminnow.com/captcha/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://client-api.arkoselabs.com/",
        "https://www.capy.me/puzzle/",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];

    /// Substring matcher compiled from `JS_FRAMEWORK_ALLOW`.
    ///
    /// Building the automaton panics on first access if it fails ("matcher to build").
    ///
    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
    /// but we keep it for compatibility and other call sites.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// URL substrings of verified third-party script sources (captcha, anti-bot, payment
    /// and CDN providers) that feed `ALLOWED_MATCHER_3RD_PARTY`.
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.capy.me/puzzle/",
        "https://www.gstatic.com/recaptcha/",
        "https://google.com/recaptcha/",
        "https://www.google.com/recaptcha/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://newassets.hcaptcha.com/",
        // NOTE(review): duplicate of the `captcha.px-cloud.net` entry earlier in this list.
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://api.leminnow.com/captcha/",
        "https://client-api.arkoselabs.com/",
        // NOTE(review): for substring matching this is already covered by the broader
        // `https://static.geetest.com/` entry that follows.
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "discourse-cdn.com/",
        "hcaptcha.com",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];

    /// Substring matcher compiled from `JS_FRAMEWORK_ALLOW_3RD_PARTY`.
    ///
    /// Building the automaton panics on first access if it fails ("matcher to build").
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path fragments that identify JS framework build output.
    ///
    /// NOTE(review): presumably Astro (`_astro/`) and SvelteKit (`_app/immutable`) assets;
    /// confirm against the call sites.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
            "_astro/", "_app/immutable"
        }
    };

    /// MIME types to ignore: archives, office documents, images, audio, video and other
    /// binary formats.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names that are purely visual (images, media, fonts) and can be ignored.
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names for background networking traffic (CSP violation reports and
    /// pings) that can be ignored.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Ping",
    };

    /// The `css` extension as a case-insensitive string, used for stylesheet matching.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// Default init command chain: the network `EnableParams` command only.
    ///
    /// Each entry is a `(method identifier, serialized params)` pair. If serialization
    /// fails the chain is left empty rather than panicking.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// Init command chain that, after the network `EnableParams` command, also sends
    /// `SetIgnoreCertificateErrorsParams(true)` so certificate errors are ignored.
    ///
    /// A command whose params fail to serialize is skipped.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// Pre-built `Fetch.enable` params: auth requests are handled and every URL (`*`) is
    /// intercepted at the request stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
        .build()
    };
}
227
/// Returns `true` when `status` is one of the HTTP redirect codes recognised here:
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
232
/// The base network manager.
///
/// Holds request bookkeeping, interception / authentication state, the resource-blocking
/// toggles, and the per-site whitelist / blacklist matchers. Work for the browser is queued
/// as `NetworkEvent`s and drained through `poll()`.
#[derive(Debug)]
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    // TODO put event in an Arc?
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; `update_protocol_request_interception()` reconciles
    /// this flag with `credentials` to decide whether `Fetch.enable` / `Fetch.disable` is queued.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is currently considered enabled in CDP.
    ///
    /// `update_protocol_request_interception()` compares the desired state against this flag to
    /// decide whether a `Fetch.enable` / `Fetch.disable` command must be queued, but it does not
    /// write the flag itself. Among the methods defined next to this struct, the flag is written
    /// by `enable_request_intercept()`, `disable_request_intercept()` and `authenticate()`.
    pub(crate) protocol_request_interception_enabled: bool,
    /// The network is offline.
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    /// Ignore visuals (no pings, prefetching, and etc).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics from rendering.
    pub block_analytics: bool,
    /// Block pre-fetch requests.
    pub block_prefetch: bool,
    /// Only allow HTML to load.
    pub only_html: bool,
    /// Is xml document?
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the amount of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    /// The cache site_key to use.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for `whitelist_patterns` (rebuilt when patterns change).
    whitelist_matcher: Option<AhoCorasick>,
    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
    blacklist_patterns: Vec<String>,
    /// Compiled matcher for `blacklist_patterns` (rebuilt when patterns change).
    blacklist_matcher: Option<AhoCorasick>,
    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
    blacklist_strict: bool,
}
357
358impl NetworkManager {
359    /// A new network manager.
360    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
361        Self {
362            queued_events: Default::default(),
363            ignore_httpserrors,
364            requests: Default::default(),
365            requests_will_be_sent: Default::default(),
366            extra_headers: Default::default(),
367            request_id_to_interception_id: Default::default(),
368            user_cache_disabled: false,
369            attempted_authentications: Default::default(),
370            credentials: None,
371            block_all: false,
372            user_request_interception_enabled: false,
373            protocol_request_interception_enabled: false,
374            offline: false,
375            request_timeout,
376            ignore_visuals: false,
377            block_javascript: false,
378            block_stylesheets: false,
379            block_prefetch: true,
380            block_analytics: true,
381            only_html: false,
382            xml_document: false,
383            intercept_manager: NetworkInterceptManager::Unknown,
384            document_reload_tracker: 0,
385            document_target_url: String::new(),
386            document_target_domain: String::new(),
387            whitelist_patterns: Vec::new(),
388            whitelist_matcher: None,
389            blacklist_patterns: Vec::new(),
390            blacklist_matcher: None,
391            blacklist_strict: true,
392            max_bytes_allowed: None,
393            #[cfg(feature = "_cache")]
394            cache_site_key: None,
395            #[cfg(feature = "_cache")]
396            cache_policy: None,
397        }
398    }
399
400    /// Replace the whitelist patterns (compiled once).
401    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
402    where
403        I: IntoIterator<Item = S>,
404        S: Into<String>,
405    {
406        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
407        self.rebuild_whitelist_matcher();
408    }
409
410    /// Replace the blacklist patterns (compiled once).
411    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
412    where
413        I: IntoIterator<Item = S>,
414        S: Into<String>,
415    {
416        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
417        self.rebuild_blacklist_matcher();
418    }
419
420    /// Add one pattern (cheap) and rebuild (call this sparingly).
421    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
422        self.blacklist_patterns.push(pattern.into());
423        self.rebuild_blacklist_matcher();
424    }
425
426    /// Add many patterns and rebuild once.
427    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
428    where
429        I: IntoIterator<Item = S>,
430        S: Into<String>,
431    {
432        self.blacklist_patterns
433            .extend(patterns.into_iter().map(Into::into));
434        self.rebuild_blacklist_matcher();
435    }
436
437    /// Clear blacklist entirely.
438    pub fn clear_blacklist(&mut self) {
439        self.blacklist_patterns.clear();
440        self.blacklist_matcher = None;
441    }
442
443    /// Control precedence: when true, blacklist always wins.
444    pub fn set_blacklist_strict(&mut self, strict: bool) {
445        self.blacklist_strict = strict;
446    }
447
448    #[inline]
449    fn rebuild_blacklist_matcher(&mut self) {
450        if self.blacklist_patterns.is_empty() {
451            self.blacklist_matcher = None;
452            return;
453        }
454
455        let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
456        self.blacklist_matcher = AhoCorasick::new(refs).ok();
457    }
458
459    #[inline]
460    fn is_blacklisted(&self, url: &str) -> bool {
461        self.blacklist_matcher
462            .as_ref()
463            .map(|m| m.is_match(url))
464            .unwrap_or(false)
465    }
466
467    /// Add one pattern (cheap) and rebuild (call this sparingly).
468    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
469        self.whitelist_patterns.push(pattern.into());
470        self.rebuild_whitelist_matcher();
471    }
472
473    /// Add many patterns and rebuild once.
474    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
475    where
476        I: IntoIterator<Item = S>,
477        S: Into<String>,
478    {
479        self.whitelist_patterns
480            .extend(patterns.into_iter().map(Into::into));
481        self.rebuild_whitelist_matcher();
482    }
483
484    #[inline]
485    fn rebuild_whitelist_matcher(&mut self) {
486        if self.whitelist_patterns.is_empty() {
487            self.whitelist_matcher = None;
488            return;
489        }
490
491        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
492
493        // If building fails (shouldn’t for simple patterns), just disable matcher.
494        self.whitelist_matcher = AhoCorasick::new(refs).ok();
495    }
496
497    #[inline]
498    fn is_whitelisted(&self, url: &str) -> bool {
499        self.whitelist_matcher
500            .as_ref()
501            .map(|m| m.is_match(url))
502            .unwrap_or(false)
503    }
504
505    /// Commands to init the chain with.
506    pub fn init_commands(&self) -> CommandChain {
507        let cmds = if self.ignore_httpserrors {
508            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
509        } else {
510            INIT_CHAIN.clone()
511        };
512        CommandChain::new(cmds, self.request_timeout)
513    }
514
515    /// Push the CDP request.
516    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
517        let method = cmd.identifier();
518        if let Ok(params) = serde_json::to_value(cmd) {
519            self.queued_events
520                .push_back(NetworkEvent::SendCdpRequest((method, params)));
521        }
522    }
523
524    /// The next event to handle.
525    pub fn poll(&mut self) -> Option<NetworkEvent> {
526        self.queued_events.pop_front()
527    }
528
529    /// Get the extra headers.
530    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
531        &self.extra_headers
532    }
533
534    /// Set extra HTTP headers.
535    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
536        self.extra_headers = headers;
537        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
538        self.extra_headers.remove("Proxy-Authorization");
539        if !self.extra_headers.is_empty() {
540            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
541                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
542            }
543        }
544    }
545
546    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
547        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
548    }
549
550    pub fn set_block_all(&mut self, block_all: bool) {
551        self.block_all = block_all;
552    }
553
554    pub fn set_request_interception(&mut self, enabled: bool) {
555        self.user_request_interception_enabled = enabled;
556        self.update_protocol_request_interception();
557    }
558
559    pub fn set_cache_enabled(&mut self, enabled: bool) {
560        let run = self.user_cache_disabled != !enabled;
561        self.user_cache_disabled = !enabled;
562        if run {
563            self.update_protocol_cache_disabled();
564        }
565    }
566
567    /// Enable fetch interception.
568    pub fn enable_request_intercept(&mut self) {
569        self.protocol_request_interception_enabled = true;
570    }
571
572    /// Disable fetch interception.
573    pub fn disable_request_intercept(&mut self) {
574        self.protocol_request_interception_enabled = false;
575    }
576
577    /// Set the cache site key.
578    #[cfg(feature = "_cache")]
579    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
580        self.cache_site_key = cache_site_key;
581    }
582
583    /// Set the cache policy.
584    #[cfg(feature = "_cache")]
585    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
586        self.cache_policy = cache_policy;
587    }
588
589    pub fn update_protocol_cache_disabled(&mut self) {
590        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
591    }
592
593    pub fn authenticate(&mut self, credentials: Credentials) {
594        self.credentials = Some(credentials);
595        self.update_protocol_request_interception();
596        self.protocol_request_interception_enabled = true;
597    }
598
599    fn update_protocol_request_interception(&mut self) {
600        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
601
602        if enabled == self.protocol_request_interception_enabled {
603            return;
604        }
605
606        if enabled {
607            self.push_cdp_request(ENABLE_FETCH.clone())
608        } else {
609            self.push_cdp_request(DisableParams::default())
610        }
611    }
612
613    /// Blocklist-only script blocking.
614    /// Returns true only when the URL matches an explicit blocklist condition.
615    #[inline]
616    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
617        // If analytics blocking is off, skip all analytics tries.
618        let block_analytics = self.block_analytics;
619
620        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
621        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
622        {
623            return true;
624        }
625
626        // 2) Custom website block list (explicit).
627        if crate::handler::blockers::block_websites::block_website(url) {
628            return true;
629        }
630
631        // 3) Path-based explicit tries / fallbacks.
632        //
633        // We run these on:
634        // - path with leading slash ("/js/app.js")
635        // - path without leading slash ("js/app.js")
636        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
637        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
638            // Remove query/fragment so matching stays stable.
639            let p_slash = Self::strip_query_fragment(path_with_slash);
640            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
641
642            // Basename for filename-only lists.
643            let base = match p_slash.rsplit('/').next() {
644                Some(b) => b,
645                None => p_slash,
646            };
647
648            // ---- Trie checks ----
649            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
650            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
651                return true;
652            }
653            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
654                return true;
655            }
656            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
657                return true;
658            }
659
660            // Base-path ignore tries (framework noise / known ignorable script paths).
661            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
662            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
663                return true;
664            }
665
666            // Style path ignores only when visuals are ignored.
667            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
668                return true;
669            }
670        }
671
672        false
673    }
674
675    /// Extract the absolute URL path portion WITH the leading slash.
676    ///
677    /// Example:
678    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
679    #[inline]
680    fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
681        // find scheme separator
682        let idx = url.find("//")?;
683        let after_slashes = idx + 2;
684
685        // find first slash after host
686        let slash_rel = url[after_slashes..].find('/')?;
687        let slash_idx = after_slashes + slash_rel;
688
689        if slash_idx < url.len() {
690            Some(&url[slash_idx..])
691        } else {
692            None
693        }
694    }
695
696    /// Strip query string and fragment from a path-ish string.
697    ///
698    /// Example:
699    /// - "/a/b.js?x=1#y" -> "/a/b.js"
700    #[inline]
701    fn strip_query_fragment(s: &str) -> &str {
702        let q = s.find('?');
703        let h = s.find('#');
704
705        match (q, h) {
706            (None, None) => s,
707            (Some(i), None) => &s[..i],
708            (None, Some(i)) => &s[..i],
709            (Some(i), Some(j)) => &s[..i.min(j)],
710        }
711    }
712
713    /// Determine if the request should be skipped.
714    #[inline]
715    fn skip_xhr(
716        &self,
717        skip_networking: bool,
718        event: &EventRequestPaused,
719        network_event: bool,
720    ) -> bool {
721        // XHR check
722        if !skip_networking && network_event {
723            let request_url = event.request.url.as_str();
724
725            // check if part of ignore scripts.
726            let skip_analytics =
727                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
728
729            if skip_analytics {
730                true
731            } else if self.block_stylesheets || self.ignore_visuals {
732                let block_css = self.block_stylesheets;
733                let block_media = self.ignore_visuals;
734
735                let mut block_request = false;
736
737                if let Some(position) = request_url.rfind('.') {
738                    let hlen = request_url.len();
739                    let has_asset = hlen - position;
740
741                    if has_asset >= 3 {
742                        let next_position = position + 1;
743
744                        if block_media
745                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
746                                &request_url[next_position..].into(),
747                            )
748                        {
749                            block_request = true;
750                        } else if block_css {
751                            block_request =
752                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
753                                    .contains(&**CSS_EXTENSION)
754                        }
755                    }
756                }
757
758                if !block_request {
759                    block_request = ignore_script_xhr_media(request_url);
760                }
761
762                block_request
763            } else {
764                skip_networking
765            }
766        } else {
767            skip_networking
768        }
769    }
770
771    #[cfg(feature = "adblock")]
772    #[inline]
773    /// Detect if ad enabled.
774    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
775        if skip_networking {
776            true
777        } else {
778            block_ads(&event.request.url) || self.detect_ad(event)
779        }
780    }
781
782    /// When adblock feature is disabled, this is a no-op.
783    #[cfg(not(feature = "adblock"))]
784    #[inline]
785    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
786        use crate::handler::blockers::block_websites::block_ads;
787        if skip_networking {
788            true
789        } else {
790            block_ads(&event.request.url)
791        }
792    }
793
794    #[inline]
795    /// Fail request
796    fn fail_request_blocked(
797        &mut self,
798        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
799    ) {
800        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
801            request_id.clone(),
802            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
803        );
804        self.push_cdp_request(params);
805    }
806
807    #[inline]
808    /// Fulfill request
809    fn fulfill_request_empty_200(
810        &mut self,
811        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
812    ) {
813        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
814            request_id.clone(),
815            200,
816        );
817        self.push_cdp_request(params);
818    }
819
820    #[cfg(feature = "_cache")]
821    #[inline]
822    /// Fulfill a paused Fetch request from cached bytes + header map.
823    ///
824    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
825    fn fulfill_request_from_cache(
826        &mut self,
827        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
828        body: &[u8],
829        headers: &std::collections::HashMap<String, String>,
830        status: i64,
831    ) {
832        use crate::cdp::browser_protocol::fetch::HeaderEntry;
833        use crate::handler::network::fetch::FulfillRequestParams;
834        use base64::Engine;
835
836        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
837
838        for (k, v) in headers.iter() {
839            resp_headers.push(HeaderEntry {
840                name: k.clone().into(),
841                value: v.clone().into(),
842            });
843        }
844
845        let mut params = FulfillRequestParams::new(request_id.clone(), status);
846
847        // TODO: have this already encoded prior.
848        params.body = Some(
849            base64::engine::general_purpose::STANDARD
850                .encode(body)
851                .into(),
852        );
853
854        params.response_headers = Some(resp_headers);
855
856        self.push_cdp_request(params);
857    }
858
859    #[inline]
860    /// Continue the request url.
861    fn continue_request_with_url(
862        &mut self,
863        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
864        url: Option<&str>,
865        intercept_response: bool,
866    ) {
867        let mut params = ContinueRequestParams::new(request_id.clone());
868        if let Some(url) = url {
869            params.url = Some(url.to_string());
870            params.intercept_response = Some(intercept_response);
871        }
872        self.push_cdp_request(params);
873    }
874
875    /// On fetch request paused interception.
876    #[inline]
877    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
878        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
879            return;
880        }
881
882        if self.block_all {
883            tracing::debug!(
884                "Blocked (block_all): {:?} - {}",
885                event.resource_type,
886                event.request.url
887            );
888            return self.fail_request_blocked(&event.request_id);
889        }
890
891        if let Some(network_id) = event.network_id.as_ref() {
892            if let Some(request_will_be_sent) =
893                self.requests_will_be_sent.remove(network_id.as_ref())
894            {
895                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
896            } else {
897                self.request_id_to_interception_id
898                    .insert(network_id.clone(), event.request_id.clone().into());
899            }
900        }
901
902        // From here on, we handle the full decision tree.
903        let javascript_resource = event.resource_type == ResourceType::Script;
904        let document_resource = event.resource_type == ResourceType::Document;
905        let network_resource =
906            !document_resource && crate::utils::is_data_resource(&event.resource_type);
907
908        // Start with static / cheap skip checks.
909        let mut skip_networking =
910            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
911
912        if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
913            skip_networking = true;
914        }
915
916        // Also short-circuit if we've reloaded this document too many times.
917        if !skip_networking {
918            skip_networking = self.document_reload_tracker >= 3;
919        }
920
921        // Handle document redirect / masking and track xml documents.
922        let (current_url_cow, had_replacer) =
923            self.handle_document_replacement_and_tracking(event, document_resource);
924
925        let current_url: &str = current_url_cow.as_ref();
926
927        let blacklisted = self.is_blacklisted(current_url);
928
929        if !self.blacklist_strict && blacklisted {
930            skip_networking = true;
931        }
932
933        if !skip_networking {
934            // Allow XSL for sitemap XML.
935            if self.xml_document && current_url.ends_with(".xsl") {
936                skip_networking = false;
937            } else {
938                skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
939            }
940        }
941
942        skip_networking = self.detect_ad_if_enabled(event, skip_networking);
943
944        // Ignore embedded scripts when only_html or ignore_visuals is set.
945        if !skip_networking
946            && self.block_javascript
947            && (self.only_html || self.ignore_visuals)
948            && (javascript_resource || document_resource)
949        {
950            skip_networking = ignore_script_embedded(current_url);
951        }
952
953        // Script policy: allow-by-default.
954        // Block only if explicit block list patterns match.
955        if !skip_networking && javascript_resource {
956            skip_networking = self.should_block_script_blocklist_only(current_url);
957        }
958
959        // XHR / data resources.
960        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
961
962        // Custom interception layer.
963        if !skip_networking && (javascript_resource || network_resource || document_resource) {
964            skip_networking = self.intercept_manager.intercept_detection(
965                current_url,
966                self.ignore_visuals,
967                network_resource,
968            );
969        }
970
971        // Custom website block list.
972        if !skip_networking && (javascript_resource || network_resource) {
973            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
974        }
975
976        // whitelist 3rd party
977        // not required unless explicit blocking.
978        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
979        {
980            skip_networking = false;
981        }
982
983        // check if the url is in the whitelist.
984        if skip_networking && self.is_whitelisted(current_url) {
985            skip_networking = false;
986        }
987
988        if self.blacklist_strict && blacklisted {
989            skip_networking = true;
990        }
991
992        if skip_networking {
993            tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
994            self.fulfill_request_empty_200(&event.request_id);
995        } else {
996            #[cfg(feature = "_cache")]
997            {
998                if let (Some(policy), Some(cache_site_key)) =
999                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
1000                {
1001                    let current_url = format!("{}:{}", event.request.method, &current_url);
1002
1003                    if let Some((res, cache_policy)) =
1004                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
1005                    {
1006                        if policy.allows_cached(&cache_policy) {
1007                            tracing::debug!(
1008                                "Remote Cached: {:?} - {}",
1009                                &event.resource_type,
1010                                &current_url
1011                            );
1012                            return self.fulfill_request_from_cache(
1013                                &event.request_id,
1014                                &res.body,
1015                                &res.headers,
1016                                res.status as i64,
1017                            );
1018                        }
1019                    }
1020                }
1021            }
1022
1023            // check our frame cache for the run.
1024            tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
1025            self.continue_request_with_url(
1026                &event.request_id,
1027                if had_replacer {
1028                    Some(current_url)
1029                } else {
1030                    None
1031                },
1032                !had_replacer,
1033            );
1034        }
1035    }
1036
1037    /// Shared "visuals + basic blocking" logic.
1038    ///
1039    /// IMPORTANT: Scripts are NOT blocked here anymore.
1040    /// Scripts are allowed by default and only blocked via explicit blocklists
1041    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1042    #[inline]
1043    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1044        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1045            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1046    }
1047
    /// Does the network manager have a target page set?
    ///
    /// Returns `true` when `document_target_url` is non-empty. Note that the
    /// check is made against the tracked URL, not `document_target_domain`.
    pub fn has_target_domain(&self) -> bool {
        !self.document_target_url.is_empty()
    }
1052
1053    /// Set the target page url for tracking.
1054    pub fn set_page_url(&mut self, page_target_url: String) {
1055        let host_base = host_and_rest(&page_target_url)
1056            .map(|(h, _)| base_domain_from_host(h))
1057            .unwrap_or("");
1058
1059        self.document_target_domain = host_base.to_string();
1060        self.document_target_url = page_target_url;
1061    }
1062
1063    /// Clear the initial target domain on every navigation.
1064    pub fn clear_target_domain(&mut self) {
1065        self.document_reload_tracker = 0;
1066        self.document_target_url = Default::default();
1067        self.document_target_domain = Default::default();
1068    }
1069
1070    /// Handles:
1071    /// - document reload tracking (`document_reload_tracker`)
1072    /// - redirect masking / replacement
1073    /// - xml document detection (`xml_document`)
1074    /// - `document_target_url` updates
1075    ///
1076    /// Returns (current_url, had_replacer).
1077    #[inline]
1078    fn handle_document_replacement_and_tracking<'a>(
1079        &mut self,
1080        event: &'a EventRequestPaused,
1081        document_resource: bool,
1082    ) -> (Cow<'a, str>, bool) {
1083        let mut replacer: Option<String> = None;
1084        let current_url = event.request.url.as_str();
1085
1086        if document_resource {
1087            if self.document_target_url == current_url {
1088                self.document_reload_tracker += 1;
1089            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1090            {
1091                let (http_document_replacement, mut https_document_replacement) =
1092                    if self.document_target_url.starts_with("http://") {
1093                        (
1094                            self.document_target_url.replacen("http://", "http//", 1),
1095                            self.document_target_url.replacen("http://", "https://", 1),
1096                        )
1097                    } else {
1098                        (
1099                            self.document_target_url.replacen("https://", "https//", 1),
1100                            self.document_target_url.replacen("https://", "http://", 1),
1101                        )
1102                    };
1103
1104                // Track trailing slash to restore later.
1105                let trailing = https_document_replacement.ends_with('/');
1106                if trailing {
1107                    https_document_replacement.pop();
1108                }
1109                if https_document_replacement.ends_with('/') {
1110                    https_document_replacement.pop();
1111                }
1112
1113                let redirect_mask = format!(
1114                    "{}{}",
1115                    https_document_replacement, http_document_replacement
1116                );
1117
1118                if current_url == redirect_mask {
1119                    replacer = Some(if trailing {
1120                        format!("{}/", https_document_replacement)
1121                    } else {
1122                        https_document_replacement
1123                    });
1124                }
1125            }
1126
1127            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1128                self.xml_document = true;
1129            }
1130
1131            // Track last seen document URL.
1132            self.document_target_url = event.request.url.clone();
1133            self.document_target_domain = host_and_rest(&self.document_target_url)
1134                .map(|(h, _)| base_domain_from_host(h).to_string())
1135                .unwrap_or_default();
1136        }
1137
1138        let current_url_cow = match replacer {
1139            Some(r) => Cow::Owned(r),
1140            None => Cow::Borrowed(event.request.url.as_str()),
1141        };
1142
1143        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1144        (current_url_cow, had_replacer)
1145    }
1146
1147    /// Perform a page intercept for chrome
1148    #[cfg(feature = "adblock")]
1149    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1150        use adblock::{
1151            lists::{FilterSet, ParseOptions, RuleTypes},
1152            Engine,
1153        };
1154
1155        lazy_static::lazy_static! {
1156            static ref AD_ENGINE: Engine = {
1157                let mut filter_set = FilterSet::new(false);
1158                let mut rules = ParseOptions::default();
1159                rules.rule_types = RuleTypes::All;
1160
1161                filter_set.add_filters(
1162                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1163                    rules,
1164                );
1165
1166                Engine::from_filter_set(filter_set, true)
1167            };
1168        };
1169
1170        let blockable = ResourceType::Image == event.resource_type
1171            || event.resource_type == ResourceType::Media
1172            || event.resource_type == ResourceType::Stylesheet
1173            || event.resource_type == ResourceType::Document
1174            || event.resource_type == ResourceType::Fetch
1175            || event.resource_type == ResourceType::Xhr;
1176
1177        let u = &event.request.url;
1178
1179        let block_request = blockable
1180            // set it to example.com for 3rd party handling is_same_site
1181        && {
1182            let request = adblock::request::Request::preparsed(
1183                 &u,
1184                 "example.com",
1185                 "example.com",
1186                 &event.resource_type.as_ref().to_lowercase(),
1187                 !event.request.is_same_site.unwrap_or_default());
1188
1189            AD_ENGINE.check_network_request(&request).matched
1190        };
1191
1192        block_request
1193    }
1194
1195    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1196        let response = if self
1197            .attempted_authentications
1198            .contains(event.request_id.as_ref())
1199        {
1200            AuthChallengeResponseResponse::CancelAuth
1201        } else if self.credentials.is_some() {
1202            self.attempted_authentications
1203                .insert(event.request_id.clone().into());
1204            AuthChallengeResponseResponse::ProvideCredentials
1205        } else {
1206            AuthChallengeResponseResponse::Default
1207        };
1208
1209        let mut auth = AuthChallengeResponse::new(response);
1210        if let Some(creds) = self.credentials.clone() {
1211            auth.username = Some(creds.username);
1212            auth.password = Some(creds.password);
1213        }
1214        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1215    }
1216
1217    /// Set the page offline network emulation condition.
1218    pub fn set_offline_mode(&mut self, value: bool) {
1219        if self.offline == value {
1220            return;
1221        }
1222        self.offline = value;
1223        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1224            .offline(self.offline)
1225            .latency(0)
1226            .download_throughput(-1.)
1227            .upload_throughput(-1.)
1228            .build()
1229        {
1230            self.push_cdp_request(network);
1231        }
1232    }
1233
1234    /// Request interception doesn't happen for data URLs with Network Service.
1235    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1236        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1237            if let Some(interception_id) = self
1238                .request_id_to_interception_id
1239                .remove(event.request_id.as_ref())
1240            {
1241                self.on_request(event, Some(interception_id));
1242            } else {
1243                // TODO remove the clone for event
1244                self.requests_will_be_sent
1245                    .insert(event.request_id.clone(), event.clone());
1246            }
1247        } else {
1248            self.on_request(event, None);
1249        }
1250    }
1251
    /// The request was served from the cache.
    ///
    /// Sets `from_memory_cache` on the tracked request with the event's id.
    /// Events for ids that are not being tracked are ignored.
    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
            request.from_memory_cache = true;
        }
    }
1258
1259    /// On network response received.
1260    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1261        let mut request_failed = false;
1262
1263        // Track how many bytes we actually deducted from this target.
1264        let mut deducted: u64 = 0;
1265
1266        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1267            let before = *max_bytes;
1268
1269            // encoded_data_length -> saturating cast to u64
1270            let received_bytes: u64 = event.response.encoded_data_length as u64;
1271
1272            // Safe parse of Content-Length
1273            let content_length: Option<u64> = event
1274                .response
1275                .headers
1276                .inner()
1277                .get("content-length")
1278                .and_then(|v| v.as_str())
1279                .and_then(|s| s.trim().parse::<u64>().ok());
1280
1281            // Deduct what we actually received
1282            *max_bytes = max_bytes.saturating_sub(received_bytes);
1283
1284            // If the declared size can't fit, zero out now
1285            if let Some(cl) = content_length {
1286                if cl > *max_bytes {
1287                    *max_bytes = 0;
1288                }
1289            }
1290
1291            request_failed = *max_bytes == 0;
1292
1293            // Compute exact delta deducted on this event
1294            deducted = before.saturating_sub(*max_bytes);
1295        }
1296
1297        // Bubble up the deduction (even if request continues)
1298        if deducted > 0 {
1299            self.queued_events
1300                .push_back(NetworkEvent::BytesConsumed(deducted));
1301        }
1302
1303        // block all network request moving forward.
1304        if request_failed && self.max_bytes_allowed.is_some() {
1305            self.set_block_all(true);
1306        }
1307
1308        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1309            request.set_response(event.response.clone());
1310            self.queued_events.push_back(if request_failed {
1311                NetworkEvent::RequestFailed(request)
1312            } else {
1313                NetworkEvent::RequestFinished(request)
1314            });
1315        }
1316    }
1317
1318    /// On network loading finished.
1319    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1320        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1321            if let Some(interception_id) = request.interception_id.as_ref() {
1322                self.attempted_authentications
1323                    .remove(interception_id.as_ref());
1324            }
1325            self.queued_events
1326                .push_back(NetworkEvent::RequestFinished(request));
1327        }
1328    }
1329
1330    /// On network loading failed.
1331    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1332        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1333            request.failure_text = Some(event.error_text.clone());
1334            if let Some(interception_id) = request.interception_id.as_ref() {
1335                self.attempted_authentications
1336                    .remove(interception_id.as_ref());
1337            }
1338            self.queued_events
1339                .push_back(NetworkEvent::RequestFailed(request));
1340        }
1341    }
1342
1343    /// On request will be sent.
1344    fn on_request(
1345        &mut self,
1346        event: &EventRequestWillBeSent,
1347        interception_id: Option<InterceptionId>,
1348    ) {
1349        let mut redirect_chain = Vec::new();
1350        let mut redirect_location = None;
1351
1352        if let Some(redirect_resp) = &event.redirect_response {
1353            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1354                if is_redirect_status(redirect_resp.status) {
1355                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1356                        if redirect_resp.url != location {
1357                            let fixed_location = location.replace(&redirect_resp.url, "");
1358
1359                            if !fixed_location.is_empty() {
1360                                request.response.as_mut().map(|resp| {
1361                                    resp.headers.0["Location"] =
1362                                        serde_json::Value::String(fixed_location.clone());
1363                                });
1364                            }
1365
1366                            redirect_location = Some(fixed_location);
1367                        }
1368                    }
1369                }
1370
1371                self.handle_request_redirect(
1372                    &mut request,
1373                    if let Some(redirect_location) = redirect_location {
1374                        let mut redirect_resp = redirect_resp.clone();
1375
1376                        if !redirect_location.is_empty() {
1377                            redirect_resp.headers.0["Location"] =
1378                                serde_json::Value::String(redirect_location);
1379                        }
1380
1381                        redirect_resp
1382                    } else {
1383                        redirect_resp.clone()
1384                    },
1385                );
1386
1387                redirect_chain = std::mem::take(&mut request.redirect_chain);
1388                redirect_chain.push(request);
1389            }
1390        }
1391
1392        let request = HttpRequest::new(
1393            event.request_id.clone(),
1394            event.frame_id.clone(),
1395            interception_id,
1396            self.user_request_interception_enabled,
1397            redirect_chain,
1398        );
1399
1400        self.requests.insert(event.request_id.clone(), request);
1401        self.queued_events
1402            .push_back(NetworkEvent::Request(event.request_id.clone()));
1403    }
1404
    /// Handle request redirect.
    ///
    /// Attaches `response` to the redirected `request` and, if the request has
    /// an interception id, forgets any authentication attempt recorded for it.
    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
        request.set_response(response);
        // Read the interception id only after the response has been attached.
        if let Some(interception_id) = request.interception_id.as_ref() {
            self.attempted_authentications
                .remove(interception_id.as_ref());
        }
    }
1413}
1414
/// Events queued by the network manager (`queued_events`) for its owner to
/// consume.
#[derive(Debug)]
pub enum NetworkEvent {
    /// Send a CDP request, given as a method id and a JSON value
    /// (presumably the serialized command parameters - confirm).
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request with this id was registered.
    Request(RequestId),
    /// Response
    Response(RequestId),
    /// Request failed: loading failed, or the byte budget was exhausted when
    /// its response arrived.
    RequestFailed(HttpRequest),
    /// Request finished: its response was received or loading finished.
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the byte budget by one response.
    BytesConsumed(u64),
}
1430
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// Cloudflare challenge script; matches "/cdn-cgi/challenge-platform/".
    const CF_CHALLENGE: &str = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";

    /// Cloudflare Insights beacon; not part of the allow-list.
    const CF_INSIGHTS: &str = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";

    /// Cloudflare-specific expectations of the third-party allow-list.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(CF_CHALLENGE),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(CF_INSIGHTS),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // A random script that should not match your block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    /// Sanity checks for long-standing allow patterns. The Cloudflare cases
    /// live in `test_allowed_matcher_3rd_party`, so nothing is asserted twice.
    #[test]
    fn test_allowed_matcher_3rd_party_sanity() {
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        // Same URL in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        // In strict mode, it should still be considered blocked at decision time.
        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
        assert!(nm.blacklist_strict);
    }

    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs