chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18    SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21    fetch::{
22        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24    },
25    network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
lazy_static! {
    /// Substring patterns for popular JS libraries, common bundle/file names, and a few
    /// verified third-party script URLs. Compiled into [`ALLOWED_MATCHER`].
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",  // Covers WordPress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/enterprise.js",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://captcha.px-cloud.net/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];

    /// Aho-Corasick matcher built from `JS_FRAMEWORK_ALLOW`; matches a script URL by
    /// substring. Building panics on first access if the matcher cannot be constructed.
    ///
    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
    /// but we keep it for compatibility and other call sites.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// Substring patterns for verified third-party scripts (captcha / challenge providers,
    /// payment scripts, public CDNs). Compiled into [`ALLOWED_MATCHER_3RD_PARTY`].
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://www.google.com/recaptcha/enterprise.js",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/captcha/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
        "https://ct.captcha-delivery.com/", // NOTE(review): duplicate of an entry above
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.gstatic.com/recaptcha/",
        "https://www.google.com/recaptcha/api2/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://captcha.px-cloud.net/", // NOTE(review): duplicate of an entry above
        "https://s.perimeterx.net/",
        "https://client-api.arkoselabs.com/v2/",
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "discourse-cdn.com/",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];

    /// Aho-Corasick matcher built from `JS_FRAMEWORK_ALLOW_3RD_PARTY`; matches a script URL
    /// by substring. Building panics on first access if the matcher cannot be constructed.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path fragments that identify JS framework asset directories.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
            "_astro/", "_app/immutable"
        }
    };

    /// Content (MIME) types to ignore: archives, images, video, audio, and office documents.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource-type names treated as visual content (images, media, fonts).
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource-type names for auxiliary networking requests to ignore
    /// (CSP violation reports, manifests, prefetches, pings, and "Other").
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Manifest",
        "Other",
        "Prefetch",
        "Ping",
    };

    /// Case-insensitive `css` extension used when matching stylesheet URLs.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// The default init command chain: the network `EnableParams` command only.
    /// The chain is empty if the params fail to serialize.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// The init command chain that additionally tells the browser to ignore certificate
    /// errors (`SetIgnoreCertificateErrorsParams::new(true)`). Commands that fail to
    /// serialize are left out.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// The fetch-interception enable command: auth requests are handled and every URL
    /// (`*`) is paused at the request stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
        .build()
    };
}
224
/// Report whether an HTTP status code is one of the redirect codes this
/// handler recognises: 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
229
#[derive(Debug)]
/// The base network manager.
///
/// Holds request bookkeeping, interception/auth state, blocking toggles, and the queue of
/// `NetworkEvent`s that consumers drain through `poll()`.
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    // TODO put event in an Arc?
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; interception is actually enabled/disabled by
    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is considered enabled in CDP.
    ///
    /// This is the "actual state" flag. It is set by `enable_request_intercept()`,
    /// `disable_request_intercept()` and `authenticate()`.
    /// `update_protocol_request_interception()` only reads it: the desired state
    /// (`user_request_interception_enabled || credentials.is_some()`) is compared against
    /// this flag to decide whether a `Fetch.enable` / `Fetch.disable` command is queued.
    pub(crate) protocol_request_interception_enabled: bool,
    /// The network is offline.
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    /// Ignore visuals (no pings, prefetching, etc.).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics from rendering.
    pub block_analytics: bool,
    /// Load only the HTML.
    // NOTE(review): presumably every non-document resource is blocked when set; the
    // enforcement is not in this part of the file — confirm.
    pub only_html: bool,
    /// Is xml document?
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the number of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    #[cfg(feature = "_cache")]
    /// The cache site_key to use.
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
    whitelist_matcher: Option<AhoCorasick>,
    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
    blacklist_patterns: Vec<String>,
    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
    blacklist_matcher: Option<AhoCorasick>,
    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
    blacklist_strict: bool,
}
352
353impl NetworkManager {
354    /// A new network manager.
355    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
356        Self {
357            queued_events: Default::default(),
358            ignore_httpserrors,
359            requests: Default::default(),
360            requests_will_be_sent: Default::default(),
361            extra_headers: Default::default(),
362            request_id_to_interception_id: Default::default(),
363            user_cache_disabled: false,
364            attempted_authentications: Default::default(),
365            credentials: None,
366            block_all: false,
367            user_request_interception_enabled: false,
368            protocol_request_interception_enabled: false,
369            offline: false,
370            request_timeout,
371            ignore_visuals: false,
372            block_javascript: false,
373            block_stylesheets: false,
374            block_analytics: true,
375            only_html: false,
376            xml_document: false,
377            intercept_manager: NetworkInterceptManager::Unknown,
378            document_reload_tracker: 0,
379            document_target_url: String::new(),
380            document_target_domain: String::new(),
381            whitelist_patterns: Vec::new(),
382            whitelist_matcher: None,
383            blacklist_patterns: Vec::new(),
384            blacklist_matcher: None,
385            blacklist_strict: true,
386            max_bytes_allowed: None,
387            #[cfg(feature = "_cache")]
388            cache_site_key: None,
389            #[cfg(feature = "_cache")]
390            cache_policy: None,
391        }
392    }
393
394    /// Replace the whitelist patterns (compiled once).
395    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
396    where
397        I: IntoIterator<Item = S>,
398        S: Into<String>,
399    {
400        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
401        self.rebuild_whitelist_matcher();
402    }
403
404    /// Replace the blacklist patterns (compiled once).
405    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
406    where
407        I: IntoIterator<Item = S>,
408        S: Into<String>,
409    {
410        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
411        self.rebuild_blacklist_matcher();
412    }
413
414    /// Add one pattern (cheap) and rebuild (call this sparingly).
415    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
416        self.blacklist_patterns.push(pattern.into());
417        self.rebuild_blacklist_matcher();
418    }
419
420    /// Add many patterns and rebuild once.
421    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
422    where
423        I: IntoIterator<Item = S>,
424        S: Into<String>,
425    {
426        self.blacklist_patterns
427            .extend(patterns.into_iter().map(Into::into));
428        self.rebuild_blacklist_matcher();
429    }
430
431    /// Clear blacklist entirely.
432    pub fn clear_blacklist(&mut self) {
433        self.blacklist_patterns.clear();
434        self.blacklist_matcher = None;
435    }
436
437    /// Control precedence: when true, blacklist always wins.
438    pub fn set_blacklist_strict(&mut self, strict: bool) {
439        self.blacklist_strict = strict;
440    }
441
442    #[inline]
443    fn rebuild_blacklist_matcher(&mut self) {
444        if self.blacklist_patterns.is_empty() {
445            self.blacklist_matcher = None;
446            return;
447        }
448
449        let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
450        self.blacklist_matcher = AhoCorasick::new(refs).ok();
451    }
452
453    #[inline]
454    fn is_blacklisted(&self, url: &str) -> bool {
455        self.blacklist_matcher
456            .as_ref()
457            .map(|m| m.is_match(url))
458            .unwrap_or(false)
459    }
460
461    /// Add one pattern (cheap) and rebuild (call this sparingly).
462    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
463        self.whitelist_patterns.push(pattern.into());
464        self.rebuild_whitelist_matcher();
465    }
466
467    /// Add many patterns and rebuild once.
468    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
469    where
470        I: IntoIterator<Item = S>,
471        S: Into<String>,
472    {
473        self.whitelist_patterns
474            .extend(patterns.into_iter().map(Into::into));
475        self.rebuild_whitelist_matcher();
476    }
477
478    #[inline]
479    fn rebuild_whitelist_matcher(&mut self) {
480        if self.whitelist_patterns.is_empty() {
481            self.whitelist_matcher = None;
482            return;
483        }
484
485        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
486
487        // If building fails (shouldn’t for simple patterns), just disable matcher.
488        self.whitelist_matcher = AhoCorasick::new(refs).ok();
489    }
490
491    #[inline]
492    fn is_whitelisted(&self, url: &str) -> bool {
493        self.whitelist_matcher
494            .as_ref()
495            .map(|m| m.is_match(url))
496            .unwrap_or(false)
497    }
498
499    /// Commands to init the chain with.
500    pub fn init_commands(&self) -> CommandChain {
501        let cmds = if self.ignore_httpserrors {
502            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
503        } else {
504            INIT_CHAIN.clone()
505        };
506        CommandChain::new(cmds, self.request_timeout)
507    }
508
509    /// Push the CDP request.
510    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
511        let method = cmd.identifier();
512        if let Ok(params) = serde_json::to_value(cmd) {
513            self.queued_events
514                .push_back(NetworkEvent::SendCdpRequest((method, params)));
515        }
516    }
517
518    /// The next event to handle.
519    pub fn poll(&mut self) -> Option<NetworkEvent> {
520        self.queued_events.pop_front()
521    }
522
523    /// Get the extra headers.
524    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
525        &self.extra_headers
526    }
527
528    /// Set extra HTTP headers.
529    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
530        self.extra_headers = headers;
531        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
532        self.extra_headers.remove("Proxy-Authorization");
533        if !self.extra_headers.is_empty() {
534            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
535                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
536            }
537        }
538    }
539
540    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
541        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
542    }
543
544    pub fn set_block_all(&mut self, block_all: bool) {
545        self.block_all = block_all;
546    }
547
548    pub fn set_request_interception(&mut self, enabled: bool) {
549        self.user_request_interception_enabled = enabled;
550        self.update_protocol_request_interception();
551    }
552
553    pub fn set_cache_enabled(&mut self, enabled: bool) {
554        let run = self.user_cache_disabled != !enabled;
555        self.user_cache_disabled = !enabled;
556        if run {
557            self.update_protocol_cache_disabled();
558        }
559    }
560
561    /// Enable fetch interception.
562    pub fn enable_request_intercept(&mut self) {
563        self.protocol_request_interception_enabled = true;
564    }
565
566    /// Disable fetch interception.
567    pub fn disable_request_intercept(&mut self) {
568        self.protocol_request_interception_enabled = false;
569    }
570
571    /// Set the cache site key.
572    #[cfg(feature = "_cache")]
573    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
574        self.cache_site_key = cache_site_key;
575    }
576
577    /// Set the cache policy.
578    #[cfg(feature = "_cache")]
579    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
580        self.cache_policy = cache_policy;
581    }
582
583    pub fn update_protocol_cache_disabled(&mut self) {
584        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
585    }
586
587    pub fn authenticate(&mut self, credentials: Credentials) {
588        self.credentials = Some(credentials);
589        self.update_protocol_request_interception();
590        self.protocol_request_interception_enabled = true;
591    }
592
593    fn update_protocol_request_interception(&mut self) {
594        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
595
596        if enabled == self.protocol_request_interception_enabled {
597            return;
598        }
599
600        if enabled {
601            self.push_cdp_request(ENABLE_FETCH.clone())
602        } else {
603            self.push_cdp_request(DisableParams::default())
604        }
605    }
606
607    /// Blocklist-only script blocking.
608    /// Returns true only when the URL matches an explicit blocklist condition.
609    #[inline]
610    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
611        // If analytics blocking is off, skip all analytics tries.
612        let block_analytics = self.block_analytics;
613
614        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
615        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
616        {
617            return true;
618        }
619
620        // 2) Custom website block list (explicit).
621        if crate::handler::blockers::block_websites::block_website(url) {
622            return true;
623        }
624
625        // 3) Path-based explicit tries / fallbacks.
626        //
627        // We run these on:
628        // - path with leading slash ("/js/app.js")
629        // - path without leading slash ("js/app.js")
630        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
631        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
632            // Remove query/fragment so matching stays stable.
633            let p_slash = Self::strip_query_fragment(path_with_slash);
634            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
635
636            // Basename for filename-only lists.
637            let base = match p_slash.rsplit('/').next() {
638                Some(b) => b,
639                None => p_slash,
640            };
641
642            // ---- Trie checks ----
643            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
644            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
645                return true;
646            }
647            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
648                return true;
649            }
650            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
651                return true;
652            }
653
654            // Base-path ignore tries (framework noise / known ignorable script paths).
655            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
656            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
657                return true;
658            }
659
660            // Style path ignores only when visuals are ignored.
661            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
662                return true;
663            }
664        }
665
666        false
667    }
668
669    /// Extract the absolute URL path portion WITH the leading slash.
670    ///
671    /// Example:
672    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
673    #[inline]
674    fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
675        // find scheme separator
676        let idx = url.find("//")?;
677        let after_slashes = idx + 2;
678
679        // find first slash after host
680        let slash_rel = url[after_slashes..].find('/')?;
681        let slash_idx = after_slashes + slash_rel;
682
683        if slash_idx < url.len() {
684            Some(&url[slash_idx..])
685        } else {
686            None
687        }
688    }
689
690    /// Strip query string and fragment from a path-ish string.
691    ///
692    /// Example:
693    /// - "/a/b.js?x=1#y" -> "/a/b.js"
694    #[inline]
695    fn strip_query_fragment(s: &str) -> &str {
696        let q = s.find('?');
697        let h = s.find('#');
698
699        match (q, h) {
700            (None, None) => s,
701            (Some(i), None) => &s[..i],
702            (None, Some(i)) => &s[..i],
703            (Some(i), Some(j)) => &s[..i.min(j)],
704        }
705    }
706
707    /// Determine if the request should be skipped.
708    #[inline]
709    fn skip_xhr(
710        &self,
711        skip_networking: bool,
712        event: &EventRequestPaused,
713        network_event: bool,
714    ) -> bool {
715        // XHR check
716        if !skip_networking && network_event {
717            let request_url = event.request.url.as_str();
718
719            // check if part of ignore scripts.
720            let skip_analytics =
721                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
722
723            if skip_analytics {
724                true
725            } else if self.block_stylesheets || self.ignore_visuals {
726                let block_css = self.block_stylesheets;
727                let block_media = self.ignore_visuals;
728
729                let mut block_request = false;
730
731                if let Some(position) = request_url.rfind('.') {
732                    let hlen = request_url.len();
733                    let has_asset = hlen - position;
734
735                    if has_asset >= 3 {
736                        let next_position = position + 1;
737
738                        if block_media
739                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
740                                &request_url[next_position..].into(),
741                            )
742                        {
743                            block_request = true;
744                        } else if block_css {
745                            block_request =
746                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
747                                    .contains(&**CSS_EXTENSION)
748                        }
749                    }
750                }
751
752                if !block_request {
753                    block_request = ignore_script_xhr_media(request_url);
754                }
755
756                block_request
757            } else {
758                skip_networking
759            }
760        } else {
761            skip_networking
762        }
763    }
764
765    #[cfg(feature = "adblock")]
766    #[inline]
767    /// Detect if ad enabled.
768    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
769        if skip_networking {
770            true
771        } else {
772            block_ads(&event.request.url) || self.detect_ad(event)
773        }
774    }
775
776    /// When adblock feature is disabled, this is a no-op.
777    #[cfg(not(feature = "adblock"))]
778    #[inline]
779    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
780        use crate::handler::blockers::block_websites::block_ads;
781        if skip_networking {
782            true
783        } else {
784            block_ads(&event.request.url)
785        }
786    }
787
788    #[inline]
789    /// Fail request
790    fn fail_request_blocked(
791        &mut self,
792        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
793    ) {
794        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
795            request_id.clone(),
796            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
797        );
798        self.push_cdp_request(params);
799    }
800
801    #[inline]
802    /// Fulfill request
803    fn fulfill_request_empty_200(
804        &mut self,
805        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
806    ) {
807        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
808            request_id.clone(),
809            200,
810        );
811        self.push_cdp_request(params);
812    }
813
814    #[cfg(feature = "_cache")]
815    #[inline]
816    /// Fulfill a paused Fetch request from cached bytes + header map.
817    ///
818    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
819    fn fulfill_request_from_cache(
820        &mut self,
821        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
822        body: &[u8],
823        headers: &std::collections::HashMap<String, String>,
824        status: i64,
825    ) {
826        use crate::cdp::browser_protocol::fetch::HeaderEntry;
827        use crate::handler::network::fetch::FulfillRequestParams;
828        use base64::Engine;
829
830        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
831
832        for (k, v) in headers.iter() {
833            resp_headers.push(HeaderEntry {
834                name: k.clone().into(),
835                value: v.clone().into(),
836            });
837        }
838
839        let mut params = FulfillRequestParams::new(request_id.clone(), status);
840
841        // TODO: have this already encoded prior.
842        params.body = Some(
843            base64::engine::general_purpose::STANDARD
844                .encode(body)
845                .into(),
846        );
847
848        params.response_headers = Some(resp_headers);
849
850        self.push_cdp_request(params);
851    }
852
853    #[inline]
854    /// Continue the request url.
855    fn continue_request_with_url(
856        &mut self,
857        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
858        url: Option<&str>,
859        intercept_response: bool,
860    ) {
861        let mut params = ContinueRequestParams::new(request_id.clone());
862        if let Some(url) = url {
863            params.url = Some(url.to_string());
864            params.intercept_response = Some(intercept_response);
865        }
866        self.push_cdp_request(params);
867    }
868
    /// Handle a paused Fetch request and decide whether to block or allow it.
    ///
    /// Outcome for each event is one of:
    /// - no command at all (both interception flags are set),
    /// - a failed request (`block_all`),
    /// - an empty 200 response (the request was classified as skippable),
    /// - a response served from the session cache (`_cache` feature only),
    /// - a continued request, with a rewritten URL when a document
    ///   replacement was computed.
    ///
    /// The skip decision is built up in stages; later stages can both set
    /// and clear `skip_networking`, so their order matters.
    #[inline]
    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
        // Nothing is issued here when both interception flags are enabled.
        // NOTE(review): presumably the user-side interception layer resolves
        // the request in that case — confirm against the caller.
        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
            return;
        }

        let resource_type = &event.resource_type;

        // Global kill switch: fail the request instead of fulfilling it.
        if self.block_all {
            tracing::debug!(
                "Blocked (block_all): {:?} - {}",
                event.resource_type,
                event.request.url
            );
            return self.fail_request_blocked(&event.request_id);
        }

        // Pair this pause with its `requestWillBeSent` event: if that event was
        // already stored, register the request now; otherwise remember the
        // interception id so `on_request_will_be_sent` can pick it up later.
        if let Some(network_id) = event.network_id.as_ref() {
            if let Some(request_will_be_sent) =
                self.requests_will_be_sent.remove(network_id.as_ref())
            {
                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
            } else {
                self.request_id_to_interception_id
                    .insert(network_id.clone(), event.request_id.clone().into());
            }
        }

        // From here on, we handle the full decision tree.
        let javascript_resource = *resource_type == ResourceType::Script;
        let document_resource = *resource_type == ResourceType::Document;
        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);

        // Start with static / cheap skip checks.
        // (`block_all` is always false here because of the early return above.)
        let mut skip_networking =
            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());

        // Also short-circuit if we've reloaded this document too many times.
        if !skip_networking {
            skip_networking = self.document_reload_tracker >= 3;
        }

        // Handle document redirect / masking and track xml documents.
        // This also updates the tracked document URL/domain for documents.
        let (current_url_cow, had_replacer) =
            self.handle_document_replacement_and_tracking(event, document_resource);

        let current_url: &str = current_url_cow.as_ref();

        let blacklisted = self.is_blacklisted(current_url);

        // Non-strict blacklist: applied early, so the allow-list checks further
        // down may still clear it again.
        if !self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if !skip_networking {
            // Allow XSL for sitemap XML.
            // `skip_networking` is already false in this branch, so the effect
            // is that such XSL requests bypass the visuals/stylesheet check.
            if self.xml_document && current_url.ends_with(".xsl") {
                skip_networking = false;
            } else {
                skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
            }
        }

        skip_networking = self.detect_ad_if_enabled(event, skip_networking);

        // Ignore embedded scripts when only_html or ignore_visuals is set.
        if !skip_networking
            && self.block_javascript
            && (self.only_html || self.ignore_visuals)
            && (javascript_resource || document_resource)
        {
            skip_networking = ignore_script_embedded(current_url);
        }

        // Script policy: allow-by-default.
        // Block only if explicit block list patterns match.
        if !skip_networking && javascript_resource {
            skip_networking = self.should_block_script_blocklist_only(current_url);
        }

        // XHR / data resources.
        skip_networking = self.skip_xhr(skip_networking, event, network_resource);

        // Custom interception layer.
        if !skip_networking && (javascript_resource || network_resource || document_resource) {
            skip_networking = self.intercept_manager.intercept_detection(
                current_url,
                self.ignore_visuals,
                network_resource,
            );
        }

        // Custom website block list.
        if !skip_networking && (javascript_resource || network_resource) {
            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
        }

        // whitelist 3rd party
        // not required unless explicit blocking.
        // Only scripts are rescued by the built-in third-party allow matcher.
        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
        {
            skip_networking = false;
        }

        // check if the url is in the whitelist.
        if skip_networking && self.is_whitelisted(current_url) {
            skip_networking = false;
        }

        // Strict blacklist: applied last so no allow-list can override it.
        if self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if skip_networking {
            // Blocked requests are answered with an empty 200 rather than failed.
            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
            self.fulfill_request_empty_200(&event.request_id);
        } else {
            #[cfg(feature = "_cache")]
            {
                if let (Some(policy), Some(cache_site_key)) =
                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
                {
                    // Cache key is "<method>:<url>"; this shadows `current_url`
                    // only inside this block.
                    let current_url = format!("{}:{}", event.request.method, &current_url);

                    if let Some((res, cache_policy)) =
                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
                    {
                        if policy.allows_cached(&cache_policy) {
                            tracing::debug!(
                                "Remote Cached: {:?} - {}",
                                resource_type,
                                &current_url
                            );
                            return self.fulfill_request_from_cache(
                                &event.request_id,
                                &res.body,
                                &res.headers,
                                res.status as i64,
                            );
                        }
                    }
                }
            }

            // No cached answer was used: let the request go to the network.
            // A replacement URL is only passed when one was computed, and in
            // that case `intercept_response` is sent as `false`.
            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
            self.continue_request_with_url(
                &event.request_id,
                if had_replacer {
                    Some(current_url)
                } else {
                    None
                },
                !had_replacer,
            );
        }
    }
1027
1028    /// Shared "visuals + basic blocking" logic.
1029    ///
1030    /// IMPORTANT: Scripts are NOT blocked here anymore.
1031    /// Scripts are allowed by default and only blocked via explicit blocklists
1032    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1033    #[inline]
1034    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1035        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1036            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1037    }
1038
1039    /// Does the network manager have a target domain?
1040    pub fn has_target_domain(&self) -> bool {
1041        !self.document_target_url.is_empty()
1042    }
1043
1044    /// Set the target page url for tracking.
1045    pub fn set_page_url(&mut self, page_target_url: String) {
1046        let host_base = host_and_rest(&page_target_url)
1047            .map(|(h, _)| base_domain_from_host(h))
1048            .unwrap_or("");
1049
1050        self.document_target_domain = host_base.to_string();
1051        self.document_target_url = page_target_url;
1052    }
1053
1054    /// Clear the initial target domain on every navigation.
1055    pub fn clear_target_domain(&mut self) {
1056        self.document_reload_tracker = 0;
1057        self.document_target_url = Default::default();
1058        self.document_target_domain = Default::default();
1059    }
1060
1061    /// Handles:
1062    /// - document reload tracking (`document_reload_tracker`)
1063    /// - redirect masking / replacement
1064    /// - xml document detection (`xml_document`)
1065    /// - `document_target_url` updates
1066    ///
1067    /// Returns (current_url, had_replacer).
1068    #[inline]
1069    fn handle_document_replacement_and_tracking<'a>(
1070        &mut self,
1071        event: &'a EventRequestPaused,
1072        document_resource: bool,
1073    ) -> (Cow<'a, str>, bool) {
1074        let mut replacer: Option<String> = None;
1075        let current_url = event.request.url.as_str();
1076
1077        if document_resource {
1078            if self.document_target_url == current_url {
1079                self.document_reload_tracker += 1;
1080            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1081            {
1082                let (http_document_replacement, mut https_document_replacement) =
1083                    if self.document_target_url.starts_with("http://") {
1084                        (
1085                            self.document_target_url.replacen("http://", "http//", 1),
1086                            self.document_target_url.replacen("http://", "https://", 1),
1087                        )
1088                    } else {
1089                        (
1090                            self.document_target_url.replacen("https://", "https//", 1),
1091                            self.document_target_url.replacen("https://", "http://", 1),
1092                        )
1093                    };
1094
1095                // Track trailing slash to restore later.
1096                let trailing = https_document_replacement.ends_with('/');
1097                if trailing {
1098                    https_document_replacement.pop();
1099                }
1100                if https_document_replacement.ends_with('/') {
1101                    https_document_replacement.pop();
1102                }
1103
1104                let redirect_mask = format!(
1105                    "{}{}",
1106                    https_document_replacement, http_document_replacement
1107                );
1108
1109                if current_url == redirect_mask {
1110                    replacer = Some(if trailing {
1111                        format!("{}/", https_document_replacement)
1112                    } else {
1113                        https_document_replacement
1114                    });
1115                }
1116            }
1117
1118            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1119                self.xml_document = true;
1120            }
1121
1122            // Track last seen document URL.
1123            self.document_target_url = event.request.url.clone();
1124            self.document_target_domain = host_and_rest(&self.document_target_url)
1125                .map(|(h, _)| base_domain_from_host(h).to_string())
1126                .unwrap_or_default();
1127        }
1128
1129        let current_url_cow = match replacer {
1130            Some(r) => Cow::Owned(r),
1131            None => Cow::Borrowed(event.request.url.as_str()),
1132        };
1133
1134        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1135        (current_url_cow, had_replacer)
1136    }
1137
1138    /// Perform a page intercept for chrome
1139    #[cfg(feature = "adblock")]
1140    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1141        use adblock::{
1142            lists::{FilterSet, ParseOptions, RuleTypes},
1143            Engine,
1144        };
1145
1146        lazy_static::lazy_static! {
1147            static ref AD_ENGINE: Engine = {
1148                let mut filter_set = FilterSet::new(false);
1149                let mut rules = ParseOptions::default();
1150                rules.rule_types = RuleTypes::All;
1151
1152                filter_set.add_filters(
1153                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1154                    rules,
1155                );
1156
1157                Engine::from_filter_set(filter_set, true)
1158            };
1159        };
1160
1161        let blockable = ResourceType::Image == event.resource_type
1162            || event.resource_type == ResourceType::Media
1163            || event.resource_type == ResourceType::Stylesheet
1164            || event.resource_type == ResourceType::Document
1165            || event.resource_type == ResourceType::Fetch
1166            || event.resource_type == ResourceType::Xhr;
1167
1168        let u = &event.request.url;
1169
1170        let block_request = blockable
1171            // set it to example.com for 3rd party handling is_same_site
1172        && {
1173            let request = adblock::request::Request::preparsed(
1174                 &u,
1175                 "example.com",
1176                 "example.com",
1177                 &event.resource_type.as_ref().to_lowercase(),
1178                 !event.request.is_same_site.unwrap_or_default());
1179
1180            AD_ENGINE.check_network_request(&request).matched
1181        };
1182
1183        block_request
1184    }
1185
1186    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1187        let response = if self
1188            .attempted_authentications
1189            .contains(event.request_id.as_ref())
1190        {
1191            AuthChallengeResponseResponse::CancelAuth
1192        } else if self.credentials.is_some() {
1193            self.attempted_authentications
1194                .insert(event.request_id.clone().into());
1195            AuthChallengeResponseResponse::ProvideCredentials
1196        } else {
1197            AuthChallengeResponseResponse::Default
1198        };
1199
1200        let mut auth = AuthChallengeResponse::new(response);
1201        if let Some(creds) = self.credentials.clone() {
1202            auth.username = Some(creds.username);
1203            auth.password = Some(creds.password);
1204        }
1205        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1206    }
1207
1208    /// Set the page offline network emulation condition.
1209    pub fn set_offline_mode(&mut self, value: bool) {
1210        if self.offline == value {
1211            return;
1212        }
1213        self.offline = value;
1214        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1215            .offline(self.offline)
1216            .latency(0)
1217            .download_throughput(-1.)
1218            .upload_throughput(-1.)
1219            .build()
1220        {
1221            self.push_cdp_request(network);
1222        }
1223    }
1224
1225    /// Request interception doesn't happen for data URLs with Network Service.
1226    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1227        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1228            if let Some(interception_id) = self
1229                .request_id_to_interception_id
1230                .remove(event.request_id.as_ref())
1231            {
1232                self.on_request(event, Some(interception_id));
1233            } else {
1234                // TODO remove the clone for event
1235                self.requests_will_be_sent
1236                    .insert(event.request_id.clone(), event.clone());
1237            }
1238        } else {
1239            self.on_request(event, None);
1240        }
1241    }
1242
1243    /// The request was served from the cache.
1244    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1245        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1246            request.from_memory_cache = true;
1247        }
1248    }
1249
1250    /// On network response received.
1251    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1252        let mut request_failed = false;
1253
1254        // Track how many bytes we actually deducted from this target.
1255        let mut deducted: u64 = 0;
1256
1257        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1258            let before = *max_bytes;
1259
1260            // encoded_data_length -> saturating cast to u64
1261            let received_bytes: u64 = event.response.encoded_data_length as u64;
1262
1263            // Safe parse of Content-Length
1264            let content_length: Option<u64> = event
1265                .response
1266                .headers
1267                .inner()
1268                .get("content-length")
1269                .and_then(|v| v.as_str())
1270                .and_then(|s| s.trim().parse::<u64>().ok());
1271
1272            // Deduct what we actually received
1273            *max_bytes = max_bytes.saturating_sub(received_bytes);
1274
1275            // If the declared size can't fit, zero out now
1276            if let Some(cl) = content_length {
1277                if cl > *max_bytes {
1278                    *max_bytes = 0;
1279                }
1280            }
1281
1282            request_failed = *max_bytes == 0;
1283
1284            // Compute exact delta deducted on this event
1285            deducted = before.saturating_sub(*max_bytes);
1286        }
1287
1288        // Bubble up the deduction (even if request continues)
1289        if deducted > 0 {
1290            self.queued_events
1291                .push_back(NetworkEvent::BytesConsumed(deducted));
1292        }
1293
1294        // block all network request moving forward.
1295        if request_failed && self.max_bytes_allowed.is_some() {
1296            self.set_block_all(true);
1297        }
1298
1299        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1300            request.set_response(event.response.clone());
1301            self.queued_events.push_back(if request_failed {
1302                NetworkEvent::RequestFailed(request)
1303            } else {
1304                NetworkEvent::RequestFinished(request)
1305            });
1306        }
1307    }
1308
1309    /// On network loading finished.
1310    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1311        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1312            if let Some(interception_id) = request.interception_id.as_ref() {
1313                self.attempted_authentications
1314                    .remove(interception_id.as_ref());
1315            }
1316            self.queued_events
1317                .push_back(NetworkEvent::RequestFinished(request));
1318        }
1319    }
1320
1321    /// On network loading failed.
1322    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1323        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1324            request.failure_text = Some(event.error_text.clone());
1325            if let Some(interception_id) = request.interception_id.as_ref() {
1326                self.attempted_authentications
1327                    .remove(interception_id.as_ref());
1328            }
1329            self.queued_events
1330                .push_back(NetworkEvent::RequestFailed(request));
1331        }
1332    }
1333
1334    /// On request will be sent.
1335    fn on_request(
1336        &mut self,
1337        event: &EventRequestWillBeSent,
1338        interception_id: Option<InterceptionId>,
1339    ) {
1340        let mut redirect_chain = Vec::new();
1341        let mut redirect_location = None;
1342
1343        if let Some(redirect_resp) = &event.redirect_response {
1344            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1345                if is_redirect_status(redirect_resp.status) {
1346                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1347                        if redirect_resp.url != location {
1348                            let fixed_location = location.replace(&redirect_resp.url, "");
1349
1350                            if !fixed_location.is_empty() {
1351                                request.response.as_mut().map(|resp| {
1352                                    resp.headers.0["Location"] =
1353                                        serde_json::Value::String(fixed_location.clone());
1354                                });
1355                            }
1356
1357                            redirect_location = Some(fixed_location);
1358                        }
1359                    }
1360                }
1361
1362                self.handle_request_redirect(
1363                    &mut request,
1364                    if let Some(redirect_location) = redirect_location {
1365                        let mut redirect_resp = redirect_resp.clone();
1366
1367                        if !redirect_location.is_empty() {
1368                            redirect_resp.headers.0["Location"] =
1369                                serde_json::Value::String(redirect_location);
1370                        }
1371
1372                        redirect_resp
1373                    } else {
1374                        redirect_resp.clone()
1375                    },
1376                );
1377
1378                redirect_chain = std::mem::take(&mut request.redirect_chain);
1379                redirect_chain.push(request);
1380            }
1381        }
1382
1383        let request = HttpRequest::new(
1384            event.request_id.clone(),
1385            event.frame_id.clone(),
1386            interception_id,
1387            self.user_request_interception_enabled,
1388            redirect_chain,
1389        );
1390
1391        self.requests.insert(event.request_id.clone(), request);
1392        self.queued_events
1393            .push_back(NetworkEvent::Request(event.request_id.clone()));
1394    }
1395
1396    /// Handle request redirect.
1397    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1398        request.set_response(response);
1399        if let Some(interception_id) = request.interception_id.as_ref() {
1400            self.attempted_authentications
1401                .remove(interception_id.as_ref());
1402        }
1403    }
1404}
1405
/// Events the network manager pushes onto its `queued_events` queue.
#[derive(Debug)]
pub enum NetworkEvent {
    /// Send a CDP request: a method id paired with a JSON value
    /// (presumably the serialized command parameters — confirm at the producer).
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request was registered; carries its network request id.
    Request(RequestId),
    /// Response for the given request id.
    Response(RequestId),
    /// Request failed: loading failed, or the byte budget ran out while
    /// its response was being processed.
    RequestFailed(HttpRequest),
    /// Request finished (response received or loading finished).
    RequestFinished(HttpRequest),
    /// Bytes consumed: the amount deducted from the byte budget by a response.
    BytesConsumed(u64),
}
1421
#[cfg(test)]
mod tests {
    //! Unit tests for the allow/block matchers of the network manager.
    //!
    //! These tests exercise the matchers directly; the full decision logic of
    //! `on_fetch_request_paused` needs CDP events and is covered in integration.

    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// The third-party allow matcher accepts known-good scripts and leaves
    /// everything else alone.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// Scripts are allowed unless a blocklist pattern matches.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // A random script that should not match your block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    /// A script matching the ignore trie / blocklist is blocked.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    /// Runtime blacklist patterns match the URLs that contain them.
    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    /// With strict mode on, a URL present in both lists still reports as
    /// blacklisted and the strict flag is set.
    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        // Same URL in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        // In strict mode, it should still be considered blocked at decision time.
        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
        assert!(nm.blacklist_strict);
    }

    /// With strict mode off, both matchers report the URL and the strict
    /// flag stays cleared, leaving the whitelist free to override.
    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs