chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18    SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21    fetch::{
22        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24    },
25    network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
42lazy_static! {
43    /// General patterns for popular libraries and resources
44    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
45        "jquery",           // Covers jquery.min.js, jquery.js, etc.
46        "angular",
47        "react",            // Covers all React-related patterns
48        "vue",              // Covers all Vue-related patterns
49        "bootstrap",
50        "d3",
51        "lodash",
52        "ajax",
53        "application",
54        "app",              // Covers general app scripts like app.js
55        "main",
56        "index",
57        "bundle",
58        "vendor",
59        "runtime",
60        "polyfill",
61        "scripts",
62        "es2015.",
63        "es2020.",
64        "webpack",
65        "captcha",
66        "client",
67        "/cdn-cgi/challenge-platform/",
68        "/wp-content/js/",  // Covers Wordpress content
69        // Verified 3rd parties for request
70        "https://m.stripe.network/",
71        "https://challenges.cloudflare.com/",
72        "https://www.google.com/recaptcha/",
73        "https://google.com/recaptcha/api.js",
74        "https://www.gstatic.com/recaptcha/",
75        "https://captcha.px-cloud.net/",
76        "https://geo.captcha-delivery.com/",
77        "https://api.leminnow.com/captcha/",
78        "https://cdn.auth0.com/js/lock/",
79        "https://captcha.gtimg.com",
80        "https://client-api.arkoselabs.com/",
81        "https://www.capy.me/puzzle/",
82        "https://newassets.hcaptcha.com/",
83        "https://cdn.auth0.com/client",
84        "https://js.stripe.com/",
85        "https://cdn.prod.website-files.com/", // webflow cdn scripts
86        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
87        "https://code.jquery.com/jquery-"
88    ];
89
90    /// Determine if a script should be rendered in the browser by name.
91    ///
92    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
93    /// but we keep it for compatibility and other call sites.
94    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
95
96    /// General patterns for popular libraries and resources
97    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
98        // Verified 3rd parties for request
99        "https://m.stripe.network/",
100        "https://challenges.cloudflare.com/",
101        "https://js.stripe.com/",
102        "https://cdn.prod.website-files.com/", // webflow cdn scripts
103        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
104        "https://code.jquery.com/jquery-",
105        "https://ct.captcha-delivery.com/",
106        "https://geo.captcha-delivery.com/",
107        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
108        "https://ct.captcha-delivery.com/",
109        "https://cdn.auth0.com/client",
110        "https://captcha.px-cloud.net/",
111        "https://www.capy.me/puzzle/",
112        "https://www.gstatic.com/recaptcha/",
113        "https://google.com/recaptcha/",
114        "https://www.google.com/recaptcha/",
115        "https://www.recaptcha.net/recaptcha/",
116        "https://js.hcaptcha.com/1/api.js",
117        "https://hcaptcha.com/1/api.js",
118        "https://js.datadome.co/tags.js",
119        "https://api-js.datadome.co/",
120        "https://client.perimeterx.net/",
121        "https://captcha.px-cdn.net/",
122        "https://newassets.hcaptcha.com/",
123        "https://captcha.px-cloud.net/",
124        "https://s.perimeterx.net/",
125        "https://api.leminnow.com/captcha/",
126        "https://client-api.arkoselabs.com/",
127        "https://static.geetest.com/v4/gt4.js",
128        "https://static.geetest.com/",
129        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
130        "https://cdn.perfdrive.com/aperture/",
131        "https://assets.queue-it.net/",
132        "discourse-cdn.com/",
133        "hcaptcha.com",
134        "/cdn-cgi/challenge-platform/",
135        "/_Incapsula_Resource"
136    ];
137
138    /// Determine if a script should be rendered in the browser by name.
139    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
140
141    /// path of a js framework
142    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
143        phf::phf_set! {
144            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
145            "_astro/", "_app/immutable"
146        }
147    };
148
149    /// Ignore the content types.
150    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
151        "application/pdf",
152        "application/zip",
153        "application/x-rar-compressed",
154        "application/x-tar",
155        "image/png",
156        "image/jpeg",
157        "image/gif",
158        "image/bmp",
159        "image/webp",
160        "image/svg+xml",
161        "video/mp4",
162        "video/x-msvideo",
163        "video/x-matroska",
164        "video/webm",
165        "audio/mpeg",
166        "audio/ogg",
167        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
168        "application/vnd.ms-excel",
169        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
170        "application/vnd.ms-powerpoint",
171        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
172        "application/x-7z-compressed",
173        "application/x-rpm",
174        "application/x-shockwave-flash",
175        "application/rtf",
176    };
177
178    /// Ignore the resources for visual content types.
179    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
180        "Image",
181        "Media",
182        "Font"
183    };
184
185    /// Ignore the resources for visual content types.
186    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
187        "CspViolationReport",
188        "Manifest",
189        "Other",
190        "Prefetch",
191        "Ping",
192    };
193
194    /// Case insenstive css matching
195    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
196
197    /// The command chain.
198    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
199        let enable = EnableParams::default();
200
201        if let Ok(c) = serde_json::to_value(&enable) {
202            vec![(enable.identifier(), c)]
203        } else {
204            vec![]
205        }
206    };
207
208    /// The command chain with https ignore.
209    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
210        let enable = EnableParams::default();
211        let mut v = vec![];
212        if let Ok(c) = serde_json::to_value(&enable) {
213            v.push((enable.identifier(), c));
214        }
215        let ignore = SetIgnoreCertificateErrorsParams::new(true);
216        if let Ok(ignored) = serde_json::to_value(&ignore) {
217            v.push((ignore.identifier(), ignored));
218        }
219
220        v
221    };
222
223    /// Enable the fetch intercept command
224    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
225        fetch::EnableParams::builder()
226        .handle_auth_requests(true)
227        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
228        .build()
229    };
230}
231
/// Returns `true` when `status` is treated as a redirect.
///
/// Only 301, 302, 303, 307 and 308 qualify; other 3xx codes (300, 304, ...) do not.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
236
237#[derive(Debug)]
238/// The base network manager.
239pub struct NetworkManager {
240    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
241    ///
242    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
243    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
244    /// Consumers pull from this queue via `poll()`.
245    queued_events: VecDeque<NetworkEvent>,
246    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
247    ///
248    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
249    /// certificates (self-signed, expired, MITM proxies, etc.).
250    ignore_httpserrors: bool,
251    /// Active in-flight requests keyed by CDP `RequestId`.
252    ///
253    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
254    /// and final state used to emit `RequestFinished` / `RequestFailed`.
255    requests: HashMap<RequestId, HttpRequest>,
256    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
257    /// `Fetch.requestPaused` arrives later (or vice versa).
258    ///
259    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
260    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
261    // TODO put event in an Arc?
262    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
263    /// Extra HTTP headers to apply to subsequent network requests via CDP.
264    ///
265    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
266    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
267    extra_headers: std::collections::HashMap<String, String>,
268    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
269    ///
270    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
271    /// store the interception id here so it can be attached to the `HttpRequest` once the
272    /// network request is observed.
273    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
274    /// Whether the user has disabled the browser cache.
275    ///
276    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
277    /// `set_cache_enabled()`. Internally the field is stored as “disabled” to match the CDP API.
278    user_cache_disabled: bool,
279    /// Tracks which requests have already attempted authentication.
280    ///
281    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
282    /// authentication challenges (407/401). Once a request id is present here, subsequent
283    /// challenges for the same request are canceled.
284    attempted_authentications: HashSet<RequestId>,
285    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
286    ///
287    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
288    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
289    credentials: Option<Credentials>,
290    /// User-facing toggle indicating whether request interception is desired.
291    ///
292    /// This is the “intent” flag controlled by `set_request_interception()`. On its own it does
293    /// not guarantee interception is active; interception is actually enabled/disabled by
294    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
295    ///
296    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
297    /// enabled to satisfy auth challenges.
298    pub(crate) user_request_interception_enabled: bool,
299    /// Hard kill-switch to block all network traffic.
300    ///
301    /// When `true`, the manager immediately blocks requests (typically via
302    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
303    /// and short-circuits most decision logic. This is used for safety conditions such as
304    /// exceeding `max_bytes_allowed` or other runtime protections.
305    block_all: bool,
306    /// Tracks whether the Fetch interception protocol is currently enabled in CDP.
307    ///
308    /// This is the “actual state” flag that reflects whether we have sent `Fetch.enable` or
309    /// `Fetch.disable` to the browser. It is updated by `update_protocol_request_interception()`
310    /// when `user_request_interception_enabled` or `credentials` change.
311    pub(crate) protocol_request_interception_enabled: bool,
312    /// The network is offline.
313    offline: bool,
314    /// The page request timeout.
315    pub request_timeout: Duration,
316    // made_request: bool,
317    /// Ignore visuals (no pings, prefetching, and etc).
318    pub ignore_visuals: bool,
319    /// Block CSS stylesheets.
320    pub block_stylesheets: bool,
321    /// Block javascript that is not critical to rendering.
322    ///
323    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
324    /// by itself (it remains for config compatibility).
325    pub block_javascript: bool,
326    /// Block analytics from rendering
327    pub block_analytics: bool,
328    /// Only html from loading.
329    pub only_html: bool,
330    /// Is xml document?
331    pub xml_document: bool,
332    /// The custom intercept handle logic to run on the website.
333    pub intercept_manager: NetworkInterceptManager,
334    /// Track the amount of times the document reloaded.
335    pub document_reload_tracker: u8,
336    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
337    pub document_target_url: String,
338    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
339    pub document_target_domain: String,
340    /// The max bytes to receive.
341    pub max_bytes_allowed: Option<u64>,
342    #[cfg(feature = "_cache")]
343    /// The cache site_key to use.
344    pub cache_site_key: Option<String>,
345    /// The cache policy to use.
346    #[cfg(feature = "_cache")]
347    pub cache_policy: Option<BasicCachePolicy>,
348    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
349    whitelist_patterns: Vec<String>,
350    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
351    whitelist_matcher: Option<AhoCorasick>,
352    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
353    blacklist_patterns: Vec<String>,
354    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
355    blacklist_matcher: Option<AhoCorasick>,
356    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
357    blacklist_strict: bool,
358}
359
360impl NetworkManager {
361    /// A new network manager.
362    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
363        Self {
364            queued_events: Default::default(),
365            ignore_httpserrors,
366            requests: Default::default(),
367            requests_will_be_sent: Default::default(),
368            extra_headers: Default::default(),
369            request_id_to_interception_id: Default::default(),
370            user_cache_disabled: false,
371            attempted_authentications: Default::default(),
372            credentials: None,
373            block_all: false,
374            user_request_interception_enabled: false,
375            protocol_request_interception_enabled: false,
376            offline: false,
377            request_timeout,
378            ignore_visuals: false,
379            block_javascript: false,
380            block_stylesheets: false,
381            block_analytics: true,
382            only_html: false,
383            xml_document: false,
384            intercept_manager: NetworkInterceptManager::Unknown,
385            document_reload_tracker: 0,
386            document_target_url: String::new(),
387            document_target_domain: String::new(),
388            whitelist_patterns: Vec::new(),
389            whitelist_matcher: None,
390            blacklist_patterns: Vec::new(),
391            blacklist_matcher: None,
392            blacklist_strict: true,
393            max_bytes_allowed: None,
394            #[cfg(feature = "_cache")]
395            cache_site_key: None,
396            #[cfg(feature = "_cache")]
397            cache_policy: None,
398        }
399    }
400
401    /// Replace the whitelist patterns (compiled once).
402    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
403    where
404        I: IntoIterator<Item = S>,
405        S: Into<String>,
406    {
407        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
408        self.rebuild_whitelist_matcher();
409    }
410
411    /// Replace the blacklist patterns (compiled once).
412    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
413    where
414        I: IntoIterator<Item = S>,
415        S: Into<String>,
416    {
417        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
418        self.rebuild_blacklist_matcher();
419    }
420
421    /// Add one pattern (cheap) and rebuild (call this sparingly).
422    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
423        self.blacklist_patterns.push(pattern.into());
424        self.rebuild_blacklist_matcher();
425    }
426
427    /// Add many patterns and rebuild once.
428    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
429    where
430        I: IntoIterator<Item = S>,
431        S: Into<String>,
432    {
433        self.blacklist_patterns
434            .extend(patterns.into_iter().map(Into::into));
435        self.rebuild_blacklist_matcher();
436    }
437
438    /// Clear blacklist entirely.
439    pub fn clear_blacklist(&mut self) {
440        self.blacklist_patterns.clear();
441        self.blacklist_matcher = None;
442    }
443
444    /// Control precedence: when true, blacklist always wins.
445    pub fn set_blacklist_strict(&mut self, strict: bool) {
446        self.blacklist_strict = strict;
447    }
448
449    #[inline]
450    fn rebuild_blacklist_matcher(&mut self) {
451        if self.blacklist_patterns.is_empty() {
452            self.blacklist_matcher = None;
453            return;
454        }
455
456        let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
457        self.blacklist_matcher = AhoCorasick::new(refs).ok();
458    }
459
460    #[inline]
461    fn is_blacklisted(&self, url: &str) -> bool {
462        self.blacklist_matcher
463            .as_ref()
464            .map(|m| m.is_match(url))
465            .unwrap_or(false)
466    }
467
468    /// Add one pattern (cheap) and rebuild (call this sparingly).
469    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
470        self.whitelist_patterns.push(pattern.into());
471        self.rebuild_whitelist_matcher();
472    }
473
474    /// Add many patterns and rebuild once.
475    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
476    where
477        I: IntoIterator<Item = S>,
478        S: Into<String>,
479    {
480        self.whitelist_patterns
481            .extend(patterns.into_iter().map(Into::into));
482        self.rebuild_whitelist_matcher();
483    }
484
485    #[inline]
486    fn rebuild_whitelist_matcher(&mut self) {
487        if self.whitelist_patterns.is_empty() {
488            self.whitelist_matcher = None;
489            return;
490        }
491
492        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
493
494        // If building fails (shouldn’t for simple patterns), just disable matcher.
495        self.whitelist_matcher = AhoCorasick::new(refs).ok();
496    }
497
498    #[inline]
499    fn is_whitelisted(&self, url: &str) -> bool {
500        self.whitelist_matcher
501            .as_ref()
502            .map(|m| m.is_match(url))
503            .unwrap_or(false)
504    }
505
506    /// Commands to init the chain with.
507    pub fn init_commands(&self) -> CommandChain {
508        let cmds = if self.ignore_httpserrors {
509            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
510        } else {
511            INIT_CHAIN.clone()
512        };
513        CommandChain::new(cmds, self.request_timeout)
514    }
515
516    /// Push the CDP request.
517    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
518        let method = cmd.identifier();
519        if let Ok(params) = serde_json::to_value(cmd) {
520            self.queued_events
521                .push_back(NetworkEvent::SendCdpRequest((method, params)));
522        }
523    }
524
525    /// The next event to handle.
526    pub fn poll(&mut self) -> Option<NetworkEvent> {
527        self.queued_events.pop_front()
528    }
529
530    /// Get the extra headers.
531    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
532        &self.extra_headers
533    }
534
535    /// Set extra HTTP headers.
536    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
537        self.extra_headers = headers;
538        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
539        self.extra_headers.remove("Proxy-Authorization");
540        if !self.extra_headers.is_empty() {
541            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
542                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
543            }
544        }
545    }
546
547    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
548        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
549    }
550
551    pub fn set_block_all(&mut self, block_all: bool) {
552        self.block_all = block_all;
553    }
554
555    pub fn set_request_interception(&mut self, enabled: bool) {
556        self.user_request_interception_enabled = enabled;
557        self.update_protocol_request_interception();
558    }
559
560    pub fn set_cache_enabled(&mut self, enabled: bool) {
561        let run = self.user_cache_disabled != !enabled;
562        self.user_cache_disabled = !enabled;
563        if run {
564            self.update_protocol_cache_disabled();
565        }
566    }
567
568    /// Enable fetch interception.
569    pub fn enable_request_intercept(&mut self) {
570        self.protocol_request_interception_enabled = true;
571    }
572
573    /// Disable fetch interception.
574    pub fn disable_request_intercept(&mut self) {
575        self.protocol_request_interception_enabled = false;
576    }
577
/// Replace the cache site key (`None` clears it).
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    self.cache_site_key = cache_site_key;
}
583
/// Replace the cache policy (`None` clears it).
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    self.cache_policy = cache_policy;
}
589
590    pub fn update_protocol_cache_disabled(&mut self) {
591        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
592    }
593
594    pub fn authenticate(&mut self, credentials: Credentials) {
595        self.credentials = Some(credentials);
596        self.update_protocol_request_interception();
597        self.protocol_request_interception_enabled = true;
598    }
599
600    fn update_protocol_request_interception(&mut self) {
601        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
602
603        if enabled == self.protocol_request_interception_enabled {
604            return;
605        }
606
607        if enabled {
608            self.push_cdp_request(ENABLE_FETCH.clone())
609        } else {
610            self.push_cdp_request(DisableParams::default())
611        }
612    }
613
614    /// Blocklist-only script blocking.
615    /// Returns true only when the URL matches an explicit blocklist condition.
616    #[inline]
617    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
618        // If analytics blocking is off, skip all analytics tries.
619        let block_analytics = self.block_analytics;
620
621        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
622        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
623        {
624            return true;
625        }
626
627        // 2) Custom website block list (explicit).
628        if crate::handler::blockers::block_websites::block_website(url) {
629            return true;
630        }
631
632        // 3) Path-based explicit tries / fallbacks.
633        //
634        // We run these on:
635        // - path with leading slash ("/js/app.js")
636        // - path without leading slash ("js/app.js")
637        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
638        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
639            // Remove query/fragment so matching stays stable.
640            let p_slash = Self::strip_query_fragment(path_with_slash);
641            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
642
643            // Basename for filename-only lists.
644            let base = match p_slash.rsplit('/').next() {
645                Some(b) => b,
646                None => p_slash,
647            };
648
649            // ---- Trie checks ----
650            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
651            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
652                return true;
653            }
654            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
655                return true;
656            }
657            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
658                return true;
659            }
660
661            // Base-path ignore tries (framework noise / known ignorable script paths).
662            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
663            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
664                return true;
665            }
666
667            // Style path ignores only when visuals are ignored.
668            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
669                return true;
670            }
671        }
672
673        false
674    }
675
/// Extract the absolute URL path portion WITH the leading slash.
///
/// The path starts at the first `/` after the `//` authority marker and runs to the end
/// of the string (query and fragment are kept). Returns `None` when there is no `//`,
/// or when the authority is followed directly by `?` / `#` or by nothing, i.e. the URL
/// has no path. A `/` that only appears inside the query or fragment is not a path.
///
/// Examples:
/// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
/// - "https://example.com?next=/gtag/js" -> None
#[inline]
fn url_path_with_leading_slash(url: &str) -> Option<&str> {
    // Everything after the scheme separator: "host[:port][/path][?query][#fragment]".
    let (_, after_scheme) = url.split_once("//")?;

    // The authority ends at the first path, query or fragment delimiter.
    let authority_end = after_scheme.find(|c: char| matches!(c, '/' | '?' | '#'))?;
    let rest = &after_scheme[authority_end..];

    if rest.starts_with('/') {
        Some(rest)
    } else {
        None
    }
}
696
/// Cut a path-ish string at its first `?` or `#`, whichever comes first.
///
/// The input is returned unchanged when it contains neither.
///
/// Example:
/// - "/a/b.js?x=1#y" -> "/a/b.js"
#[inline]
fn strip_query_fragment(s: &str) -> &str {
    match s.find(|c: char| c == '?' || c == '#') {
        Some(cut) => &s[..cut],
        None => s,
    }
}
713
714    /// Determine if the request should be skipped.
715    #[inline]
716    fn skip_xhr(
717        &self,
718        skip_networking: bool,
719        event: &EventRequestPaused,
720        network_event: bool,
721    ) -> bool {
722        // XHR check
723        if !skip_networking && network_event {
724            let request_url = event.request.url.as_str();
725
726            // check if part of ignore scripts.
727            let skip_analytics =
728                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
729
730            if skip_analytics {
731                true
732            } else if self.block_stylesheets || self.ignore_visuals {
733                let block_css = self.block_stylesheets;
734                let block_media = self.ignore_visuals;
735
736                let mut block_request = false;
737
738                if let Some(position) = request_url.rfind('.') {
739                    let hlen = request_url.len();
740                    let has_asset = hlen - position;
741
742                    if has_asset >= 3 {
743                        let next_position = position + 1;
744
745                        if block_media
746                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
747                                &request_url[next_position..].into(),
748                            )
749                        {
750                            block_request = true;
751                        } else if block_css {
752                            block_request =
753                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
754                                    .contains(&**CSS_EXTENSION)
755                        }
756                    }
757                }
758
759                if !block_request {
760                    block_request = ignore_script_xhr_media(request_url);
761                }
762
763                block_request
764            } else {
765                skip_networking
766            }
767        } else {
768            skip_networking
769        }
770    }
771
772    #[cfg(feature = "adblock")]
773    #[inline]
774    /// Detect if ad enabled.
775    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
776        if skip_networking {
777            true
778        } else {
779            block_ads(&event.request.url) || self.detect_ad(event)
780        }
781    }
782
783    /// When adblock feature is disabled, this is a no-op.
784    #[cfg(not(feature = "adblock"))]
785    #[inline]
786    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
787        use crate::handler::blockers::block_websites::block_ads;
788        if skip_networking {
789            true
790        } else {
791            block_ads(&event.request.url)
792        }
793    }
794
795    #[inline]
796    /// Fail request
797    fn fail_request_blocked(
798        &mut self,
799        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
800    ) {
801        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
802            request_id.clone(),
803            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
804        );
805        self.push_cdp_request(params);
806    }
807
808    #[inline]
809    /// Fulfill request
810    fn fulfill_request_empty_200(
811        &mut self,
812        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
813    ) {
814        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
815            request_id.clone(),
816            200,
817        );
818        self.push_cdp_request(params);
819    }
820
821    #[cfg(feature = "_cache")]
822    #[inline]
823    /// Fulfill a paused Fetch request from cached bytes + header map.
824    ///
825    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
826    fn fulfill_request_from_cache(
827        &mut self,
828        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
829        body: &[u8],
830        headers: &std::collections::HashMap<String, String>,
831        status: i64,
832    ) {
833        use crate::cdp::browser_protocol::fetch::HeaderEntry;
834        use crate::handler::network::fetch::FulfillRequestParams;
835        use base64::Engine;
836
837        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
838
839        for (k, v) in headers.iter() {
840            resp_headers.push(HeaderEntry {
841                name: k.clone().into(),
842                value: v.clone().into(),
843            });
844        }
845
846        let mut params = FulfillRequestParams::new(request_id.clone(), status);
847
848        // TODO: have this already encoded prior.
849        params.body = Some(
850            base64::engine::general_purpose::STANDARD
851                .encode(body)
852                .into(),
853        );
854
855        params.response_headers = Some(resp_headers);
856
857        self.push_cdp_request(params);
858    }
859
860    #[inline]
861    /// Continue the request url.
862    fn continue_request_with_url(
863        &mut self,
864        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
865        url: Option<&str>,
866        intercept_response: bool,
867    ) {
868        let mut params = ContinueRequestParams::new(request_id.clone());
869        if let Some(url) = url {
870            params.url = Some(url.to_string());
871            params.intercept_response = Some(intercept_response);
872        }
873        self.push_cdp_request(params);
874    }
875
876    /// On fetch request paused interception.
877    #[inline]
878    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
879        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
880            return;
881        }
882
883        let resource_type = &event.resource_type;
884
885        if self.block_all {
886            tracing::debug!(
887                "Blocked (block_all): {:?} - {}",
888                event.resource_type,
889                event.request.url
890            );
891            return self.fail_request_blocked(&event.request_id);
892        }
893
894        if let Some(network_id) = event.network_id.as_ref() {
895            if let Some(request_will_be_sent) =
896                self.requests_will_be_sent.remove(network_id.as_ref())
897            {
898                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
899            } else {
900                self.request_id_to_interception_id
901                    .insert(network_id.clone(), event.request_id.clone().into());
902            }
903        }
904
905        // From here on, we handle the full decision tree.
906        let javascript_resource = *resource_type == ResourceType::Script;
907        let document_resource = *resource_type == ResourceType::Document;
908        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
909
910        // Start with static / cheap skip checks.
911        let mut skip_networking =
912            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
913
914        // Also short-circuit if we've reloaded this document too many times.
915        if !skip_networking {
916            skip_networking = self.document_reload_tracker >= 3;
917        }
918
919        // Handle document redirect / masking and track xml documents.
920        let (current_url_cow, had_replacer) =
921            self.handle_document_replacement_and_tracking(event, document_resource);
922
923        let current_url: &str = current_url_cow.as_ref();
924
925        let blacklisted = self.is_blacklisted(current_url);
926
927        if !self.blacklist_strict && blacklisted {
928            skip_networking = true;
929        }
930
931        if !skip_networking {
932            // Allow XSL for sitemap XML.
933            if self.xml_document && current_url.ends_with(".xsl") {
934                skip_networking = false;
935            } else {
936                skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
937            }
938        }
939
940        skip_networking = self.detect_ad_if_enabled(event, skip_networking);
941
942        // Ignore embedded scripts when only_html or ignore_visuals is set.
943        if !skip_networking
944            && self.block_javascript
945            && (self.only_html || self.ignore_visuals)
946            && (javascript_resource || document_resource)
947        {
948            skip_networking = ignore_script_embedded(current_url);
949        }
950
951        // Script policy: allow-by-default.
952        // Block only if explicit block list patterns match.
953        if !skip_networking && javascript_resource {
954            skip_networking = self.should_block_script_blocklist_only(current_url);
955        }
956
957        // XHR / data resources.
958        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
959
960        // Custom interception layer.
961        if !skip_networking && (javascript_resource || network_resource || document_resource) {
962            skip_networking = self.intercept_manager.intercept_detection(
963                current_url,
964                self.ignore_visuals,
965                network_resource,
966            );
967        }
968
969        // Custom website block list.
970        if !skip_networking && (javascript_resource || network_resource) {
971            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
972        }
973
974        // whitelist 3rd party
975        // not required unless explicit blocking.
976        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
977        {
978            skip_networking = false;
979        }
980
981        // check if the url is in the whitelist.
982        if skip_networking && self.is_whitelisted(current_url) {
983            skip_networking = false;
984        }
985
986        if self.blacklist_strict && blacklisted {
987            skip_networking = true;
988        }
989
990        if skip_networking {
991            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
992            self.fulfill_request_empty_200(&event.request_id);
993        } else {
994            #[cfg(feature = "_cache")]
995            {
996                if let (Some(policy), Some(cache_site_key)) =
997                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
998                {
999                    let current_url = format!("{}:{}", event.request.method, &current_url);
1000
1001                    if let Some((res, cache_policy)) =
1002                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
1003                    {
1004                        if policy.allows_cached(&cache_policy) {
1005                            tracing::debug!(
1006                                "Remote Cached: {:?} - {}",
1007                                resource_type,
1008                                &current_url
1009                            );
1010                            return self.fulfill_request_from_cache(
1011                                &event.request_id,
1012                                &res.body,
1013                                &res.headers,
1014                                res.status as i64,
1015                            );
1016                        }
1017                    }
1018                }
1019            }
1020
1021            // check our frame cache for the run.
1022            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
1023            self.continue_request_with_url(
1024                &event.request_id,
1025                if had_replacer {
1026                    Some(current_url)
1027                } else {
1028                    None
1029                },
1030                !had_replacer,
1031            );
1032        }
1033    }
1034
1035    /// Shared "visuals + basic blocking" logic.
1036    ///
1037    /// IMPORTANT: Scripts are NOT blocked here anymore.
1038    /// Scripts are allowed by default and only blocked via explicit blocklists
1039    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1040    #[inline]
1041    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1042        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1043            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1044    }
1045
1046    /// Does the network manager have a target domain?
1047    pub fn has_target_domain(&self) -> bool {
1048        !self.document_target_url.is_empty()
1049    }
1050
1051    /// Set the target page url for tracking.
1052    pub fn set_page_url(&mut self, page_target_url: String) {
1053        let host_base = host_and_rest(&page_target_url)
1054            .map(|(h, _)| base_domain_from_host(h))
1055            .unwrap_or("");
1056
1057        self.document_target_domain = host_base.to_string();
1058        self.document_target_url = page_target_url;
1059    }
1060
1061    /// Clear the initial target domain on every navigation.
1062    pub fn clear_target_domain(&mut self) {
1063        self.document_reload_tracker = 0;
1064        self.document_target_url = Default::default();
1065        self.document_target_domain = Default::default();
1066    }
1067
1068    /// Handles:
1069    /// - document reload tracking (`document_reload_tracker`)
1070    /// - redirect masking / replacement
1071    /// - xml document detection (`xml_document`)
1072    /// - `document_target_url` updates
1073    ///
1074    /// Returns (current_url, had_replacer).
1075    #[inline]
1076    fn handle_document_replacement_and_tracking<'a>(
1077        &mut self,
1078        event: &'a EventRequestPaused,
1079        document_resource: bool,
1080    ) -> (Cow<'a, str>, bool) {
1081        let mut replacer: Option<String> = None;
1082        let current_url = event.request.url.as_str();
1083
1084        if document_resource {
1085            if self.document_target_url == current_url {
1086                self.document_reload_tracker += 1;
1087            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1088            {
1089                let (http_document_replacement, mut https_document_replacement) =
1090                    if self.document_target_url.starts_with("http://") {
1091                        (
1092                            self.document_target_url.replacen("http://", "http//", 1),
1093                            self.document_target_url.replacen("http://", "https://", 1),
1094                        )
1095                    } else {
1096                        (
1097                            self.document_target_url.replacen("https://", "https//", 1),
1098                            self.document_target_url.replacen("https://", "http://", 1),
1099                        )
1100                    };
1101
1102                // Track trailing slash to restore later.
1103                let trailing = https_document_replacement.ends_with('/');
1104                if trailing {
1105                    https_document_replacement.pop();
1106                }
1107                if https_document_replacement.ends_with('/') {
1108                    https_document_replacement.pop();
1109                }
1110
1111                let redirect_mask = format!(
1112                    "{}{}",
1113                    https_document_replacement, http_document_replacement
1114                );
1115
1116                if current_url == redirect_mask {
1117                    replacer = Some(if trailing {
1118                        format!("{}/", https_document_replacement)
1119                    } else {
1120                        https_document_replacement
1121                    });
1122                }
1123            }
1124
1125            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1126                self.xml_document = true;
1127            }
1128
1129            // Track last seen document URL.
1130            self.document_target_url = event.request.url.clone();
1131            self.document_target_domain = host_and_rest(&self.document_target_url)
1132                .map(|(h, _)| base_domain_from_host(h).to_string())
1133                .unwrap_or_default();
1134        }
1135
1136        let current_url_cow = match replacer {
1137            Some(r) => Cow::Owned(r),
1138            None => Cow::Borrowed(event.request.url.as_str()),
1139        };
1140
1141        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1142        (current_url_cow, had_replacer)
1143    }
1144
1145    /// Perform a page intercept for chrome
1146    #[cfg(feature = "adblock")]
1147    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1148        use adblock::{
1149            lists::{FilterSet, ParseOptions, RuleTypes},
1150            Engine,
1151        };
1152
1153        lazy_static::lazy_static! {
1154            static ref AD_ENGINE: Engine = {
1155                let mut filter_set = FilterSet::new(false);
1156                let mut rules = ParseOptions::default();
1157                rules.rule_types = RuleTypes::All;
1158
1159                filter_set.add_filters(
1160                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1161                    rules,
1162                );
1163
1164                Engine::from_filter_set(filter_set, true)
1165            };
1166        };
1167
1168        let blockable = ResourceType::Image == event.resource_type
1169            || event.resource_type == ResourceType::Media
1170            || event.resource_type == ResourceType::Stylesheet
1171            || event.resource_type == ResourceType::Document
1172            || event.resource_type == ResourceType::Fetch
1173            || event.resource_type == ResourceType::Xhr;
1174
1175        let u = &event.request.url;
1176
1177        let block_request = blockable
1178            // set it to example.com for 3rd party handling is_same_site
1179        && {
1180            let request = adblock::request::Request::preparsed(
1181                 &u,
1182                 "example.com",
1183                 "example.com",
1184                 &event.resource_type.as_ref().to_lowercase(),
1185                 !event.request.is_same_site.unwrap_or_default());
1186
1187            AD_ENGINE.check_network_request(&request).matched
1188        };
1189
1190        block_request
1191    }
1192
1193    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1194        let response = if self
1195            .attempted_authentications
1196            .contains(event.request_id.as_ref())
1197        {
1198            AuthChallengeResponseResponse::CancelAuth
1199        } else if self.credentials.is_some() {
1200            self.attempted_authentications
1201                .insert(event.request_id.clone().into());
1202            AuthChallengeResponseResponse::ProvideCredentials
1203        } else {
1204            AuthChallengeResponseResponse::Default
1205        };
1206
1207        let mut auth = AuthChallengeResponse::new(response);
1208        if let Some(creds) = self.credentials.clone() {
1209            auth.username = Some(creds.username);
1210            auth.password = Some(creds.password);
1211        }
1212        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1213    }
1214
1215    /// Set the page offline network emulation condition.
1216    pub fn set_offline_mode(&mut self, value: bool) {
1217        if self.offline == value {
1218            return;
1219        }
1220        self.offline = value;
1221        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1222            .offline(self.offline)
1223            .latency(0)
1224            .download_throughput(-1.)
1225            .upload_throughput(-1.)
1226            .build()
1227        {
1228            self.push_cdp_request(network);
1229        }
1230    }
1231
1232    /// Request interception doesn't happen for data URLs with Network Service.
1233    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1234        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1235            if let Some(interception_id) = self
1236                .request_id_to_interception_id
1237                .remove(event.request_id.as_ref())
1238            {
1239                self.on_request(event, Some(interception_id));
1240            } else {
1241                // TODO remove the clone for event
1242                self.requests_will_be_sent
1243                    .insert(event.request_id.clone(), event.clone());
1244            }
1245        } else {
1246            self.on_request(event, None);
1247        }
1248    }
1249
1250    /// The request was served from the cache.
1251    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1252        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1253            request.from_memory_cache = true;
1254        }
1255    }
1256
1257    /// On network response received.
1258    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1259        let mut request_failed = false;
1260
1261        // Track how many bytes we actually deducted from this target.
1262        let mut deducted: u64 = 0;
1263
1264        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1265            let before = *max_bytes;
1266
1267            // encoded_data_length -> saturating cast to u64
1268            let received_bytes: u64 = event.response.encoded_data_length as u64;
1269
1270            // Safe parse of Content-Length
1271            let content_length: Option<u64> = event
1272                .response
1273                .headers
1274                .inner()
1275                .get("content-length")
1276                .and_then(|v| v.as_str())
1277                .and_then(|s| s.trim().parse::<u64>().ok());
1278
1279            // Deduct what we actually received
1280            *max_bytes = max_bytes.saturating_sub(received_bytes);
1281
1282            // If the declared size can't fit, zero out now
1283            if let Some(cl) = content_length {
1284                if cl > *max_bytes {
1285                    *max_bytes = 0;
1286                }
1287            }
1288
1289            request_failed = *max_bytes == 0;
1290
1291            // Compute exact delta deducted on this event
1292            deducted = before.saturating_sub(*max_bytes);
1293        }
1294
1295        // Bubble up the deduction (even if request continues)
1296        if deducted > 0 {
1297            self.queued_events
1298                .push_back(NetworkEvent::BytesConsumed(deducted));
1299        }
1300
1301        // block all network request moving forward.
1302        if request_failed && self.max_bytes_allowed.is_some() {
1303            self.set_block_all(true);
1304        }
1305
1306        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1307            request.set_response(event.response.clone());
1308            self.queued_events.push_back(if request_failed {
1309                NetworkEvent::RequestFailed(request)
1310            } else {
1311                NetworkEvent::RequestFinished(request)
1312            });
1313        }
1314    }
1315
1316    /// On network loading finished.
1317    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1318        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1319            if let Some(interception_id) = request.interception_id.as_ref() {
1320                self.attempted_authentications
1321                    .remove(interception_id.as_ref());
1322            }
1323            self.queued_events
1324                .push_back(NetworkEvent::RequestFinished(request));
1325        }
1326    }
1327
1328    /// On network loading failed.
1329    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1330        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1331            request.failure_text = Some(event.error_text.clone());
1332            if let Some(interception_id) = request.interception_id.as_ref() {
1333                self.attempted_authentications
1334                    .remove(interception_id.as_ref());
1335            }
1336            self.queued_events
1337                .push_back(NetworkEvent::RequestFailed(request));
1338        }
1339    }
1340
1341    /// On request will be sent.
1342    fn on_request(
1343        &mut self,
1344        event: &EventRequestWillBeSent,
1345        interception_id: Option<InterceptionId>,
1346    ) {
1347        let mut redirect_chain = Vec::new();
1348        let mut redirect_location = None;
1349
1350        if let Some(redirect_resp) = &event.redirect_response {
1351            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1352                if is_redirect_status(redirect_resp.status) {
1353                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1354                        if redirect_resp.url != location {
1355                            let fixed_location = location.replace(&redirect_resp.url, "");
1356
1357                            if !fixed_location.is_empty() {
1358                                request.response.as_mut().map(|resp| {
1359                                    resp.headers.0["Location"] =
1360                                        serde_json::Value::String(fixed_location.clone());
1361                                });
1362                            }
1363
1364                            redirect_location = Some(fixed_location);
1365                        }
1366                    }
1367                }
1368
1369                self.handle_request_redirect(
1370                    &mut request,
1371                    if let Some(redirect_location) = redirect_location {
1372                        let mut redirect_resp = redirect_resp.clone();
1373
1374                        if !redirect_location.is_empty() {
1375                            redirect_resp.headers.0["Location"] =
1376                                serde_json::Value::String(redirect_location);
1377                        }
1378
1379                        redirect_resp
1380                    } else {
1381                        redirect_resp.clone()
1382                    },
1383                );
1384
1385                redirect_chain = std::mem::take(&mut request.redirect_chain);
1386                redirect_chain.push(request);
1387            }
1388        }
1389
1390        let request = HttpRequest::new(
1391            event.request_id.clone(),
1392            event.frame_id.clone(),
1393            interception_id,
1394            self.user_request_interception_enabled,
1395            redirect_chain,
1396        );
1397
1398        self.requests.insert(event.request_id.clone(), request);
1399        self.queued_events
1400            .push_back(NetworkEvent::Request(event.request_id.clone()));
1401    }
1402
1403    /// Handle request redirect.
1404    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1405        request.set_response(response);
1406        if let Some(interception_id) = request.interception_id.as_ref() {
1407            self.attempted_authentications
1408                .remove(interception_id.as_ref());
1409        }
1410    }
1411}
1412
1413#[derive(Debug)]
1414pub enum NetworkEvent {
1415    /// Send a CDP request.
1416    SendCdpRequest((MethodId, serde_json::Value)),
1417    /// Request.
1418    Request(RequestId),
1419    /// Response
1420    Response(RequestId),
1421    /// Request failed.
1422    RequestFailed(HttpRequest),
1423    /// Request finished.
1424    RequestFinished(HttpRequest),
1425    /// Bytes consumed.
1426    BytesConsumed(u64),
1427}
1428
/// Unit tests for the allow-list matcher, the script block-list policy and the
/// dynamic blacklist / whitelist matchers.
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // A random script that should not match your block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        // Same URL in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        // In strict mode, it should still be considered blocked at decision time.
        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
        assert!(nm.blacklist_strict);
    }

    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs