chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18    SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21    fetch::{
22        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24    },
25    network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
42lazy_static! {
    /// Substring patterns for popular libraries, common bundle names and a set of
    /// verified third-party URL prefixes.
    ///
    /// These are the patterns compiled into `ALLOWED_MATCHER`. Matching is by
    /// substring, so short entries such as `"app"` or `"main"` match very broadly.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",  // Covers WordPress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/enterprise.js",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];
86
    /// Aho-Corasick matcher built from `JS_FRAMEWORK_ALLOW`, used to decide by
    /// name whether a script should be rendered in the browser.
    ///
    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
    /// but we keep it for compatibility and other call sites.
    ///
    /// Panics on first access if the automaton cannot be built.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
92
93    /// General patterns for popular libraries and resources
94    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
95        // Verified 3rd parties for request
96        "https://m.stripe.network/",
97        "https://challenges.cloudflare.com/",
98        "https://www.google.com/recaptcha/api.js",
99        "https://google.com/recaptcha/api.js",
100        "https://www.google.com/recaptcha/enterprise.js",
101        "https://js.stripe.com/",
102        "https://cdn.prod.website-files.com/", // webflow cdn scripts
103        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
104        "https://code.jquery.com/jquery-",
105        "https://ct.captcha-delivery.com/",
106        "https://geo.captcha-delivery.com/",
107        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
108        "https://ct.captcha-delivery.com/",
109        "https://cdn.auth0.com/client",
110        "https://captcha.px-cloud.net/",
111        "https://www.gstatic.com/recaptcha/",
112        "https://www.google.com/recaptcha/api2/",
113        "https://www.recaptcha.net/recaptcha/",
114        "https://js.hcaptcha.com/1/api.js",
115        "https://hcaptcha.com/1/api.js",
116        "https://js.datadome.co/tags.js",
117        "https://api-js.datadome.co/",
118        "https://client.perimeterx.net/",
119        "https://captcha.px-cdn.net/",
120        "https://newassets.hcaptcha.com/",
121        "https://captcha.px-cloud.net/",
122        "https://s.perimeterx.net/",
123        "https://client-api.arkoselabs.com/v2/",
124        "https://static.geetest.com/v4/gt4.js",
125        "https://static.geetest.com/",
126        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
127        "https://cdn.perfdrive.com/aperture/",
128        "https://assets.queue-it.net/",
129        "discourse-cdn.com/",
130        "hcaptcha.com",
131        "/cdn-cgi/challenge-platform/",
132        "/_Incapsula_Resource"
133    ];
134
    /// Aho-Corasick matcher built from `JS_FRAMEWORK_ALLOW_3RD_PARTY`, used to
    /// decide by name whether a third-party script should be rendered in the browser.
    ///
    /// Panics on first access if the automaton cannot be built.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
137
    /// Path fragments that identify JS framework build output.
    ///
    /// NOTE(review): `_astro/` and `_app/immutable` look like the Astro and
    /// SvelteKit asset directories respectively — confirm against the call sites.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Allowed framework asset paths.
            // NOTE(review): the original comment referenced `JS_FRAMEWORK_ASSETS`,
            // which is not defined in the visible part of this file.
            "_astro/", "_app/immutable"
        }
    };
145
    /// MIME content types to ignore: documents, archives, images, video, audio
    /// and other binary formats.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };
174
    /// Resource type names treated as purely visual content (images, media, fonts).
    ///
    /// NOTE(review): the entries look like CDP `ResourceType` names — confirm
    /// against the code that performs the lookup.
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };
181
    /// Resource type names treated as background networking noise (CSP reports,
    /// manifests, prefetches, pings and uncategorised requests).
    ///
    /// NOTE(review): the entries look like CDP `ResourceType` names — confirm
    /// against the code that performs the lookup.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Manifest",
        "Other",
        "Prefetch",
        "Ping",
    };
190
    /// The `css` file extension as a case-insensitive string, used for
    /// case-insensitive stylesheet matching.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
193
194    /// The command chain.
195    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
196        let enable = EnableParams::default();
197
198        if let Ok(c) = serde_json::to_value(&enable) {
199            vec![(enable.identifier(), c)]
200        } else {
201            vec![]
202        }
203    };
204
205    /// The command chain with https ignore.
206    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
207        let enable = EnableParams::default();
208        let mut v = vec![];
209        if let Ok(c) = serde_json::to_value(&enable) {
210            v.push((enable.identifier(), c));
211        }
212        let ignore = SetIgnoreCertificateErrorsParams::new(true);
213        if let Ok(ignored) = serde_json::to_value(&ignore) {
214            v.push((ignore.identifier(), ignored));
215        }
216
217        v
218    };
219
    /// Parameters for `Fetch.enable`: intercept every URL (`*`) at the request
    /// stage and have auth challenges routed through the Fetch domain.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
        .build()
    };
227}
228
/// Returns `true` when `status` is an HTTP redirect code handled by the
/// manager: 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
233
#[derive(Debug)]
/// The base network manager.
///
/// Tracks in-flight requests, queues CDP commands/events for the handler to
/// poll, and holds the blocking configuration (analytics, styles, visuals,
/// whitelist/blacklist) consulted when requests are intercepted.
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    // TODO put event in an Arc?
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; `update_protocol_request_interception()` reconciles
    /// this flag with `credentials` and queues `Fetch.enable` / `Fetch.disable` as needed.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is considered enabled in CDP.
    ///
    /// This is the "actual state" flag. `update_protocol_request_interception()` only *reads*
    /// it to decide whether a `Fetch.enable` / `Fetch.disable` command must be queued; the flag
    /// itself is written by `enable_request_intercept()`, `disable_request_intercept()` and
    /// `authenticate()`.
    pub(crate) protocol_request_interception_enabled: bool,
    /// The network is offline.
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    // made_request: bool,
    /// Ignore visuals (no pings, prefetching, and etc).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics requests. Defaults to `true` in `new()`.
    pub block_analytics: bool,
    /// Only allow HTML to load.
    // NOTE(review): presumably blocks every non-document resource — confirm in the request handler.
    pub only_html: bool,
    /// Is xml document?
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the amount of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    #[cfg(feature = "_cache")]
    /// The cache site_key to use.
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
    whitelist_matcher: Option<AhoCorasick>,
    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
    blacklist_patterns: Vec<String>,
    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
    blacklist_matcher: Option<AhoCorasick>,
    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
    blacklist_strict: bool,
}
356
357impl NetworkManager {
358    /// A new network manager.
359    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
360        Self {
361            queued_events: Default::default(),
362            ignore_httpserrors,
363            requests: Default::default(),
364            requests_will_be_sent: Default::default(),
365            extra_headers: Default::default(),
366            request_id_to_interception_id: Default::default(),
367            user_cache_disabled: false,
368            attempted_authentications: Default::default(),
369            credentials: None,
370            block_all: false,
371            user_request_interception_enabled: false,
372            protocol_request_interception_enabled: false,
373            offline: false,
374            request_timeout,
375            ignore_visuals: false,
376            block_javascript: false,
377            block_stylesheets: false,
378            block_analytics: true,
379            only_html: false,
380            xml_document: false,
381            intercept_manager: NetworkInterceptManager::Unknown,
382            document_reload_tracker: 0,
383            document_target_url: String::new(),
384            document_target_domain: String::new(),
385            whitelist_patterns: Vec::new(),
386            whitelist_matcher: None,
387            blacklist_patterns: Vec::new(),
388            blacklist_matcher: None,
389            blacklist_strict: true,
390            max_bytes_allowed: None,
391            #[cfg(feature = "_cache")]
392            cache_site_key: None,
393            #[cfg(feature = "_cache")]
394            cache_policy: None,
395        }
396    }
397
398    /// Replace the whitelist patterns (compiled once).
399    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
400    where
401        I: IntoIterator<Item = S>,
402        S: Into<String>,
403    {
404        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
405        self.rebuild_whitelist_matcher();
406    }
407
408    /// Replace the blacklist patterns (compiled once).
409    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
410    where
411        I: IntoIterator<Item = S>,
412        S: Into<String>,
413    {
414        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
415        self.rebuild_blacklist_matcher();
416    }
417
418    /// Add one pattern (cheap) and rebuild (call this sparingly).
419    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
420        self.blacklist_patterns.push(pattern.into());
421        self.rebuild_blacklist_matcher();
422    }
423
424    /// Add many patterns and rebuild once.
425    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
426    where
427        I: IntoIterator<Item = S>,
428        S: Into<String>,
429    {
430        self.blacklist_patterns
431            .extend(patterns.into_iter().map(Into::into));
432        self.rebuild_blacklist_matcher();
433    }
434
435    /// Clear blacklist entirely.
436    pub fn clear_blacklist(&mut self) {
437        self.blacklist_patterns.clear();
438        self.blacklist_matcher = None;
439    }
440
    /// Control precedence: when `strict` is true, the blacklist always wins.
    ///
    /// Only stores the flag; it is consulted elsewhere when a request is evaluated.
    pub fn set_blacklist_strict(&mut self, strict: bool) {
        self.blacklist_strict = strict;
    }
445
446    #[inline]
447    fn rebuild_blacklist_matcher(&mut self) {
448        if self.blacklist_patterns.is_empty() {
449            self.blacklist_matcher = None;
450            return;
451        }
452
453        let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
454        self.blacklist_matcher = AhoCorasick::new(refs).ok();
455    }
456
457    #[inline]
458    fn is_blacklisted(&self, url: &str) -> bool {
459        self.blacklist_matcher
460            .as_ref()
461            .map(|m| m.is_match(url))
462            .unwrap_or(false)
463    }
464
465    /// Add one pattern (cheap) and rebuild (call this sparingly).
466    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
467        self.whitelist_patterns.push(pattern.into());
468        self.rebuild_whitelist_matcher();
469    }
470
471    /// Add many patterns and rebuild once.
472    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
473    where
474        I: IntoIterator<Item = S>,
475        S: Into<String>,
476    {
477        self.whitelist_patterns
478            .extend(patterns.into_iter().map(Into::into));
479        self.rebuild_whitelist_matcher();
480    }
481
482    #[inline]
483    fn rebuild_whitelist_matcher(&mut self) {
484        if self.whitelist_patterns.is_empty() {
485            self.whitelist_matcher = None;
486            return;
487        }
488
489        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
490
491        // If building fails (shouldn’t for simple patterns), just disable matcher.
492        self.whitelist_matcher = AhoCorasick::new(refs).ok();
493    }
494
495    #[inline]
496    fn is_whitelisted(&self, url: &str) -> bool {
497        self.whitelist_matcher
498            .as_ref()
499            .map(|m| m.is_match(url))
500            .unwrap_or(false)
501    }
502
503    /// Commands to init the chain with.
504    pub fn init_commands(&self) -> CommandChain {
505        let cmds = if self.ignore_httpserrors {
506            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
507        } else {
508            INIT_CHAIN.clone()
509        };
510        CommandChain::new(cmds, self.request_timeout)
511    }
512
513    /// Push the CDP request.
514    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
515        let method = cmd.identifier();
516        if let Ok(params) = serde_json::to_value(cmd) {
517            self.queued_events
518                .push_back(NetworkEvent::SendCdpRequest((method, params)));
519        }
520    }
521
    /// The next event to handle.
    ///
    /// Pops the oldest queued event, or returns `None` when the queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
526
    /// Get the extra headers currently stored on the manager (as last set via
    /// `set_extra_headers`, i.e. without proxy authorization headers).
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
531
532    /// Set extra HTTP headers.
533    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
534        self.extra_headers = headers;
535        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
536        self.extra_headers.remove("Proxy-Authorization");
537        if !self.extra_headers.is_empty() {
538            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
539                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
540            }
541        }
542    }
543
    /// Queues `Network.setBypassServiceWorker` with the given flag.
    ///
    /// Note the polarity: the argument is forwarded as the *bypass* flag, so
    /// passing `true` asks the browser to bypass service workers.
    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
    }
547
    /// Sets the `block_all` kill-switch flag. No CDP command is queued here.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
551
    /// Records whether the user wants request interception and reconciles the
    /// protocol state, which may queue `Fetch.enable` or `Fetch.disable`.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        self.update_protocol_request_interception();
    }
556
557    pub fn set_cache_enabled(&mut self, enabled: bool) {
558        let run = self.user_cache_disabled != !enabled;
559        self.user_cache_disabled = !enabled;
560        if run {
561            self.update_protocol_cache_disabled();
562        }
563    }
564
    /// Marks fetch interception as enabled.
    ///
    /// Only flips the local `protocol_request_interception_enabled` flag; no
    /// CDP command is queued by this method.
    pub fn enable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
569
    /// Marks fetch interception as disabled.
    ///
    /// Only flips the local `protocol_request_interception_enabled` flag; no
    /// CDP command is queued by this method.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = false;
    }
574
    /// Set the cache site key (`None` clears it). Available with the `_cache` feature.
    #[cfg(feature = "_cache")]
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
580
    /// Set the cache policy (`None` clears it). Available with the `_cache` feature.
    #[cfg(feature = "_cache")]
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
586
    /// Queues `Network.setCacheDisabled` carrying the current `user_cache_disabled` value.
    pub fn update_protocol_cache_disabled(&mut self) {
        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
    }
590
    /// Stores credentials for answering auth challenges and makes sure fetch
    /// interception is on.
    ///
    /// Statement order matters: the reconciliation step must run while the
    /// protocol flag still holds its previous value so that `Fetch.enable` is
    /// queued when interception was off; only afterwards is the flag set to `true`.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        // Queues `Fetch.enable` if interception is not already marked as enabled.
        self.update_protocol_request_interception();
        self.protocol_request_interception_enabled = true;
    }
596
597    fn update_protocol_request_interception(&mut self) {
598        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
599
600        if enabled == self.protocol_request_interception_enabled {
601            return;
602        }
603
604        if enabled {
605            self.push_cdp_request(ENABLE_FETCH.clone())
606        } else {
607            self.push_cdp_request(DisableParams::default())
608        }
609    }
610
611    /// Blocklist-only script blocking.
612    /// Returns true only when the URL matches an explicit blocklist condition.
613    #[inline]
614    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
615        // If analytics blocking is off, skip all analytics tries.
616        let block_analytics = self.block_analytics;
617
618        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
619        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
620        {
621            return true;
622        }
623
624        // 2) Custom website block list (explicit).
625        if crate::handler::blockers::block_websites::block_website(url) {
626            return true;
627        }
628
629        // 3) Path-based explicit tries / fallbacks.
630        //
631        // We run these on:
632        // - path with leading slash ("/js/app.js")
633        // - path without leading slash ("js/app.js")
634        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
635        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
636            // Remove query/fragment so matching stays stable.
637            let p_slash = Self::strip_query_fragment(path_with_slash);
638            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
639
640            // Basename for filename-only lists.
641            let base = match p_slash.rsplit('/').next() {
642                Some(b) => b,
643                None => p_slash,
644            };
645
646            // ---- Trie checks ----
647            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
648            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
649                return true;
650            }
651            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
652                return true;
653            }
654            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
655                return true;
656            }
657
658            // Base-path ignore tries (framework noise / known ignorable script paths).
659            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
660            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
661                return true;
662            }
663
664            // Style path ignores only when visuals are ignored.
665            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
666                return true;
667            }
668        }
669
670        false
671    }
672
673    /// Extract the absolute URL path portion WITH the leading slash.
674    ///
675    /// Example:
676    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
677    #[inline]
678    fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
679        // find scheme separator
680        let idx = url.find("//")?;
681        let after_slashes = idx + 2;
682
683        // find first slash after host
684        let slash_rel = url[after_slashes..].find('/')?;
685        let slash_idx = after_slashes + slash_rel;
686
687        if slash_idx < url.len() {
688            Some(&url[slash_idx..])
689        } else {
690            None
691        }
692    }
693
694    /// Strip query string and fragment from a path-ish string.
695    ///
696    /// Example:
697    /// - "/a/b.js?x=1#y" -> "/a/b.js"
698    #[inline]
699    fn strip_query_fragment(s: &str) -> &str {
700        let q = s.find('?');
701        let h = s.find('#');
702
703        match (q, h) {
704            (None, None) => s,
705            (Some(i), None) => &s[..i],
706            (None, Some(i)) => &s[..i],
707            (Some(i), Some(j)) => &s[..i.min(j)],
708        }
709    }
710
711    /// Determine if the request should be skipped.
712    #[inline]
713    fn skip_xhr(
714        &self,
715        skip_networking: bool,
716        event: &EventRequestPaused,
717        network_event: bool,
718    ) -> bool {
719        // XHR check
720        if !skip_networking && network_event {
721            let request_url = event.request.url.as_str();
722
723            // check if part of ignore scripts.
724            let skip_analytics =
725                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
726
727            if skip_analytics {
728                true
729            } else if self.block_stylesheets || self.ignore_visuals {
730                let block_css = self.block_stylesheets;
731                let block_media = self.ignore_visuals;
732
733                let mut block_request = false;
734
735                if let Some(position) = request_url.rfind('.') {
736                    let hlen = request_url.len();
737                    let has_asset = hlen - position;
738
739                    if has_asset >= 3 {
740                        let next_position = position + 1;
741
742                        if block_media
743                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
744                                &request_url[next_position..].into(),
745                            )
746                        {
747                            block_request = true;
748                        } else if block_css {
749                            block_request =
750                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
751                                    .contains(&**CSS_EXTENSION)
752                        }
753                    }
754                }
755
756                if !block_request {
757                    block_request = ignore_script_xhr_media(request_url);
758                }
759
760                block_request
761            } else {
762                skip_networking
763            }
764        } else {
765            skip_networking
766        }
767    }
768
769    #[cfg(feature = "adblock")]
770    #[inline]
771    /// Detect if ad enabled.
772    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
773        if skip_networking {
774            true
775        } else {
776            block_ads(&event.request.url) || self.detect_ad(event)
777        }
778    }
779
780    /// When adblock feature is disabled, this is a no-op.
781    #[cfg(not(feature = "adblock"))]
782    #[inline]
783    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
784        use crate::handler::blockers::block_websites::block_ads;
785        if skip_networking {
786            true
787        } else {
788            block_ads(&event.request.url)
789        }
790    }
791
792    #[inline]
793    /// Fail request
794    fn fail_request_blocked(
795        &mut self,
796        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
797    ) {
798        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
799            request_id.clone(),
800            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
801        );
802        self.push_cdp_request(params);
803    }
804
805    #[inline]
806    /// Fulfill request
807    fn fulfill_request_empty_200(
808        &mut self,
809        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
810    ) {
811        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
812            request_id.clone(),
813            200,
814        );
815        self.push_cdp_request(params);
816    }
817
818    #[cfg(feature = "_cache")]
819    #[inline]
820    /// Fulfill a paused Fetch request from cached bytes + header map.
821    ///
822    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
823    fn fulfill_request_from_cache(
824        &mut self,
825        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
826        body: &[u8],
827        headers: &std::collections::HashMap<String, String>,
828        status: i64,
829    ) {
830        use crate::cdp::browser_protocol::fetch::HeaderEntry;
831        use crate::handler::network::fetch::FulfillRequestParams;
832        use base64::Engine;
833
834        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
835
836        for (k, v) in headers.iter() {
837            resp_headers.push(HeaderEntry {
838                name: k.clone().into(),
839                value: v.clone().into(),
840            });
841        }
842
843        let mut params = FulfillRequestParams::new(request_id.clone(), status);
844
845        // TODO: have this already encoded prior.
846        params.body = Some(
847            base64::engine::general_purpose::STANDARD
848                .encode(body)
849                .into(),
850        );
851
852        params.response_headers = Some(resp_headers);
853
854        self.push_cdp_request(params);
855    }
856
857    #[inline]
858    /// Continue the request url.
859    fn continue_request_with_url(
860        &mut self,
861        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
862        url: Option<&str>,
863        intercept_response: bool,
864    ) {
865        let mut params = ContinueRequestParams::new(request_id.clone());
866        if let Some(url) = url {
867            params.url = Some(url.to_string());
868            params.intercept_response = Some(intercept_response);
869        }
870        self.push_cdp_request(params);
871    }
872
873    /// On fetch request paused interception.
874    #[inline]
875    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
876        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
877            return;
878        }
879
880        let resource_type = &event.resource_type;
881
882        if self.block_all {
883            tracing::debug!(
884                "Blocked (block_all): {:?} - {}",
885                event.resource_type,
886                event.request.url
887            );
888            return self.fail_request_blocked(&event.request_id);
889        }
890
891        if let Some(network_id) = event.network_id.as_ref() {
892            if let Some(request_will_be_sent) =
893                self.requests_will_be_sent.remove(network_id.as_ref())
894            {
895                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
896            } else {
897                self.request_id_to_interception_id
898                    .insert(network_id.clone(), event.request_id.clone().into());
899            }
900        }
901
902        // From here on, we handle the full decision tree.
903        let javascript_resource = *resource_type == ResourceType::Script;
904        let document_resource = *resource_type == ResourceType::Document;
905        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
906
907        // Start with static / cheap skip checks.
908        let mut skip_networking =
909            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
910
911        // Also short-circuit if we've reloaded this document too many times.
912        if !skip_networking {
913            skip_networking = self.document_reload_tracker >= 3;
914        }
915
916        // Handle document redirect / masking and track xml documents.
917        let (current_url_cow, had_replacer) =
918            self.handle_document_replacement_and_tracking(event, document_resource);
919
920        let current_url: &str = current_url_cow.as_ref();
921
922        let blacklisted = self.is_blacklisted(current_url);
923
924        if !self.blacklist_strict && blacklisted {
925            skip_networking = true;
926        }
927
928        if !skip_networking {
929            // Allow XSL for sitemap XML.
930            if self.xml_document && current_url.ends_with(".xsl") {
931                skip_networking = false;
932            } else {
933                skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
934            }
935        }
936
937        skip_networking = self.detect_ad_if_enabled(event, skip_networking);
938
939        // Ignore embedded scripts when only_html or ignore_visuals is set.
940        if !skip_networking
941            && self.block_javascript
942            && (self.only_html || self.ignore_visuals)
943            && (javascript_resource || document_resource)
944        {
945            skip_networking = ignore_script_embedded(current_url);
946        }
947
948        // Script policy: allow-by-default.
949        // Block only if explicit block list patterns match.
950        if !skip_networking && javascript_resource {
951            skip_networking = self.should_block_script_blocklist_only(current_url);
952        }
953
954        // XHR / data resources.
955        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
956
957        // Custom interception layer.
958        if !skip_networking && (javascript_resource || network_resource || document_resource) {
959            skip_networking = self.intercept_manager.intercept_detection(
960                current_url,
961                self.ignore_visuals,
962                network_resource,
963            );
964        }
965
966        // Custom website block list.
967        if !skip_networking && (javascript_resource || network_resource) {
968            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
969        }
970
971        // whitelist 3rd party
972        // not required unless explicit blocking.
973        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
974        {
975            skip_networking = false;
976        }
977
978        // check if the url is in the whitelist.
979        if skip_networking && self.is_whitelisted(current_url) {
980            skip_networking = false;
981        }
982
983        if self.blacklist_strict && blacklisted {
984            skip_networking = true;
985        }
986
987        if skip_networking {
988            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
989            self.fulfill_request_empty_200(&event.request_id);
990        } else {
991            #[cfg(feature = "_cache")]
992            {
993                if let (Some(policy), Some(cache_site_key)) =
994                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
995                {
996                    let current_url = format!("{}:{}", event.request.method, &current_url);
997
998                    if let Some((res, cache_policy)) =
999                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
1000                    {
1001                        if policy.allows_cached(&cache_policy) {
1002                            tracing::debug!(
1003                                "Remote Cached: {:?} - {}",
1004                                resource_type,
1005                                &current_url
1006                            );
1007                            return self.fulfill_request_from_cache(
1008                                &event.request_id,
1009                                &res.body,
1010                                &res.headers,
1011                                res.status as i64,
1012                            );
1013                        }
1014                    }
1015                }
1016            }
1017
1018            // check our frame cache for the run.
1019            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
1020            self.continue_request_with_url(
1021                &event.request_id,
1022                if had_replacer {
1023                    Some(current_url)
1024                } else {
1025                    None
1026                },
1027                !had_replacer,
1028            );
1029        }
1030    }
1031
1032    /// Shared "visuals + basic blocking" logic.
1033    ///
1034    /// IMPORTANT: Scripts are NOT blocked here anymore.
1035    /// Scripts are allowed by default and only blocked via explicit blocklists
1036    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1037    #[inline]
1038    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1039        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1040            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1041    }
1042
1043    /// Does the network manager have a target domain?
1044    pub fn has_target_domain(&self) -> bool {
1045        !self.document_target_url.is_empty()
1046    }
1047
1048    /// Set the target page url for tracking.
1049    pub fn set_page_url(&mut self, page_target_url: String) {
1050        let host_base = host_and_rest(&page_target_url)
1051            .map(|(h, _)| base_domain_from_host(h))
1052            .unwrap_or("");
1053
1054        self.document_target_domain = host_base.to_string();
1055        self.document_target_url = page_target_url;
1056    }
1057
1058    /// Clear the initial target domain on every navigation.
1059    pub fn clear_target_domain(&mut self) {
1060        self.document_reload_tracker = 0;
1061        self.document_target_url = Default::default();
1062        self.document_target_domain = Default::default();
1063    }
1064
1065    /// Handles:
1066    /// - document reload tracking (`document_reload_tracker`)
1067    /// - redirect masking / replacement
1068    /// - xml document detection (`xml_document`)
1069    /// - `document_target_url` updates
1070    ///
1071    /// Returns (current_url, had_replacer).
1072    #[inline]
1073    fn handle_document_replacement_and_tracking<'a>(
1074        &mut self,
1075        event: &'a EventRequestPaused,
1076        document_resource: bool,
1077    ) -> (Cow<'a, str>, bool) {
1078        let mut replacer: Option<String> = None;
1079        let current_url = event.request.url.as_str();
1080
1081        if document_resource {
1082            if self.document_target_url == current_url {
1083                self.document_reload_tracker += 1;
1084            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1085            {
1086                let (http_document_replacement, mut https_document_replacement) =
1087                    if self.document_target_url.starts_with("http://") {
1088                        (
1089                            self.document_target_url.replacen("http://", "http//", 1),
1090                            self.document_target_url.replacen("http://", "https://", 1),
1091                        )
1092                    } else {
1093                        (
1094                            self.document_target_url.replacen("https://", "https//", 1),
1095                            self.document_target_url.replacen("https://", "http://", 1),
1096                        )
1097                    };
1098
1099                // Track trailing slash to restore later.
1100                let trailing = https_document_replacement.ends_with('/');
1101                if trailing {
1102                    https_document_replacement.pop();
1103                }
1104                if https_document_replacement.ends_with('/') {
1105                    https_document_replacement.pop();
1106                }
1107
1108                let redirect_mask = format!(
1109                    "{}{}",
1110                    https_document_replacement, http_document_replacement
1111                );
1112
1113                if current_url == redirect_mask {
1114                    replacer = Some(if trailing {
1115                        format!("{}/", https_document_replacement)
1116                    } else {
1117                        https_document_replacement
1118                    });
1119                }
1120            }
1121
1122            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1123                self.xml_document = true;
1124            }
1125
1126            // Track last seen document URL.
1127            self.document_target_url = event.request.url.clone();
1128            self.document_target_domain = host_and_rest(&self.document_target_url)
1129                .map(|(h, _)| base_domain_from_host(h).to_string())
1130                .unwrap_or_default();
1131        }
1132
1133        let current_url_cow = match replacer {
1134            Some(r) => Cow::Owned(r),
1135            None => Cow::Borrowed(event.request.url.as_str()),
1136        };
1137
1138        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1139        (current_url_cow, had_replacer)
1140    }
1141
1142    /// Perform a page intercept for chrome
1143    #[cfg(feature = "adblock")]
1144    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1145        use adblock::{
1146            lists::{FilterSet, ParseOptions, RuleTypes},
1147            Engine,
1148        };
1149
1150        lazy_static::lazy_static! {
1151            static ref AD_ENGINE: Engine = {
1152                let mut filter_set = FilterSet::new(false);
1153                let mut rules = ParseOptions::default();
1154                rules.rule_types = RuleTypes::All;
1155
1156                filter_set.add_filters(
1157                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1158                    rules,
1159                );
1160
1161                Engine::from_filter_set(filter_set, true)
1162            };
1163        };
1164
1165        let blockable = ResourceType::Image == event.resource_type
1166            || event.resource_type == ResourceType::Media
1167            || event.resource_type == ResourceType::Stylesheet
1168            || event.resource_type == ResourceType::Document
1169            || event.resource_type == ResourceType::Fetch
1170            || event.resource_type == ResourceType::Xhr;
1171
1172        let u = &event.request.url;
1173
1174        let block_request = blockable
1175            // set it to example.com for 3rd party handling is_same_site
1176        && {
1177            let request = adblock::request::Request::preparsed(
1178                 &u,
1179                 "example.com",
1180                 "example.com",
1181                 &event.resource_type.as_ref().to_lowercase(),
1182                 !event.request.is_same_site.unwrap_or_default());
1183
1184            AD_ENGINE.check_network_request(&request).matched
1185        };
1186
1187        block_request
1188    }
1189
1190    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1191        let response = if self
1192            .attempted_authentications
1193            .contains(event.request_id.as_ref())
1194        {
1195            AuthChallengeResponseResponse::CancelAuth
1196        } else if self.credentials.is_some() {
1197            self.attempted_authentications
1198                .insert(event.request_id.clone().into());
1199            AuthChallengeResponseResponse::ProvideCredentials
1200        } else {
1201            AuthChallengeResponseResponse::Default
1202        };
1203
1204        let mut auth = AuthChallengeResponse::new(response);
1205        if let Some(creds) = self.credentials.clone() {
1206            auth.username = Some(creds.username);
1207            auth.password = Some(creds.password);
1208        }
1209        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1210    }
1211
1212    /// Set the page offline network emulation condition.
1213    pub fn set_offline_mode(&mut self, value: bool) {
1214        if self.offline == value {
1215            return;
1216        }
1217        self.offline = value;
1218        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1219            .offline(self.offline)
1220            .latency(0)
1221            .download_throughput(-1.)
1222            .upload_throughput(-1.)
1223            .build()
1224        {
1225            self.push_cdp_request(network);
1226        }
1227    }
1228
1229    /// Request interception doesn't happen for data URLs with Network Service.
1230    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1231        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1232            if let Some(interception_id) = self
1233                .request_id_to_interception_id
1234                .remove(event.request_id.as_ref())
1235            {
1236                self.on_request(event, Some(interception_id));
1237            } else {
1238                // TODO remove the clone for event
1239                self.requests_will_be_sent
1240                    .insert(event.request_id.clone(), event.clone());
1241            }
1242        } else {
1243            self.on_request(event, None);
1244        }
1245    }
1246
1247    /// The request was served from the cache.
1248    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1249        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1250            request.from_memory_cache = true;
1251        }
1252    }
1253
1254    /// On network response received.
1255    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1256        let mut request_failed = false;
1257
1258        // Track how many bytes we actually deducted from this target.
1259        let mut deducted: u64 = 0;
1260
1261        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1262            let before = *max_bytes;
1263
1264            // encoded_data_length -> saturating cast to u64
1265            let received_bytes: u64 = event.response.encoded_data_length as u64;
1266
1267            // Safe parse of Content-Length
1268            let content_length: Option<u64> = event
1269                .response
1270                .headers
1271                .inner()
1272                .get("content-length")
1273                .and_then(|v| v.as_str())
1274                .and_then(|s| s.trim().parse::<u64>().ok());
1275
1276            // Deduct what we actually received
1277            *max_bytes = max_bytes.saturating_sub(received_bytes);
1278
1279            // If the declared size can't fit, zero out now
1280            if let Some(cl) = content_length {
1281                if cl > *max_bytes {
1282                    *max_bytes = 0;
1283                }
1284            }
1285
1286            request_failed = *max_bytes == 0;
1287
1288            // Compute exact delta deducted on this event
1289            deducted = before.saturating_sub(*max_bytes);
1290        }
1291
1292        // Bubble up the deduction (even if request continues)
1293        if deducted > 0 {
1294            self.queued_events
1295                .push_back(NetworkEvent::BytesConsumed(deducted));
1296        }
1297
1298        // block all network request moving forward.
1299        if request_failed && self.max_bytes_allowed.is_some() {
1300            self.set_block_all(true);
1301        }
1302
1303        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1304            request.set_response(event.response.clone());
1305            self.queued_events.push_back(if request_failed {
1306                NetworkEvent::RequestFailed(request)
1307            } else {
1308                NetworkEvent::RequestFinished(request)
1309            });
1310        }
1311    }
1312
1313    /// On network loading finished.
1314    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1315        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1316            if let Some(interception_id) = request.interception_id.as_ref() {
1317                self.attempted_authentications
1318                    .remove(interception_id.as_ref());
1319            }
1320            self.queued_events
1321                .push_back(NetworkEvent::RequestFinished(request));
1322        }
1323    }
1324
1325    /// On network loading failed.
1326    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1327        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1328            request.failure_text = Some(event.error_text.clone());
1329            if let Some(interception_id) = request.interception_id.as_ref() {
1330                self.attempted_authentications
1331                    .remove(interception_id.as_ref());
1332            }
1333            self.queued_events
1334                .push_back(NetworkEvent::RequestFailed(request));
1335        }
1336    }
1337
1338    /// On request will be sent.
1339    fn on_request(
1340        &mut self,
1341        event: &EventRequestWillBeSent,
1342        interception_id: Option<InterceptionId>,
1343    ) {
1344        let mut redirect_chain = Vec::new();
1345        let mut redirect_location = None;
1346
1347        if let Some(redirect_resp) = &event.redirect_response {
1348            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1349                if is_redirect_status(redirect_resp.status) {
1350                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1351                        if redirect_resp.url != location {
1352                            let fixed_location = location.replace(&redirect_resp.url, "");
1353
1354                            if !fixed_location.is_empty() {
1355                                request.response.as_mut().map(|resp| {
1356                                    resp.headers.0["Location"] =
1357                                        serde_json::Value::String(fixed_location.clone());
1358                                });
1359                            }
1360
1361                            redirect_location = Some(fixed_location);
1362                        }
1363                    }
1364                }
1365
1366                self.handle_request_redirect(
1367                    &mut request,
1368                    if let Some(redirect_location) = redirect_location {
1369                        let mut redirect_resp = redirect_resp.clone();
1370
1371                        if !redirect_location.is_empty() {
1372                            redirect_resp.headers.0["Location"] =
1373                                serde_json::Value::String(redirect_location);
1374                        }
1375
1376                        redirect_resp
1377                    } else {
1378                        redirect_resp.clone()
1379                    },
1380                );
1381
1382                redirect_chain = std::mem::take(&mut request.redirect_chain);
1383                redirect_chain.push(request);
1384            }
1385        }
1386
1387        let request = HttpRequest::new(
1388            event.request_id.clone(),
1389            event.frame_id.clone(),
1390            interception_id,
1391            self.user_request_interception_enabled,
1392            redirect_chain,
1393        );
1394
1395        self.requests.insert(event.request_id.clone(), request);
1396        self.queued_events
1397            .push_back(NetworkEvent::Request(event.request_id.clone()));
1398    }
1399
1400    /// Handle request redirect.
1401    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1402        request.set_response(response);
1403        if let Some(interception_id) = request.interception_id.as_ref() {
1404            self.attempted_authentications
1405                .remove(interception_id.as_ref());
1406        }
1407    }
1408}
1409
1410#[derive(Debug)]
1411pub enum NetworkEvent {
1412    /// Send a CDP request.
1413    SendCdpRequest((MethodId, serde_json::Value)),
1414    /// Request.
1415    Request(RequestId),
1416    /// Response
1417    Response(RequestId),
1418    /// Request failed.
1419    RequestFailed(HttpRequest),
1420    /// Request finished.
1421    RequestFinished(HttpRequest),
1422    /// Bytes consumed.
1423    BytesConsumed(u64),
1424}
1425
#[cfg(test)]
mod tests {
    //! Unit tests for the allow-list matcher, the script blocklist policy and
    //! the dynamic blacklist / whitelist matchers.
    //!
    //! The full blocking decision (`on_fetch_request_paused`) needs CDP events
    //! and is not exercised here.

    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // A random script that should not match your block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        // Same URL in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        // In strict mode, it should still be considered blocked at decision time.
        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
        assert!(nm.blacklist_strict);
    }

    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs