chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18    SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21    fetch::{
22        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24    },
25    network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
42lazy_static! {
43    /// General patterns for popular libraries and resources
44    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
45        "jquery",           // Covers jquery.min.js, jquery.js, etc.
46        "angular",
47        "react",            // Covers all React-related patterns
48        "vue",              // Covers all Vue-related patterns
49        "bootstrap",
50        "d3",
51        "lodash",
52        "ajax",
53        "application",
54        "app",              // Covers general app scripts like app.js
55        "main",
56        "index",
57        "bundle",
58        "vendor",
59        "runtime",
60        "polyfill",
61        "scripts",
62        "es2015.",
63        "es2020.",
64        "webpack",
65        "captcha",
66        "client",
67        "/cdn-cgi/challenge-platform/",
68        "/wp-content/js/",  // Covers Wordpress content
69        // Verified 3rd parties for request
70        "https://m.stripe.network/",
71        "https://challenges.cloudflare.com/",
72        "https://www.google.com/recaptcha/enterprise.js",
73        "https://www.google.com/recaptcha/api.js",
74        "https://google.com/recaptcha/api.js",
75        "https://captcha.px-cloud.net/",
76        "https://geo.captcha-delivery.com/",
77        "https://cdn.auth0.com/js/lock/",
78        "https://captcha.gtimg.com",
79        "https://cdn.auth0.com/client",
80        "https://js.stripe.com/",
81        "https://cdn.prod.website-files.com/", // webflow cdn scripts
82        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
83        "https://code.jquery.com/jquery-"
84    ];
85
86    /// Determine if a script should be rendered in the browser by name.
87    ///
88    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
89    /// but we keep it for compatibility and other call sites.
90    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
91
92    /// General patterns for popular libraries and resources
93    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
94        // Verified 3rd parties for request
95        "https://m.stripe.network/",
96        "https://challenges.cloudflare.com/",
97        "https://www.google.com/recaptcha/api.js",
98        "https://google.com/recaptcha/api.js",
99        "https://www.google.com/recaptcha/enterprise.js",
100        "https://js.stripe.com/",
101        "https://cdn.prod.website-files.com/", // webflow cdn scripts
102        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
103        "https://code.jquery.com/jquery-",
104        "https://ct.captcha-delivery.com/",
105        "https://geo.captcha-delivery.com/",
106        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
107        "https://ct.captcha-delivery.com/",
108        "https://cdn.auth0.com/client",
109        "https://captcha.px-cloud.net/",
110        "https://www.gstatic.com/recaptcha/",
111        "https://www.google.com/recaptcha/api2/",
112        "https://www.recaptcha.net/recaptcha/",
113        "https://js.hcaptcha.com/1/api.js",
114        "https://hcaptcha.com/1/api.js",
115        "https://js.datadome.co/tags.js",
116        "https://api-js.datadome.co/",
117        "https://client.perimeterx.net/",
118        "https://captcha.px-cdn.net/",
119        "https://captcha.px-cloud.net/",
120        "https://s.perimeterx.net/",
121        "https://client-api.arkoselabs.com/v2/",
122        "https://static.geetest.com/v4/gt4.js",
123        "https://static.geetest.com/",
124        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
125        "https://cdn.perfdrive.com/aperture/",
126        "https://assets.queue-it.net/",
127        "discourse-cdn.com/",
128        "/cdn-cgi/challenge-platform/",
129        "/_Incapsula_Resource"
130    ];
131
132    /// Determine if a script should be rendered in the browser by name.
133    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
134
135    /// path of a js framework
136    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
137        phf::phf_set! {
138            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
139            "_astro/", "_app/immutable"
140        }
141    };
142
143    /// Ignore the content types.
144    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
145        "application/pdf",
146        "application/zip",
147        "application/x-rar-compressed",
148        "application/x-tar",
149        "image/png",
150        "image/jpeg",
151        "image/gif",
152        "image/bmp",
153        "image/webp",
154        "image/svg+xml",
155        "video/mp4",
156        "video/x-msvideo",
157        "video/x-matroska",
158        "video/webm",
159        "audio/mpeg",
160        "audio/ogg",
161        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
162        "application/vnd.ms-excel",
163        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
164        "application/vnd.ms-powerpoint",
165        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
166        "application/x-7z-compressed",
167        "application/x-rpm",
168        "application/x-shockwave-flash",
169        "application/rtf",
170    };
171
172    /// Ignore the resources for visual content types.
173    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
174        "Image",
175        "Media",
176        "Font"
177    };
178
179    /// Ignore the resources for visual content types.
180    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
181        "CspViolationReport",
182        "Manifest",
183        "Other",
184        "Prefetch",
185        "Ping",
186    };
187
188    /// Case insenstive css matching
189    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
190
191    /// The command chain.
192    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
193        let enable = EnableParams::default();
194
195        if let Ok(c) = serde_json::to_value(&enable) {
196            vec![(enable.identifier(), c)]
197        } else {
198            vec![]
199        }
200    };
201
202    /// The command chain with https ignore.
203    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
204        let enable = EnableParams::default();
205        let mut v = vec![];
206        if let Ok(c) = serde_json::to_value(&enable) {
207            v.push((enable.identifier(), c));
208        }
209        let ignore = SetIgnoreCertificateErrorsParams::new(true);
210        if let Ok(ignored) = serde_json::to_value(&ignore) {
211            v.push((ignore.identifier(), ignored));
212        }
213
214        v
215    };
216
217    /// Enable the fetch intercept command
218    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
219        fetch::EnableParams::builder()
220        .handle_auth_requests(true)
221        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
222        .build()
223    };
224}
225
/// Returns `true` when `status` is one of the redirect status codes recognized here:
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
230
231#[derive(Debug)]
232/// The base network manager.
233pub struct NetworkManager {
234    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
235    ///
236    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
237    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
238    /// Consumers pull from this queue via `poll()`.
239    queued_events: VecDeque<NetworkEvent>,
240    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
241    ///
242    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
243    /// certificates (self-signed, expired, MITM proxies, etc.).
244    ignore_httpserrors: bool,
245    /// Active in-flight requests keyed by CDP `RequestId`.
246    ///
247    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
248    /// and final state used to emit `RequestFinished` / `RequestFailed`.
249    requests: HashMap<RequestId, HttpRequest>,
250    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
251    /// `Fetch.requestPaused` arrives later (or vice versa).
252    ///
253    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
254    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
255    // TODO put event in an Arc?
256    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
257    /// Extra HTTP headers to apply to subsequent network requests via CDP.
258    ///
259    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
260    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
261    extra_headers: std::collections::HashMap<String, String>,
262    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
263    ///
264    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
265    /// store the interception id here so it can be attached to the `HttpRequest` once the
266    /// network request is observed.
267    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
268    /// Whether the user has disabled the browser cache.
269    ///
270    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
271    /// `set_cache_enabled()`. Internally the field is stored as “disabled” to match the CDP API.
272    user_cache_disabled: bool,
273    /// Tracks which requests have already attempted authentication.
274    ///
275    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
276    /// authentication challenges (407/401). Once a request id is present here, subsequent
277    /// challenges for the same request are canceled.
278    attempted_authentications: HashSet<RequestId>,
279    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
280    ///
281    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
282    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
283    credentials: Option<Credentials>,
284    /// User-facing toggle indicating whether request interception is desired.
285    ///
286    /// This is the “intent” flag controlled by `set_request_interception()`. On its own it does
287    /// not guarantee interception is active; interception is actually enabled/disabled by
288    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
289    ///
290    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
291    /// enabled to satisfy auth challenges.
292    pub(crate) user_request_interception_enabled: bool,
293    /// Hard kill-switch to block all network traffic.
294    ///
295    /// When `true`, the manager immediately blocks requests (typically via
296    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
297    /// and short-circuits most decision logic. This is used for safety conditions such as
298    /// exceeding `max_bytes_allowed` or other runtime protections.
299    block_all: bool,
300    /// Tracks whether the Fetch interception protocol is currently enabled in CDP.
301    ///
302    /// This is the “actual state” flag that reflects whether we have sent `Fetch.enable` or
303    /// `Fetch.disable` to the browser. It is updated by `update_protocol_request_interception()`
304    /// when `user_request_interception_enabled` or `credentials` change.
305    pub(crate) protocol_request_interception_enabled: bool,
306    /// The network is offline.
307    offline: bool,
308    /// The page request timeout.
309    pub request_timeout: Duration,
310    // made_request: bool,
311    /// Ignore visuals (no pings, prefetching, and etc).
312    pub ignore_visuals: bool,
313    /// Block CSS stylesheets.
314    pub block_stylesheets: bool,
315    /// Block javascript that is not critical to rendering.
316    ///
317    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
318    /// by itself (it remains for config compatibility).
319    pub block_javascript: bool,
320    /// Block analytics from rendering
321    pub block_analytics: bool,
322    /// Only html from loading.
323    pub only_html: bool,
324    /// Is xml document?
325    pub xml_document: bool,
326    /// The custom intercept handle logic to run on the website.
327    pub intercept_manager: NetworkInterceptManager,
328    /// Track the amount of times the document reloaded.
329    pub document_reload_tracker: u8,
330    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
331    pub document_target_url: String,
332    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
333    pub document_target_domain: String,
334    /// The max bytes to receive.
335    pub max_bytes_allowed: Option<u64>,
336    #[cfg(feature = "_cache")]
337    /// The cache site_key to use.
338    pub cache_site_key: Option<String>,
339    /// The cache policy to use.
340    #[cfg(feature = "_cache")]
341    pub cache_policy: Option<BasicCachePolicy>,
342    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
343    whitelist_patterns: Vec<String>,
344    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
345    whitelist_matcher: Option<AhoCorasick>,
346    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
347    blacklist_patterns: Vec<String>,
348    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
349    blacklist_matcher: Option<AhoCorasick>,
350    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
351    blacklist_strict: bool,
352}
353
354impl NetworkManager {
355    /// A new network manager.
356    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
357        Self {
358            queued_events: Default::default(),
359            ignore_httpserrors,
360            requests: Default::default(),
361            requests_will_be_sent: Default::default(),
362            extra_headers: Default::default(),
363            request_id_to_interception_id: Default::default(),
364            user_cache_disabled: false,
365            attempted_authentications: Default::default(),
366            credentials: None,
367            block_all: false,
368            user_request_interception_enabled: false,
369            protocol_request_interception_enabled: false,
370            offline: false,
371            request_timeout,
372            ignore_visuals: false,
373            block_javascript: false,
374            block_stylesheets: false,
375            block_analytics: true,
376            only_html: false,
377            xml_document: false,
378            intercept_manager: NetworkInterceptManager::Unknown,
379            document_reload_tracker: 0,
380            document_target_url: String::new(),
381            document_target_domain: String::new(),
382            whitelist_patterns: Vec::new(),
383            whitelist_matcher: None,
384            blacklist_patterns: Vec::new(),
385            blacklist_matcher: None,
386            blacklist_strict: true,
387            max_bytes_allowed: None,
388            #[cfg(feature = "_cache")]
389            cache_site_key: None,
390            #[cfg(feature = "_cache")]
391            cache_policy: None,
392        }
393    }
394
395    /// Replace the whitelist patterns (compiled once).
396    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
397    where
398        I: IntoIterator<Item = S>,
399        S: Into<String>,
400    {
401        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
402        self.rebuild_whitelist_matcher();
403    }
404
405    /// Replace the blacklist patterns (compiled once).
406    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
407    where
408        I: IntoIterator<Item = S>,
409        S: Into<String>,
410    {
411        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
412        self.rebuild_blacklist_matcher();
413    }
414
415    /// Add one pattern (cheap) and rebuild (call this sparingly).
416    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
417        self.blacklist_patterns.push(pattern.into());
418        self.rebuild_blacklist_matcher();
419    }
420
421    /// Add many patterns and rebuild once.
422    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
423    where
424        I: IntoIterator<Item = S>,
425        S: Into<String>,
426    {
427        self.blacklist_patterns
428            .extend(patterns.into_iter().map(Into::into));
429        self.rebuild_blacklist_matcher();
430    }
431
432    /// Clear blacklist entirely.
433    pub fn clear_blacklist(&mut self) {
434        self.blacklist_patterns.clear();
435        self.blacklist_matcher = None;
436    }
437
438    /// Control precedence: when true, blacklist always wins.
439    pub fn set_blacklist_strict(&mut self, strict: bool) {
440        self.blacklist_strict = strict;
441    }
442
443    #[inline]
444    fn rebuild_blacklist_matcher(&mut self) {
445        if self.blacklist_patterns.is_empty() {
446            self.blacklist_matcher = None;
447            return;
448        }
449
450        let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
451        self.blacklist_matcher = AhoCorasick::new(refs).ok();
452    }
453
454    #[inline]
455    fn is_blacklisted(&self, url: &str) -> bool {
456        self.blacklist_matcher
457            .as_ref()
458            .map(|m| m.is_match(url))
459            .unwrap_or(false)
460    }
461
462    /// Add one pattern (cheap) and rebuild (call this sparingly).
463    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
464        self.whitelist_patterns.push(pattern.into());
465        self.rebuild_whitelist_matcher();
466    }
467
468    /// Add many patterns and rebuild once.
469    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
470    where
471        I: IntoIterator<Item = S>,
472        S: Into<String>,
473    {
474        self.whitelist_patterns
475            .extend(patterns.into_iter().map(Into::into));
476        self.rebuild_whitelist_matcher();
477    }
478
479    #[inline]
480    fn rebuild_whitelist_matcher(&mut self) {
481        if self.whitelist_patterns.is_empty() {
482            self.whitelist_matcher = None;
483            return;
484        }
485
486        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
487
488        // If building fails (shouldn’t for simple patterns), just disable matcher.
489        self.whitelist_matcher = AhoCorasick::new(refs).ok();
490    }
491
492    #[inline]
493    fn is_whitelisted(&self, url: &str) -> bool {
494        self.whitelist_matcher
495            .as_ref()
496            .map(|m| m.is_match(url))
497            .unwrap_or(false)
498    }
499
500    /// Commands to init the chain with.
501    pub fn init_commands(&self) -> CommandChain {
502        let cmds = if self.ignore_httpserrors {
503            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
504        } else {
505            INIT_CHAIN.clone()
506        };
507        CommandChain::new(cmds, self.request_timeout)
508    }
509
510    /// Push the CDP request.
511    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
512        let method = cmd.identifier();
513        if let Ok(params) = serde_json::to_value(cmd) {
514            self.queued_events
515                .push_back(NetworkEvent::SendCdpRequest((method, params)));
516        }
517    }
518
519    /// The next event to handle.
520    pub fn poll(&mut self) -> Option<NetworkEvent> {
521        self.queued_events.pop_front()
522    }
523
524    /// Get the extra headers.
525    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
526        &self.extra_headers
527    }
528
529    /// Set extra HTTP headers.
530    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
531        self.extra_headers = headers;
532        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
533        self.extra_headers.remove("Proxy-Authorization");
534        if !self.extra_headers.is_empty() {
535            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
536                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
537            }
538        }
539    }
540
541    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
542        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
543    }
544
545    pub fn set_block_all(&mut self, block_all: bool) {
546        self.block_all = block_all;
547    }
548
549    pub fn set_request_interception(&mut self, enabled: bool) {
550        self.user_request_interception_enabled = enabled;
551        self.update_protocol_request_interception();
552    }
553
554    pub fn set_cache_enabled(&mut self, enabled: bool) {
555        let run = self.user_cache_disabled != !enabled;
556        self.user_cache_disabled = !enabled;
557        if run {
558            self.update_protocol_cache_disabled();
559        }
560    }
561
562    /// Enable fetch interception.
563    pub fn enable_request_intercept(&mut self) {
564        self.protocol_request_interception_enabled = true;
565    }
566
567    /// Disable fetch interception.
568    pub fn disable_request_intercept(&mut self) {
569        self.protocol_request_interception_enabled = false;
570    }
571
572    /// Set the cache site key.
573    #[cfg(feature = "_cache")]
574    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
575        self.cache_site_key = cache_site_key;
576    }
577
578    /// Set the cache policy.
579    #[cfg(feature = "_cache")]
580    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
581        self.cache_policy = cache_policy;
582    }
583
584    pub fn update_protocol_cache_disabled(&mut self) {
585        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
586    }
587
588    pub fn authenticate(&mut self, credentials: Credentials) {
589        self.credentials = Some(credentials);
590        self.update_protocol_request_interception();
591        self.protocol_request_interception_enabled = true;
592    }
593
594    fn update_protocol_request_interception(&mut self) {
595        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
596
597        if enabled == self.protocol_request_interception_enabled {
598            return;
599        }
600
601        if enabled {
602            self.push_cdp_request(ENABLE_FETCH.clone())
603        } else {
604            self.push_cdp_request(DisableParams::default())
605        }
606    }
607
608    /// Blocklist-only script blocking.
609    /// Returns true only when the URL matches an explicit blocklist condition.
610    #[inline]
611    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
612        // If analytics blocking is off, skip all analytics tries.
613        let block_analytics = self.block_analytics;
614
615        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
616        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
617        {
618            return true;
619        }
620
621        // 2) Custom website block list (explicit).
622        if crate::handler::blockers::block_websites::block_website(url) {
623            return true;
624        }
625
626        // 3) Path-based explicit tries / fallbacks.
627        //
628        // We run these on:
629        // - path with leading slash ("/js/app.js")
630        // - path without leading slash ("js/app.js")
631        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
632        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
633            // Remove query/fragment so matching stays stable.
634            let p_slash = Self::strip_query_fragment(path_with_slash);
635            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
636
637            // Basename for filename-only lists.
638            let base = match p_slash.rsplit('/').next() {
639                Some(b) => b,
640                None => p_slash,
641            };
642
643            // ---- Trie checks ----
644            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
645            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
646                return true;
647            }
648            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
649                return true;
650            }
651            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
652                return true;
653            }
654
655            // Base-path ignore tries (framework noise / known ignorable script paths).
656            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
657            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
658                return true;
659            }
660
661            // Style path ignores only when visuals are ignored.
662            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
663                return true;
664            }
665        }
666
667        false
668    }
669
670    /// Extract the absolute URL path portion WITH the leading slash.
671    ///
672    /// Example:
673    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
674    #[inline]
675    fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
676        // find scheme separator
677        let idx = url.find("//")?;
678        let after_slashes = idx + 2;
679
680        // find first slash after host
681        let slash_rel = url[after_slashes..].find('/')?;
682        let slash_idx = after_slashes + slash_rel;
683
684        if slash_idx < url.len() {
685            Some(&url[slash_idx..])
686        } else {
687            None
688        }
689    }
690
691    /// Strip query string and fragment from a path-ish string.
692    ///
693    /// Example:
694    /// - "/a/b.js?x=1#y" -> "/a/b.js"
695    #[inline]
696    fn strip_query_fragment(s: &str) -> &str {
697        let q = s.find('?');
698        let h = s.find('#');
699
700        match (q, h) {
701            (None, None) => s,
702            (Some(i), None) => &s[..i],
703            (None, Some(i)) => &s[..i],
704            (Some(i), Some(j)) => &s[..i.min(j)],
705        }
706    }
707
708    /// Determine if the request should be skipped.
709    #[inline]
710    fn skip_xhr(
711        &self,
712        skip_networking: bool,
713        event: &EventRequestPaused,
714        network_event: bool,
715    ) -> bool {
716        // XHR check
717        if !skip_networking && network_event {
718            let request_url = event.request.url.as_str();
719
720            // check if part of ignore scripts.
721            let skip_analytics =
722                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
723
724            if skip_analytics {
725                true
726            } else if self.block_stylesheets || self.ignore_visuals {
727                let block_css = self.block_stylesheets;
728                let block_media = self.ignore_visuals;
729
730                let mut block_request = false;
731
732                if let Some(position) = request_url.rfind('.') {
733                    let hlen = request_url.len();
734                    let has_asset = hlen - position;
735
736                    if has_asset >= 3 {
737                        let next_position = position + 1;
738
739                        if block_media
740                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
741                                &request_url[next_position..].into(),
742                            )
743                        {
744                            block_request = true;
745                        } else if block_css {
746                            block_request =
747                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
748                                    .contains(&**CSS_EXTENSION)
749                        }
750                    }
751                }
752
753                if !block_request {
754                    block_request = ignore_script_xhr_media(request_url);
755                }
756
757                block_request
758            } else {
759                skip_networking
760            }
761        } else {
762            skip_networking
763        }
764    }
765
766    #[cfg(feature = "adblock")]
767    #[inline]
768    /// Detect if ad enabled.
769    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
770        if skip_networking {
771            true
772        } else {
773            block_ads(&event.request.url) || self.detect_ad(event)
774        }
775    }
776
777    /// When adblock feature is disabled, this is a no-op.
778    #[cfg(not(feature = "adblock"))]
779    #[inline]
780    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
781        use crate::handler::blockers::block_websites::block_ads;
782        if skip_networking {
783            true
784        } else {
785            block_ads(&event.request.url)
786        }
787    }
788
789    #[inline]
790    /// Fail request
791    fn fail_request_blocked(
792        &mut self,
793        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
794    ) {
795        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
796            request_id.clone(),
797            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
798        );
799        self.push_cdp_request(params);
800    }
801
802    #[inline]
803    /// Fulfill request
804    fn fulfill_request_empty_200(
805        &mut self,
806        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
807    ) {
808        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
809            request_id.clone(),
810            200,
811        );
812        self.push_cdp_request(params);
813    }
814
815    #[cfg(feature = "_cache")]
816    #[inline]
817    /// Fulfill a paused Fetch request from cached bytes + header map.
818    ///
819    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
820    fn fulfill_request_from_cache(
821        &mut self,
822        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
823        body: &[u8],
824        headers: &std::collections::HashMap<String, String>,
825        status: i64,
826    ) {
827        use crate::cdp::browser_protocol::fetch::HeaderEntry;
828        use crate::handler::network::fetch::FulfillRequestParams;
829        use base64::Engine;
830
831        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
832
833        for (k, v) in headers.iter() {
834            resp_headers.push(HeaderEntry {
835                name: k.clone().into(),
836                value: v.clone().into(),
837            });
838        }
839
840        let mut params = FulfillRequestParams::new(request_id.clone(), status);
841
842        // TODO: have this already encoded prior.
843        params.body = Some(
844            base64::engine::general_purpose::STANDARD
845                .encode(body)
846                .into(),
847        );
848
849        params.response_headers = Some(resp_headers);
850
851        self.push_cdp_request(params);
852    }
853
854    #[inline]
855    /// Continue the request url.
856    fn continue_request_with_url(
857        &mut self,
858        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
859        url: Option<&str>,
860        intercept_response: bool,
861    ) {
862        let mut params = ContinueRequestParams::new(request_id.clone());
863        if let Some(url) = url {
864            params.url = Some(url.to_string());
865            params.intercept_response = Some(intercept_response);
866        }
867        self.push_cdp_request(params);
868    }
869
870    /// On fetch request paused interception.
871    #[inline]
872    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
873        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
874            return;
875        }
876
877        let resource_type = &event.resource_type;
878
879        if self.block_all {
880            tracing::debug!(
881                "Blocked (block_all): {:?} - {}",
882                event.resource_type,
883                event.request.url
884            );
885            return self.fail_request_blocked(&event.request_id);
886        }
887
888        if let Some(network_id) = event.network_id.as_ref() {
889            if let Some(request_will_be_sent) =
890                self.requests_will_be_sent.remove(network_id.as_ref())
891            {
892                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
893            } else {
894                self.request_id_to_interception_id
895                    .insert(network_id.clone(), event.request_id.clone().into());
896            }
897        }
898
899        // From here on, we handle the full decision tree.
900        let javascript_resource = *resource_type == ResourceType::Script;
901        let document_resource = *resource_type == ResourceType::Document;
902        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
903
904        // Start with static / cheap skip checks.
905        let mut skip_networking =
906            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
907
908        // Also short-circuit if we've reloaded this document too many times.
909        if !skip_networking {
910            skip_networking = self.document_reload_tracker >= 3;
911        }
912
913        // Handle document redirect / masking and track xml documents.
914        let (current_url_cow, had_replacer) =
915            self.handle_document_replacement_and_tracking(event, document_resource);
916
917        let current_url: &str = current_url_cow.as_ref();
918
919        let blacklisted = self.is_blacklisted(current_url);
920
921        if !self.blacklist_strict && blacklisted {
922            skip_networking = true;
923        }
924
925        if !skip_networking {
926            // Allow XSL for sitemap XML.
927            if self.xml_document && current_url.ends_with(".xsl") {
928                skip_networking = false;
929            } else {
930                skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
931            }
932        }
933
934        skip_networking = self.detect_ad_if_enabled(event, skip_networking);
935
936        // Ignore embedded scripts when only_html or ignore_visuals is set.
937        if !skip_networking
938            && self.block_javascript
939            && (self.only_html || self.ignore_visuals)
940            && (javascript_resource || document_resource)
941        {
942            skip_networking = ignore_script_embedded(current_url);
943        }
944
945        // Script policy: allow-by-default.
946        // Block only if explicit block list patterns match.
947        if !skip_networking && javascript_resource {
948            skip_networking = self.should_block_script_blocklist_only(current_url);
949        }
950
951        // XHR / data resources.
952        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
953
954        // Custom interception layer.
955        if !skip_networking && (javascript_resource || network_resource || document_resource) {
956            skip_networking = self.intercept_manager.intercept_detection(
957                current_url,
958                self.ignore_visuals,
959                network_resource,
960            );
961        }
962
963        // Custom website block list.
964        if !skip_networking && (javascript_resource || network_resource) {
965            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
966        }
967
968        // whitelist 3rd party
969        // not required unless explicit blocking.
970        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
971        {
972            skip_networking = false;
973        }
974
975        // check if the url is in the whitelist.
976        if skip_networking && self.is_whitelisted(current_url) {
977            skip_networking = false;
978        }
979
980        if self.blacklist_strict && blacklisted {
981            skip_networking = true;
982        }
983
984        if skip_networking {
985            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
986            self.fulfill_request_empty_200(&event.request_id);
987        } else {
988            #[cfg(feature = "_cache")]
989            {
990                if let (Some(policy), Some(cache_site_key)) =
991                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
992                {
993                    let current_url = format!("{}:{}", event.request.method, &current_url);
994
995                    if let Some((res, cache_policy)) =
996                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
997                    {
998                        if policy.allows_cached(&cache_policy) {
999                            tracing::debug!(
1000                                "Remote Cached: {:?} - {}",
1001                                resource_type,
1002                                &current_url
1003                            );
1004                            return self.fulfill_request_from_cache(
1005                                &event.request_id,
1006                                &res.body,
1007                                &res.headers,
1008                                res.status as i64,
1009                            );
1010                        }
1011                    }
1012                }
1013            }
1014
1015            // check our frame cache for the run.
1016            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
1017            self.continue_request_with_url(
1018                &event.request_id,
1019                if had_replacer {
1020                    Some(current_url)
1021                } else {
1022                    None
1023                },
1024                !had_replacer,
1025            );
1026        }
1027    }
1028
1029    /// Shared "visuals + basic blocking" logic.
1030    ///
1031    /// IMPORTANT: Scripts are NOT blocked here anymore.
1032    /// Scripts are allowed by default and only blocked via explicit blocklists
1033    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1034    #[inline]
1035    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1036        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1037            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1038    }
1039
1040    /// Does the network manager have a target domain?
1041    pub fn has_target_domain(&self) -> bool {
1042        !self.document_target_url.is_empty()
1043    }
1044
1045    /// Set the target page url for tracking.
1046    pub fn set_page_url(&mut self, page_target_url: String) {
1047        let host_base = host_and_rest(&page_target_url)
1048            .map(|(h, _)| base_domain_from_host(h))
1049            .unwrap_or("");
1050
1051        self.document_target_domain = host_base.to_string();
1052        self.document_target_url = page_target_url;
1053    }
1054
1055    /// Clear the initial target domain on every navigation.
1056    pub fn clear_target_domain(&mut self) {
1057        self.document_reload_tracker = 0;
1058        self.document_target_url = Default::default();
1059        self.document_target_domain = Default::default();
1060    }
1061
1062    /// Handles:
1063    /// - document reload tracking (`document_reload_tracker`)
1064    /// - redirect masking / replacement
1065    /// - xml document detection (`xml_document`)
1066    /// - `document_target_url` updates
1067    ///
1068    /// Returns (current_url, had_replacer).
1069    #[inline]
1070    fn handle_document_replacement_and_tracking<'a>(
1071        &mut self,
1072        event: &'a EventRequestPaused,
1073        document_resource: bool,
1074    ) -> (Cow<'a, str>, bool) {
1075        let mut replacer: Option<String> = None;
1076        let current_url = event.request.url.as_str();
1077
1078        if document_resource {
1079            if self.document_target_url == current_url {
1080                self.document_reload_tracker += 1;
1081            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1082            {
1083                let (http_document_replacement, mut https_document_replacement) =
1084                    if self.document_target_url.starts_with("http://") {
1085                        (
1086                            self.document_target_url.replacen("http://", "http//", 1),
1087                            self.document_target_url.replacen("http://", "https://", 1),
1088                        )
1089                    } else {
1090                        (
1091                            self.document_target_url.replacen("https://", "https//", 1),
1092                            self.document_target_url.replacen("https://", "http://", 1),
1093                        )
1094                    };
1095
1096                // Track trailing slash to restore later.
1097                let trailing = https_document_replacement.ends_with('/');
1098                if trailing {
1099                    https_document_replacement.pop();
1100                }
1101                if https_document_replacement.ends_with('/') {
1102                    https_document_replacement.pop();
1103                }
1104
1105                let redirect_mask = format!(
1106                    "{}{}",
1107                    https_document_replacement, http_document_replacement
1108                );
1109
1110                if current_url == redirect_mask {
1111                    replacer = Some(if trailing {
1112                        format!("{}/", https_document_replacement)
1113                    } else {
1114                        https_document_replacement
1115                    });
1116                }
1117            }
1118
1119            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1120                self.xml_document = true;
1121            }
1122
1123            // Track last seen document URL.
1124            self.document_target_url = event.request.url.clone();
1125            self.document_target_domain = host_and_rest(&self.document_target_url)
1126                .map(|(h, _)| base_domain_from_host(h).to_string())
1127                .unwrap_or_default();
1128        }
1129
1130        let current_url_cow = match replacer {
1131            Some(r) => Cow::Owned(r),
1132            None => Cow::Borrowed(event.request.url.as_str()),
1133        };
1134
1135        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1136        (current_url_cow, had_replacer)
1137    }
1138
1139    /// Perform a page intercept for chrome
1140    #[cfg(feature = "adblock")]
1141    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1142        use adblock::{
1143            lists::{FilterSet, ParseOptions, RuleTypes},
1144            Engine,
1145        };
1146
1147        lazy_static::lazy_static! {
1148            static ref AD_ENGINE: Engine = {
1149                let mut filter_set = FilterSet::new(false);
1150                let mut rules = ParseOptions::default();
1151                rules.rule_types = RuleTypes::All;
1152
1153                filter_set.add_filters(
1154                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1155                    rules,
1156                );
1157
1158                Engine::from_filter_set(filter_set, true)
1159            };
1160        };
1161
1162        let blockable = ResourceType::Image == event.resource_type
1163            || event.resource_type == ResourceType::Media
1164            || event.resource_type == ResourceType::Stylesheet
1165            || event.resource_type == ResourceType::Document
1166            || event.resource_type == ResourceType::Fetch
1167            || event.resource_type == ResourceType::Xhr;
1168
1169        let u = &event.request.url;
1170
1171        let block_request = blockable
1172            // set it to example.com for 3rd party handling is_same_site
1173        && {
1174            let request = adblock::request::Request::preparsed(
1175                 &u,
1176                 "example.com",
1177                 "example.com",
1178                 &event.resource_type.as_ref().to_lowercase(),
1179                 !event.request.is_same_site.unwrap_or_default());
1180
1181            AD_ENGINE.check_network_request(&request).matched
1182        };
1183
1184        block_request
1185    }
1186
1187    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1188        let response = if self
1189            .attempted_authentications
1190            .contains(event.request_id.as_ref())
1191        {
1192            AuthChallengeResponseResponse::CancelAuth
1193        } else if self.credentials.is_some() {
1194            self.attempted_authentications
1195                .insert(event.request_id.clone().into());
1196            AuthChallengeResponseResponse::ProvideCredentials
1197        } else {
1198            AuthChallengeResponseResponse::Default
1199        };
1200
1201        let mut auth = AuthChallengeResponse::new(response);
1202        if let Some(creds) = self.credentials.clone() {
1203            auth.username = Some(creds.username);
1204            auth.password = Some(creds.password);
1205        }
1206        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1207    }
1208
1209    /// Set the page offline network emulation condition.
1210    pub fn set_offline_mode(&mut self, value: bool) {
1211        if self.offline == value {
1212            return;
1213        }
1214        self.offline = value;
1215        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1216            .offline(self.offline)
1217            .latency(0)
1218            .download_throughput(-1.)
1219            .upload_throughput(-1.)
1220            .build()
1221        {
1222            self.push_cdp_request(network);
1223        }
1224    }
1225
1226    /// Request interception doesn't happen for data URLs with Network Service.
1227    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1228        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1229            if let Some(interception_id) = self
1230                .request_id_to_interception_id
1231                .remove(event.request_id.as_ref())
1232            {
1233                self.on_request(event, Some(interception_id));
1234            } else {
1235                // TODO remove the clone for event
1236                self.requests_will_be_sent
1237                    .insert(event.request_id.clone(), event.clone());
1238            }
1239        } else {
1240            self.on_request(event, None);
1241        }
1242    }
1243
1244    /// The request was served from the cache.
1245    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1246        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1247            request.from_memory_cache = true;
1248        }
1249    }
1250
1251    /// On network response received.
1252    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1253        let mut request_failed = false;
1254
1255        // Track how many bytes we actually deducted from this target.
1256        let mut deducted: u64 = 0;
1257
1258        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1259            let before = *max_bytes;
1260
1261            // encoded_data_length -> saturating cast to u64
1262            let received_bytes: u64 = event.response.encoded_data_length as u64;
1263
1264            // Safe parse of Content-Length
1265            let content_length: Option<u64> = event
1266                .response
1267                .headers
1268                .inner()
1269                .get("content-length")
1270                .and_then(|v| v.as_str())
1271                .and_then(|s| s.trim().parse::<u64>().ok());
1272
1273            // Deduct what we actually received
1274            *max_bytes = max_bytes.saturating_sub(received_bytes);
1275
1276            // If the declared size can't fit, zero out now
1277            if let Some(cl) = content_length {
1278                if cl > *max_bytes {
1279                    *max_bytes = 0;
1280                }
1281            }
1282
1283            request_failed = *max_bytes == 0;
1284
1285            // Compute exact delta deducted on this event
1286            deducted = before.saturating_sub(*max_bytes);
1287        }
1288
1289        // Bubble up the deduction (even if request continues)
1290        if deducted > 0 {
1291            self.queued_events
1292                .push_back(NetworkEvent::BytesConsumed(deducted));
1293        }
1294
1295        // block all network request moving forward.
1296        if request_failed && self.max_bytes_allowed.is_some() {
1297            self.set_block_all(true);
1298        }
1299
1300        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1301            request.set_response(event.response.clone());
1302            self.queued_events.push_back(if request_failed {
1303                NetworkEvent::RequestFailed(request)
1304            } else {
1305                NetworkEvent::RequestFinished(request)
1306            });
1307        }
1308    }
1309
1310    /// On network loading finished.
1311    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1312        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1313            if let Some(interception_id) = request.interception_id.as_ref() {
1314                self.attempted_authentications
1315                    .remove(interception_id.as_ref());
1316            }
1317            self.queued_events
1318                .push_back(NetworkEvent::RequestFinished(request));
1319        }
1320    }
1321
1322    /// On network loading failed.
1323    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1324        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1325            request.failure_text = Some(event.error_text.clone());
1326            if let Some(interception_id) = request.interception_id.as_ref() {
1327                self.attempted_authentications
1328                    .remove(interception_id.as_ref());
1329            }
1330            self.queued_events
1331                .push_back(NetworkEvent::RequestFailed(request));
1332        }
1333    }
1334
1335    /// On request will be sent.
1336    fn on_request(
1337        &mut self,
1338        event: &EventRequestWillBeSent,
1339        interception_id: Option<InterceptionId>,
1340    ) {
1341        let mut redirect_chain = Vec::new();
1342        let mut redirect_location = None;
1343
1344        if let Some(redirect_resp) = &event.redirect_response {
1345            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1346                if is_redirect_status(redirect_resp.status) {
1347                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1348                        if redirect_resp.url != location {
1349                            let fixed_location = location.replace(&redirect_resp.url, "");
1350
1351                            if !fixed_location.is_empty() {
1352                                request.response.as_mut().map(|resp| {
1353                                    resp.headers.0["Location"] =
1354                                        serde_json::Value::String(fixed_location.clone());
1355                                });
1356                            }
1357
1358                            redirect_location = Some(fixed_location);
1359                        }
1360                    }
1361                }
1362
1363                self.handle_request_redirect(
1364                    &mut request,
1365                    if let Some(redirect_location) = redirect_location {
1366                        let mut redirect_resp = redirect_resp.clone();
1367
1368                        if !redirect_location.is_empty() {
1369                            redirect_resp.headers.0["Location"] =
1370                                serde_json::Value::String(redirect_location);
1371                        }
1372
1373                        redirect_resp
1374                    } else {
1375                        redirect_resp.clone()
1376                    },
1377                );
1378
1379                redirect_chain = std::mem::take(&mut request.redirect_chain);
1380                redirect_chain.push(request);
1381            }
1382        }
1383
1384        let request = HttpRequest::new(
1385            event.request_id.clone(),
1386            event.frame_id.clone(),
1387            interception_id,
1388            self.user_request_interception_enabled,
1389            redirect_chain,
1390        );
1391
1392        self.requests.insert(event.request_id.clone(), request);
1393        self.queued_events
1394            .push_back(NetworkEvent::Request(event.request_id.clone()));
1395    }
1396
1397    /// Handle request redirect.
1398    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1399        request.set_response(response);
1400        if let Some(interception_id) = request.interception_id.as_ref() {
1401            self.attempted_authentications
1402                .remove(interception_id.as_ref());
1403        }
1404    }
1405}
1406
/// Events the network manager pushes onto its `queued_events` queue.
#[derive(Debug)]
pub enum NetworkEvent {
    /// Send a CDP request: a method id paired with a JSON value
    /// (presumably the serialized command params; confirm against `push_cdp_request`).
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request was registered; carries its request id.
    Request(RequestId),
    /// Response, identified by request id.
    Response(RequestId),
    /// Request failed: loading failed or the byte budget was exhausted.
    RequestFailed(HttpRequest),
    /// Request finished: a response was received or loading finished.
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the byte budget by one response event.
    BytesConsumed(u64),
}
1422
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// The third-party allow matcher accepts known-good scripts and rejects others.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// Scripts that match no blocklist pattern are allowed.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // A random script that should not match your block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    /// Scripts that match the ignore trie or blocklist are blocked.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    /// User-supplied blacklist patterns match the URLs that contain them.
    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    /// With strict mode on, a URL can sit in both lists and the strict flag is set.
    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        // Same URL in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        // In strict mode, it should still be considered blocked at decision time.
        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
        assert!(nm.blacklist_strict);
    }

    /// With strict mode off, both matchers still match and the strict flag is clear.
    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs