chromiumoxide/handler/
network.rs

1#[cfg(any(feature = "adblock", feature = "firewall"))]
2use super::blockers::block_websites::block_ads;
3use super::blockers::{
4    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
5    xhr::IGNORE_XHR_ASSETS,
6};
7use crate::auth::Credentials;
8#[cfg(feature = "_cache")]
9use crate::cache::BasicCachePolicy;
10use crate::cmd::CommandChain;
11use crate::handler::http::HttpRequest;
12use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
13use aho_corasick::AhoCorasick;
14use case_insensitive_string::CaseInsensitiveString;
15use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
16use chromiumoxide_cdp::cdp::browser_protocol::network::{
17    EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
18    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
19    InterceptionId, NetworkConditions, RequestId, ResourceType, Response, SetCacheDisabledParams,
20    SetExtraHttpHeadersParams,
21};
22use chromiumoxide_cdp::cdp::browser_protocol::{
23    fetch::{
24        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
25        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
26    },
27    network::SetBypassServiceWorkerParams,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::{
30    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
31};
32use chromiumoxide_types::{Command, Method, MethodId};
33use hashbrown::{HashMap, HashSet};
34use lazy_static::lazy_static;
35use reqwest::header::PROXY_AUTHORIZATION;
36use spider_network_blocker::intercept_manager::NetworkInterceptManager;
37pub use spider_network_blocker::scripts::{
38    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
39};
40use std::borrow::Cow;
41use std::collections::VecDeque;
42use std::time::Duration;
43
44lazy_static! {
45    /// General patterns for popular libraries and resources
46    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
47        "jquery",           // Covers jquery.min.js, jquery.js, etc.
48        "angular",
49        "react",            // Covers all React-related patterns
50        "vue",              // Covers all Vue-related patterns
51        "bootstrap",
52        "d3",
53        "lodash",
54        "ajax",
55        "application",
56        "app",              // Covers general app scripts like app.js
57        "main",
58        "index",
59        "bundle",
60        "vendor",
61        "runtime",
62        "polyfill",
63        "scripts",
64        "es2015.",
65        "es2020.",
66        "webpack",
67        "captcha",
68        "client",
69        "/cdn-cgi/challenge-platform/",
70        "/wp-content/js/",  // Covers Wordpress content
71        // Verified 3rd parties for request
72        "https://m.stripe.network/",
73        "https://challenges.cloudflare.com/",
74        "https://www.google.com/recaptcha/",
75        "https://google.com/recaptcha/api.js",
76        "https://www.gstatic.com/recaptcha/",
77        "https://captcha.px-cloud.net/",
78        "https://geo.captcha-delivery.com/",
79        "https://api.leminnow.com/captcha/",
80        "https://cdn.auth0.com/js/lock/",
81        "https://captcha.gtimg.com",
82        "https://client-api.arkoselabs.com/",
83        "https://www.capy.me/puzzle/",
84        "https://newassets.hcaptcha.com/",
85        "https://cdn.auth0.com/client",
86        "https://js.stripe.com/",
87        "https://cdn.prod.website-files.com/", // webflow cdn scripts
88        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
89        "https://code.jquery.com/jquery-"
90    ];
91
92    /// Determine if a script should be rendered in the browser by name.
93    ///
94    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
95    /// but we keep it for compatibility and other call sites.
96    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
97
98    /// General patterns for popular libraries and resources
99    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
100        // Verified 3rd parties for request
101        "https://m.stripe.network/",
102        "https://challenges.cloudflare.com/",
103        "https://js.stripe.com/",
104        "https://cdn.prod.website-files.com/", // webflow cdn scripts
105        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
106        "https://code.jquery.com/jquery-",
107        "https://ct.captcha-delivery.com/",
108        "https://geo.captcha-delivery.com/",
109        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
110        "https://cdn.auth0.com/client",
111        "https://captcha.px-cloud.net/",
112        "https://www.capy.me/puzzle/",
113        "https://www.gstatic.com/recaptcha/",
114        "https://google.com/recaptcha/",
115        "https://www.google.com/recaptcha/",
116        "https://www.recaptcha.net/recaptcha/",
117        "https://js.hcaptcha.com/1/api.js",
118        "https://hcaptcha.com/1/api.js",
119        "https://js.datadome.co/tags.js",
120        "https://api-js.datadome.co/",
121        "https://client.perimeterx.net/",
122        "https://captcha.px-cdn.net/",
123        "https://newassets.hcaptcha.com/",
124        "https://captcha.px-cloud.net/",
125        "https://s.perimeterx.net/",
126        "https://api.leminnow.com/captcha/",
127        "https://client-api.arkoselabs.com/",
128        "https://static.geetest.com/v4/gt4.js",
129        "https://static.geetest.com/",
130        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
131        "https://cdn.perfdrive.com/aperture/",
132        "https://assets.queue-it.net/",
133        "discourse-cdn.com/",
134        "hcaptcha.com",
135        "/cdn-cgi/challenge-platform/",
136        "/_Incapsula_Resource"
137    ];
138
139    /// Determine if a script should be rendered in the browser by name.
140    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
141
142    /// path of a js framework
143    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
144        phf::phf_set! {
145            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
146            "_astro/", "_app/immutable"
147        }
148    };
149
150    /// Ignore the content types.
151    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
152        "application/pdf",
153        "application/zip",
154        "application/x-rar-compressed",
155        "application/x-tar",
156        "image/png",
157        "image/jpeg",
158        "image/gif",
159        "image/bmp",
160        "image/webp",
161        "image/svg+xml",
162        "video/mp4",
163        "video/x-msvideo",
164        "video/x-matroska",
165        "video/webm",
166        "audio/mpeg",
167        "audio/ogg",
168        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
169        "application/vnd.ms-excel",
170        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
171        "application/vnd.ms-powerpoint",
172        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
173        "application/x-7z-compressed",
174        "application/x-rpm",
175        "application/x-shockwave-flash",
176        "application/rtf",
177    };
178
179    /// Ignore the resources for visual content types.
180    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
181        "Image",
182        "Media",
183        "Font"
184    };
185
186    /// Ignore the resources for visual content types.
187    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
188        "CspViolationReport",
189        "Ping",
190    };
191
192    /// Case insenstive css matching
193    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
194
195    /// The command chain.
196    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
197        let enable = EnableParams::default();
198
199        if let Ok(c) = serde_json::to_value(&enable) {
200            vec![(enable.identifier(), c)]
201        } else {
202            vec![]
203        }
204    };
205
206    /// The command chain with https ignore.
207    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
208        let enable = EnableParams::default();
209        let mut v = vec![];
210        if let Ok(c) = serde_json::to_value(&enable) {
211            v.push((enable.identifier(), c));
212        }
213        let ignore = SetIgnoreCertificateErrorsParams::new(true);
214        if let Ok(ignored) = serde_json::to_value(&ignore) {
215            v.push((ignore.identifier(), ignored));
216        }
217
218        v
219    };
220
221    /// Enable the fetch intercept command
222    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
223        fetch::EnableParams::builder()
224        .handle_auth_requests(true)
225        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
226        .build()
227    };
228}
229
/// Returns `true` when `status` is one of the HTTP redirect codes
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
234
235#[derive(Debug)]
236/// The base network manager.
237pub struct NetworkManager {
238    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
239    ///
240    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
241    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
242    /// Consumers pull from this queue via `poll()`.
243    queued_events: VecDeque<NetworkEvent>,
244    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
245    ///
246    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
247    /// certificates (self-signed, expired, MITM proxies, etc.).
248    ignore_httpserrors: bool,
249    /// Active in-flight requests keyed by CDP `RequestId`.
250    ///
251    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
252    /// and final state used to emit `RequestFinished` / `RequestFailed`.
253    requests: HashMap<RequestId, HttpRequest>,
254    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
255    /// `Fetch.requestPaused` arrives later (or vice versa).
256    ///
257    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
258    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
259    // TODO put event in an Arc?
260    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
261    /// Extra HTTP headers to apply to subsequent network requests via CDP.
262    ///
263    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
264    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
265    extra_headers: std::collections::HashMap<String, String>,
266    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
267    ///
268    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
269    /// store the interception id here so it can be attached to the `HttpRequest` once the
270    /// network request is observed.
271    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
272    /// Whether the user has disabled the browser cache.
273    ///
274    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
275    /// `set_cache_enabled()`. Internally the field is stored as “disabled” to match the CDP API.
276    user_cache_disabled: bool,
277    /// Tracks which requests have already attempted authentication.
278    ///
279    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
280    /// authentication challenges (407/401). Once a request id is present here, subsequent
281    /// challenges for the same request are canceled.
282    attempted_authentications: HashSet<RequestId>,
283    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
284    ///
285    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
286    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
287    credentials: Option<Credentials>,
288    /// User-facing toggle indicating whether request interception is desired.
289    ///
290    /// This is the “intent” flag controlled by `set_request_interception()`. On its own it does
291    /// not guarantee interception is active; interception is actually enabled/disabled by
292    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
293    ///
294    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
295    /// enabled to satisfy auth challenges.
296    pub(crate) user_request_interception_enabled: bool,
297    /// Hard kill-switch to block all network traffic.
298    ///
299    /// When `true`, the manager immediately blocks requests (typically via
300    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
301    /// and short-circuits most decision logic. This is used for safety conditions such as
302    /// exceeding `max_bytes_allowed` or other runtime protections.
303    block_all: bool,
304    /// Tracks whether the Fetch interception protocol is currently enabled in CDP.
305    ///
306    /// This is the “actual state” flag that reflects whether we have sent `Fetch.enable` or
307    /// `Fetch.disable` to the browser. It is updated by `update_protocol_request_interception()`
308    /// when `user_request_interception_enabled` or `credentials` change.
309    pub(crate) protocol_request_interception_enabled: bool,
310    /// The network is offline.
311    offline: bool,
312    /// The page request timeout.
313    pub request_timeout: Duration,
314    // made_request: bool,
315    /// Ignore visuals (no pings, prefetching, and etc).
316    pub ignore_visuals: bool,
317    /// Block CSS stylesheets.
318    pub block_stylesheets: bool,
319    /// Block javascript that is not critical to rendering.
320    ///
321    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
322    /// by itself (it remains for config compatibility).
323    pub block_javascript: bool,
324    /// Block analytics from rendering
325    pub block_analytics: bool,
326    /// Block pre-fetch request
327    pub block_prefetch: bool,
328    /// Only html from loading.
329    pub only_html: bool,
330    /// Is xml document?
331    pub xml_document: bool,
332    /// The custom intercept handle logic to run on the website.
333    pub intercept_manager: NetworkInterceptManager,
334    /// Track the amount of times the document reloaded.
335    pub document_reload_tracker: u8,
336    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
337    pub document_target_url: String,
338    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
339    pub document_target_domain: String,
340    /// The max bytes to receive.
341    pub max_bytes_allowed: Option<u64>,
342    #[cfg(feature = "_cache")]
343    /// The cache site_key to use.
344    pub cache_site_key: Option<String>,
345    /// The cache policy to use.
346    #[cfg(feature = "_cache")]
347    pub cache_policy: Option<BasicCachePolicy>,
348    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
349    whitelist_patterns: Vec<String>,
350    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
351    whitelist_matcher: Option<AhoCorasick>,
352    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
353    blacklist_patterns: Vec<String>,
354    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
355    blacklist_matcher: Option<AhoCorasick>,
356    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
357    blacklist_strict: bool,
358}
359
360impl NetworkManager {
361    /// A new network manager.
362    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
363        Self {
364            queued_events: Default::default(),
365            ignore_httpserrors,
366            requests: Default::default(),
367            requests_will_be_sent: Default::default(),
368            extra_headers: Default::default(),
369            request_id_to_interception_id: Default::default(),
370            user_cache_disabled: false,
371            attempted_authentications: Default::default(),
372            credentials: None,
373            block_all: false,
374            user_request_interception_enabled: false,
375            protocol_request_interception_enabled: false,
376            offline: false,
377            request_timeout,
378            ignore_visuals: false,
379            block_javascript: false,
380            block_stylesheets: false,
381            block_prefetch: true,
382            block_analytics: true,
383            only_html: false,
384            xml_document: false,
385            intercept_manager: NetworkInterceptManager::Unknown,
386            document_reload_tracker: 0,
387            document_target_url: String::new(),
388            document_target_domain: String::new(),
389            whitelist_patterns: Vec::new(),
390            whitelist_matcher: None,
391            blacklist_patterns: Vec::new(),
392            blacklist_matcher: None,
393            blacklist_strict: true,
394            max_bytes_allowed: None,
395            #[cfg(feature = "_cache")]
396            cache_site_key: None,
397            #[cfg(feature = "_cache")]
398            cache_policy: None,
399        }
400    }
401
402    /// Replace the whitelist patterns (compiled once).
403    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
404    where
405        I: IntoIterator<Item = S>,
406        S: Into<String>,
407    {
408        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
409        self.rebuild_whitelist_matcher();
410    }
411
412    /// Replace the blacklist patterns (compiled once).
413    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
414    where
415        I: IntoIterator<Item = S>,
416        S: Into<String>,
417    {
418        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
419        self.rebuild_blacklist_matcher();
420    }
421
422    /// Add one pattern (cheap) and rebuild (call this sparingly).
423    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
424        self.blacklist_patterns.push(pattern.into());
425        self.rebuild_blacklist_matcher();
426    }
427
428    /// Add many patterns and rebuild once.
429    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
430    where
431        I: IntoIterator<Item = S>,
432        S: Into<String>,
433    {
434        self.blacklist_patterns
435            .extend(patterns.into_iter().map(Into::into));
436        self.rebuild_blacklist_matcher();
437    }
438
439    /// Clear blacklist entirely.
440    pub fn clear_blacklist(&mut self) {
441        self.blacklist_patterns.clear();
442        self.blacklist_matcher = None;
443    }
444
445    /// Control precedence: when true, blacklist always wins.
446    pub fn set_blacklist_strict(&mut self, strict: bool) {
447        self.blacklist_strict = strict;
448    }
449
450    #[inline]
451    fn rebuild_blacklist_matcher(&mut self) {
452        if self.blacklist_patterns.is_empty() {
453            self.blacklist_matcher = None;
454            return;
455        }
456
457        self.blacklist_matcher =
458            AhoCorasick::new(self.blacklist_patterns.iter().map(|s| s.as_str())).ok();
459    }
460
461    #[inline]
462    fn is_blacklisted(&self, url: &str) -> bool {
463        self.blacklist_matcher
464            .as_ref()
465            .map(|m| m.is_match(url))
466            .unwrap_or(false)
467    }
468
469    /// Add one pattern (cheap) and rebuild (call this sparingly).
470    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
471        self.whitelist_patterns.push(pattern.into());
472        self.rebuild_whitelist_matcher();
473    }
474
475    /// Add many patterns and rebuild once.
476    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
477    where
478        I: IntoIterator<Item = S>,
479        S: Into<String>,
480    {
481        self.whitelist_patterns
482            .extend(patterns.into_iter().map(Into::into));
483        self.rebuild_whitelist_matcher();
484    }
485
486    #[inline]
487    fn rebuild_whitelist_matcher(&mut self) {
488        if self.whitelist_patterns.is_empty() {
489            self.whitelist_matcher = None;
490            return;
491        }
492
493        // If building fails (shouldn’t for simple patterns), just disable matcher.
494        self.whitelist_matcher =
495            AhoCorasick::new(self.whitelist_patterns.iter().map(|s| s.as_str())).ok();
496    }
497
498    #[inline]
499    fn is_whitelisted(&self, url: &str) -> bool {
500        self.whitelist_matcher
501            .as_ref()
502            .map(|m| m.is_match(url))
503            .unwrap_or(false)
504    }
505
506    /// Commands to init the chain with.
507    pub fn init_commands(&self) -> CommandChain {
508        let cmds = if self.ignore_httpserrors {
509            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
510        } else {
511            INIT_CHAIN.clone()
512        };
513        CommandChain::new(cmds, self.request_timeout)
514    }
515
516    /// Push the CDP request.
517    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
518        let method = cmd.identifier();
519        if let Ok(params) = serde_json::to_value(cmd) {
520            self.queued_events
521                .push_back(NetworkEvent::SendCdpRequest((method, params)));
522        }
523    }
524
525    /// The next event to handle.
526    pub fn poll(&mut self) -> Option<NetworkEvent> {
527        self.queued_events.pop_front()
528    }
529
530    /// Get the extra headers.
531    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
532        &self.extra_headers
533    }
534
535    /// Set extra HTTP headers.
536    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
537        self.extra_headers = headers;
538        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
539        self.extra_headers.remove("Proxy-Authorization");
540        if !self.extra_headers.is_empty() {
541            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
542                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
543            }
544        }
545    }
546
547    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
548        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
549    }
550
551    pub fn set_block_all(&mut self, block_all: bool) {
552        self.block_all = block_all;
553    }
554
555    pub fn set_request_interception(&mut self, enabled: bool) {
556        self.user_request_interception_enabled = enabled;
557        self.update_protocol_request_interception();
558    }
559
560    pub fn set_cache_enabled(&mut self, enabled: bool) {
561        let run = self.user_cache_disabled == enabled;
562        self.user_cache_disabled = !enabled;
563        if run {
564            self.update_protocol_cache_disabled();
565        }
566    }
567
568    /// Enable fetch interception.
569    pub fn enable_request_intercept(&mut self) {
570        self.protocol_request_interception_enabled = true;
571    }
572
573    /// Disable fetch interception.
574    pub fn disable_request_intercept(&mut self) {
575        self.protocol_request_interception_enabled = false;
576    }
577
578    /// Set the cache site key.
579    #[cfg(feature = "_cache")]
580    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
581        self.cache_site_key = cache_site_key;
582    }
583
584    /// Set the cache policy.
585    #[cfg(feature = "_cache")]
586    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
587        self.cache_policy = cache_policy;
588    }
589
590    pub fn update_protocol_cache_disabled(&mut self) {
591        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
592    }
593
594    pub fn authenticate(&mut self, credentials: Credentials) {
595        self.credentials = Some(credentials);
596        self.update_protocol_request_interception();
597        self.protocol_request_interception_enabled = true;
598    }
599
600    fn update_protocol_request_interception(&mut self) {
601        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
602
603        if enabled == self.protocol_request_interception_enabled {
604            return;
605        }
606
607        if enabled {
608            self.push_cdp_request(ENABLE_FETCH.clone())
609        } else {
610            self.push_cdp_request(DisableParams::default())
611        }
612    }
613
614    /// Blocklist-only script blocking.
615    /// Returns true only when the URL matches an explicit blocklist condition.
616    #[inline]
617    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
618        // If analytics blocking is off, skip all analytics tries.
619        let block_analytics = self.block_analytics;
620
621        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
622        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
623        {
624            return true;
625        }
626
627        // 2) Custom website block list (explicit).
628        if crate::handler::blockers::block_websites::block_website(url) {
629            return true;
630        }
631
632        // 3) Path-based explicit tries / fallbacks.
633        //
634        // We run these on:
635        // - path with leading slash ("/js/app.js")
636        // - path without leading slash ("js/app.js")
637        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
638        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
639            // Remove query/fragment so matching stays stable.
640            let p_slash = Self::strip_query_fragment(path_with_slash);
641            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
642
643            // Basename for filename-only lists.
644            let base = match p_slash.rsplit('/').next() {
645                Some(b) => b,
646                None => p_slash,
647            };
648
649            // ---- Trie checks ----
650            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
651            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
652                return true;
653            }
654            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
655                return true;
656            }
657            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
658                return true;
659            }
660
661            // Base-path ignore tries (framework noise / known ignorable script paths).
662            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
663            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
664                return true;
665            }
666
667            // Style path ignores only when visuals are ignored.
668            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
669                return true;
670            }
671        }
672
673        false
674    }
675
676    /// Extract the absolute URL path portion WITH the leading slash.
677    ///
678    /// Example:
679    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
680    #[inline]
681    fn url_path_with_leading_slash(url: &str) -> Option<&str> {
682        // find scheme separator
683        let idx = url.find("//")?;
684        let after_slashes = idx + 2;
685
686        // find first slash after host
687        let slash_rel = url[after_slashes..].find('/')?;
688        let slash_idx = after_slashes + slash_rel;
689
690        if slash_idx < url.len() {
691            Some(&url[slash_idx..])
692        } else {
693            None
694        }
695    }
696
697    /// Strip query string and fragment from a path-ish string.
698    ///
699    /// Example:
700    /// - "/a/b.js?x=1#y" -> "/a/b.js"
701    #[inline]
702    fn strip_query_fragment(s: &str) -> &str {
703        let q = s.find('?');
704        let h = s.find('#');
705
706        match (q, h) {
707            (None, None) => s,
708            (Some(i), None) => &s[..i],
709            (None, Some(i)) => &s[..i],
710            (Some(i), Some(j)) => &s[..i.min(j)],
711        }
712    }
713
714    /// Determine if the request should be skipped.
715    #[inline]
716    fn skip_xhr(
717        &self,
718        skip_networking: bool,
719        event: &EventRequestPaused,
720        network_event: bool,
721    ) -> bool {
722        // XHR check
723        if !skip_networking && network_event {
724            let request_url = event.request.url.as_str();
725
726            // check if part of ignore scripts.
727            let skip_analytics =
728                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
729
730            if skip_analytics {
731                true
732            } else if self.block_stylesheets || self.ignore_visuals {
733                let block_css = self.block_stylesheets;
734                let block_media = self.ignore_visuals;
735
736                let mut block_request = false;
737
738                if let Some(position) = request_url.rfind('.') {
739                    let hlen = request_url.len();
740                    let has_asset = hlen - position;
741
742                    if has_asset >= 3 {
743                        let next_position = position + 1;
744
745                        if block_media
746                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
747                                &request_url[next_position..].into(),
748                            )
749                        {
750                            block_request = true;
751                        } else if block_css {
752                            block_request = CaseInsensitiveString::from(
753                                &request_url.as_bytes()[next_position..],
754                            )
755                            .contains(&**CSS_EXTENSION)
756                        }
757                    }
758                }
759
760                if !block_request {
761                    block_request = ignore_script_xhr_media(request_url);
762                }
763
764                block_request
765            } else {
766                skip_networking
767            }
768        } else {
769            skip_networking
770        }
771    }
772
773    #[cfg(feature = "adblock")]
774    #[inline]
775    /// Detect if ad enabled.
776    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
777        if skip_networking {
778            true
779        } else {
780            block_ads(&event.request.url) || self.detect_ad(event)
781        }
782    }
783
784    /// When adblock feature is disabled, this is a no-op.
785    #[cfg(not(feature = "adblock"))]
786    #[inline]
787    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
788        use crate::handler::blockers::block_websites::block_ads;
789        if skip_networking {
790            true
791        } else {
792            block_ads(&event.request.url)
793        }
794    }
795
796    #[inline]
797    /// Fail request
798    fn fail_request_blocked(
799        &mut self,
800        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
801    ) {
802        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
803            request_id.clone(),
804            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
805        );
806        self.push_cdp_request(params);
807    }
808
809    #[inline]
810    /// Fulfill request
811    fn fulfill_request_empty_200(
812        &mut self,
813        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
814    ) {
815        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
816            request_id.clone(),
817            200,
818        );
819        self.push_cdp_request(params);
820    }
821
822    #[cfg(feature = "_cache")]
823    #[inline]
824    /// Fulfill a paused Fetch request from cached bytes + header map.
825    ///
826    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
827    fn fulfill_request_from_cache(
828        &mut self,
829        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
830        body: &[u8],
831        headers: &std::collections::HashMap<String, String>,
832        status: i64,
833    ) {
834        use crate::cdp::browser_protocol::fetch::HeaderEntry;
835        use crate::handler::network::fetch::FulfillRequestParams;
836        use base64::Engine;
837
838        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
839
840        for (k, v) in headers.iter() {
841            resp_headers.push(HeaderEntry {
842                name: k.clone().into(),
843                value: v.clone().into(),
844            });
845        }
846
847        let mut params = FulfillRequestParams::new(request_id.clone(), status);
848
849        // TODO: have this already encoded prior.
850        params.body = Some(
851            base64::engine::general_purpose::STANDARD
852                .encode(body)
853                .into(),
854        );
855
856        params.response_headers = Some(resp_headers);
857
858        self.push_cdp_request(params);
859    }
860
861    #[inline]
862    /// Continue the request url.
863    fn continue_request_with_url(
864        &mut self,
865        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
866        url: Option<&str>,
867        intercept_response: bool,
868    ) {
869        let mut params = ContinueRequestParams::new(request_id.clone());
870        if let Some(url) = url {
871            params.url = Some(url.to_string());
872            params.intercept_response = Some(intercept_response);
873        }
874        self.push_cdp_request(params);
875    }
876
    /// Handle a `Fetch.requestPaused` event and decide the fate of the request.
    ///
    /// The request ends up in exactly one of these states:
    /// - left untouched (early return) when both user and protocol request
    ///   interception are enabled,
    /// - failed with `BlockedByClient` when `block_all` is set,
    /// - fulfilled with an empty `200` when the decision chain below marks it
    ///   as blocked (`skip_networking == true`),
    /// - fulfilled from the session cache (feature `_cache`) when a cached item
    ///   exists and the cache policy allows it,
    /// - continued, with a rewritten URL when a document replacement was found.
    ///
    /// The order of the checks matters: later steps may overturn earlier ones
    /// (allow-lists clear the flag, a strict blacklist sets it again last).
    #[inline]
    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
        // No command is queued on this path.
        // NOTE(review): presumably the user-side interception resolves the request — confirm.
        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
            return;
        }

        if self.block_all {
            tracing::debug!(
                "Blocked (block_all): {:?} - {}",
                event.resource_type,
                event.request.url
            );
            return self.fail_request_blocked(&event.request_id);
        }

        // Pair this paused request with its `requestWillBeSent` event: if that event
        // was stored already, process it now with the interception id; otherwise
        // remember the interception id so `on_request_will_be_sent` can pick it up.
        if let Some(network_id) = event.network_id.as_ref() {
            if let Some(request_will_be_sent) =
                self.requests_will_be_sent.remove(network_id.as_ref())
            {
                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
            } else {
                self.request_id_to_interception_id
                    .insert(network_id.clone(), event.request_id.clone().into());
            }
        }

        // From here on, we handle the full decision tree.
        let javascript_resource = event.resource_type == ResourceType::Script;
        let document_resource = event.resource_type == ResourceType::Document;
        let network_resource =
            !document_resource && crate::utils::is_data_resource(&event.resource_type);

        // Start with static / cheap skip checks.
        // `self.block_all` is always false here because of the early return above.
        let mut skip_networking =
            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());

        // NOTE(review): prefetch requests are blocked when `block_prefetch` is *false*,
        // which reads inverted relative to the flag name — confirm the intended semantics.
        if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
            skip_networking = true;
        }

        // Also short-circuit if we've reloaded this document too many times.
        if !skip_networking {
            skip_networking = self.document_reload_tracker >= 3;
        }

        // Handle document redirect / masking and track xml documents.
        // `current_url` is the replacement URL when one was computed, else the request URL.
        let (current_url_cow, had_replacer) =
            self.handle_document_replacement_and_tracking(event, document_resource);

        let current_url: &str = current_url_cow.as_ref();

        let blacklisted = self.is_blacklisted(current_url);

        // Non-strict blacklist: block now, but the allow-lists further down may still
        // clear the flag again.
        if !self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if !skip_networking {
            // Allow XSL for sitemap XML.
            // (The assignment is a no-op inside this branch; it only bypasses the
            // visuals/stylesheet check for `.xsl` URLs.)
            if self.xml_document && current_url.ends_with(".xsl") {
                skip_networking = false;
            } else {
                skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
            }
        }

        // Keeps an existing block decision; otherwise runs the ad checks.
        skip_networking = self.detect_ad_if_enabled(event, skip_networking);

        // Ignore embedded scripts when only_html or ignore_visuals is set.
        if !skip_networking
            && self.block_javascript
            && (self.only_html || self.ignore_visuals)
            && (javascript_resource || document_resource)
        {
            skip_networking = ignore_script_embedded(current_url);
        }

        // Script policy: allow-by-default.
        // Block only if explicit block list patterns match.
        if !skip_networking && javascript_resource {
            skip_networking = self.should_block_script_blocklist_only(current_url);
        }

        // XHR / data resources.
        skip_networking = self.skip_xhr(skip_networking, event, network_resource);

        // Custom interception layer.
        if !skip_networking && (javascript_resource || network_resource || document_resource) {
            skip_networking = self.intercept_manager.intercept_detection(
                current_url,
                self.ignore_visuals,
                network_resource,
            );
        }

        // Custom website block list.
        if !skip_networking && (javascript_resource || network_resource) {
            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
        }

        // Allow-list for known third-party scripts: only relevant once something
        // above decided to block the script.
        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
        {
            skip_networking = false;
        }

        // check if the url is in the whitelist.
        if skip_networking && self.is_whitelisted(current_url) {
            skip_networking = false;
        }

        // Strict blacklist is applied last so that no allow-list can override it.
        if self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if skip_networking {
            tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
            self.fulfill_request_empty_200(&event.request_id);
        } else {
            #[cfg(feature = "_cache")]
            {
                if let (Some(policy), Some(cache_site_key)) =
                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
                {
                    // Cache key is "<method>:<url>"; this shadows `current_url` inside the block only.
                    let current_url = format!("{}:{}", event.request.method, &current_url);

                    if let Some((res, cache_policy)) =
                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
                    {
                        if policy.allows_cached(&cache_policy) {
                            tracing::debug!(
                                "Remote Cached: {:?} - {}",
                                &event.resource_type,
                                &current_url
                            );
                            let flat_headers = crate::http::headers_from_multi(&res.headers);
                            return self.fulfill_request_from_cache(
                                &event.request_id,
                                &res.body,
                                &flat_headers,
                                res.status as i64,
                            );
                        }
                    }
                }
            }

            // No cached response was used: let the request continue.
            // The URL is only overridden when a replacement was computed; note that
            // `continue_request_with_url` ignores `intercept_response` when the URL is `None`.
            tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
            self.continue_request_with_url(
                &event.request_id,
                if had_replacer {
                    Some(current_url)
                } else {
                    None
                },
                !had_replacer,
            );
        }
    }
1039
1040    /// Shared "visuals + basic blocking" logic.
1041    ///
1042    /// IMPORTANT: Scripts are NOT blocked here anymore.
1043    /// Scripts are allowed by default and only blocked via explicit blocklists
1044    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1045    #[inline]
1046    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1047        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1048            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1049    }
1050
1051    /// Does the network manager have a target domain?
1052    pub fn has_target_domain(&self) -> bool {
1053        !self.document_target_url.is_empty()
1054    }
1055
1056    /// Set the target page url for tracking.
1057    pub fn set_page_url(&mut self, page_target_url: String) {
1058        let host_base = host_and_rest(&page_target_url)
1059            .map(|(h, _)| base_domain_from_host(h))
1060            .unwrap_or("");
1061
1062        self.document_target_domain = host_base.to_string();
1063        self.document_target_url = page_target_url;
1064    }
1065
1066    /// Clear the initial target domain on every navigation.
1067    pub fn clear_target_domain(&mut self) {
1068        self.document_reload_tracker = 0;
1069        self.document_target_url = Default::default();
1070        self.document_target_domain = Default::default();
1071    }
1072
1073    /// Handles:
1074    /// - document reload tracking (`document_reload_tracker`)
1075    /// - redirect masking / replacement
1076    /// - xml document detection (`xml_document`)
1077    /// - `document_target_url` updates
1078    ///
1079    /// Returns (current_url, had_replacer).
1080    #[inline]
1081    fn handle_document_replacement_and_tracking<'a>(
1082        &mut self,
1083        event: &'a EventRequestPaused,
1084        document_resource: bool,
1085    ) -> (Cow<'a, str>, bool) {
1086        let mut replacer: Option<String> = None;
1087        let current_url = event.request.url.as_str();
1088
1089        if document_resource {
1090            if self.document_target_url == current_url {
1091                self.document_reload_tracker += 1;
1092            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1093            {
1094                let (http_document_replacement, mut https_document_replacement) =
1095                    if self.document_target_url.starts_with("http://") {
1096                        (
1097                            self.document_target_url.replacen("http://", "http//", 1),
1098                            self.document_target_url.replacen("http://", "https://", 1),
1099                        )
1100                    } else {
1101                        (
1102                            self.document_target_url.replacen("https://", "https//", 1),
1103                            self.document_target_url.replacen("https://", "http://", 1),
1104                        )
1105                    };
1106
1107                // Track trailing slash to restore later.
1108                let trailing = https_document_replacement.ends_with('/');
1109                if trailing {
1110                    https_document_replacement.pop();
1111                }
1112                if https_document_replacement.ends_with('/') {
1113                    https_document_replacement.pop();
1114                }
1115
1116                let redirect_mask = format!(
1117                    "{}{}",
1118                    https_document_replacement, http_document_replacement
1119                );
1120
1121                if current_url == redirect_mask {
1122                    replacer = Some(if trailing {
1123                        format!("{}/", https_document_replacement)
1124                    } else {
1125                        https_document_replacement
1126                    });
1127                }
1128            }
1129
1130            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1131                self.xml_document = true;
1132            }
1133
1134            // Track last seen document URL.
1135            self.document_target_url = event.request.url.clone();
1136            self.document_target_domain = host_and_rest(&self.document_target_url)
1137                .map(|(h, _)| base_domain_from_host(h).to_string())
1138                .unwrap_or_default();
1139        }
1140
1141        let current_url_cow = match replacer {
1142            Some(r) => Cow::Owned(r),
1143            None => Cow::Borrowed(event.request.url.as_str()),
1144        };
1145
1146        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1147        (current_url_cow, had_replacer)
1148    }
1149
1150    /// Perform a page intercept for chrome
1151    #[cfg(feature = "adblock")]
1152    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1153        use adblock::{
1154            lists::{FilterSet, ParseOptions, RuleTypes},
1155            Engine,
1156        };
1157
1158        lazy_static::lazy_static! {
1159            static ref AD_ENGINE: Engine = {
1160                let mut filter_set = FilterSet::new(false);
1161                let mut rules = ParseOptions::default();
1162                rules.rule_types = RuleTypes::All;
1163
1164                filter_set.add_filters(
1165                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1166                    rules,
1167                );
1168
1169                Engine::from_filter_set(filter_set, true)
1170            };
1171        };
1172
1173        let blockable = ResourceType::Image == event.resource_type
1174            || event.resource_type == ResourceType::Media
1175            || event.resource_type == ResourceType::Stylesheet
1176            || event.resource_type == ResourceType::Document
1177            || event.resource_type == ResourceType::Fetch
1178            || event.resource_type == ResourceType::Xhr;
1179
1180        let u = &event.request.url;
1181
1182        let block_request = blockable
1183            // set it to example.com for 3rd party handling is_same_site
1184        && {
1185            let request = adblock::request::Request::preparsed(
1186                 &u,
1187                 "example.com",
1188                 "example.com",
1189                 &event.resource_type.as_ref().to_lowercase(),
1190                 !event.request.is_same_site.unwrap_or_default());
1191
1192            AD_ENGINE.check_network_request(&request).matched
1193        };
1194
1195        block_request
1196    }
1197
1198    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1199        let response = if self
1200            .attempted_authentications
1201            .contains(event.request_id.as_ref())
1202        {
1203            AuthChallengeResponseResponse::CancelAuth
1204        } else if self.credentials.is_some() {
1205            self.attempted_authentications
1206                .insert(event.request_id.clone().into());
1207            AuthChallengeResponseResponse::ProvideCredentials
1208        } else {
1209            AuthChallengeResponseResponse::Default
1210        };
1211
1212        let mut auth = AuthChallengeResponse::new(response);
1213        if let Some(creds) = self.credentials.clone() {
1214            auth.username = Some(creds.username);
1215            auth.password = Some(creds.password);
1216        }
1217        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1218    }
1219
1220    /// Set the page offline network emulation condition.
1221    pub fn set_offline_mode(&mut self, value: bool) {
1222        if self.offline == value {
1223            return;
1224        }
1225        self.offline = value;
1226        if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1227            .offline(self.offline)
1228            .matched_network_condition(
1229                NetworkConditions::builder()
1230                    .url_pattern("")
1231                    .latency(0)
1232                    .download_throughput(-1.)
1233                    .upload_throughput(-1.)
1234                    .build()
1235                    .unwrap(),
1236            )
1237            .build()
1238        {
1239            self.push_cdp_request(network);
1240        }
1241    }
1242
1243    /// Request interception doesn't happen for data URLs with Network Service.
1244    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1245        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1246            if let Some(interception_id) = self
1247                .request_id_to_interception_id
1248                .remove(event.request_id.as_ref())
1249            {
1250                self.on_request(event, Some(interception_id));
1251            } else {
1252                // TODO remove the clone for event
1253                self.requests_will_be_sent
1254                    .insert(event.request_id.clone(), event.clone());
1255            }
1256        } else {
1257            self.on_request(event, None);
1258        }
1259    }
1260
1261    /// The request was served from the cache.
1262    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1263        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1264            request.from_memory_cache = true;
1265        }
1266    }
1267
1268    /// On network response received.
1269    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1270        let mut request_failed = false;
1271
1272        // Track how many bytes we actually deducted from this target.
1273        let mut deducted: u64 = 0;
1274
1275        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1276            let before = *max_bytes;
1277
1278            // encoded_data_length -> saturating cast to u64
1279            let received_bytes: u64 = event.response.encoded_data_length as u64;
1280
1281            // Safe parse of Content-Length
1282            let content_length: Option<u64> = event
1283                .response
1284                .headers
1285                .inner()
1286                .get("content-length")
1287                .and_then(|v| v.as_str())
1288                .and_then(|s| s.trim().parse::<u64>().ok());
1289
1290            // Deduct what we actually received
1291            *max_bytes = max_bytes.saturating_sub(received_bytes);
1292
1293            // If the declared size can't fit, zero out now
1294            if let Some(cl) = content_length {
1295                if cl > *max_bytes {
1296                    *max_bytes = 0;
1297                }
1298            }
1299
1300            request_failed = *max_bytes == 0;
1301
1302            // Compute exact delta deducted on this event
1303            deducted = before.saturating_sub(*max_bytes);
1304        }
1305
1306        // Bubble up the deduction (even if request continues)
1307        if deducted > 0 {
1308            self.queued_events
1309                .push_back(NetworkEvent::BytesConsumed(deducted));
1310        }
1311
1312        // block all network request moving forward.
1313        if request_failed && self.max_bytes_allowed.is_some() {
1314            self.set_block_all(true);
1315        }
1316
1317        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1318            request.set_response(event.response.clone());
1319            self.queued_events.push_back(if request_failed {
1320                NetworkEvent::RequestFailed(request)
1321            } else {
1322                NetworkEvent::RequestFinished(request)
1323            });
1324        }
1325    }
1326
1327    /// On network loading finished.
1328    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1329        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1330            if let Some(interception_id) = request.interception_id.as_ref() {
1331                self.attempted_authentications
1332                    .remove(interception_id.as_ref());
1333            }
1334            self.queued_events
1335                .push_back(NetworkEvent::RequestFinished(request));
1336        }
1337    }
1338
1339    /// On network loading failed.
1340    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1341        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1342            request.failure_text = Some(event.error_text.clone());
1343            if let Some(interception_id) = request.interception_id.as_ref() {
1344                self.attempted_authentications
1345                    .remove(interception_id.as_ref());
1346            }
1347            self.queued_events
1348                .push_back(NetworkEvent::RequestFailed(request));
1349        }
1350    }
1351
1352    /// On request will be sent.
1353    fn on_request(
1354        &mut self,
1355        event: &EventRequestWillBeSent,
1356        interception_id: Option<InterceptionId>,
1357    ) {
1358        let mut redirect_chain = Vec::new();
1359        let mut redirect_location = None;
1360
1361        if let Some(redirect_resp) = &event.redirect_response {
1362            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1363                if is_redirect_status(redirect_resp.status) {
1364                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1365                        if redirect_resp.url != location {
1366                            let fixed_location = location.replace(&redirect_resp.url, "");
1367
1368                            if !fixed_location.is_empty() {
1369                                if let Some(resp) = request.response.as_mut() {
1370                                    resp.headers.0["Location"] =
1371                                        serde_json::Value::String(fixed_location.clone());
1372                                }
1373                            }
1374
1375                            redirect_location = Some(fixed_location);
1376                        }
1377                    }
1378                }
1379
1380                self.handle_request_redirect(
1381                    &mut request,
1382                    if let Some(redirect_location) = redirect_location {
1383                        let mut redirect_resp = redirect_resp.clone();
1384
1385                        if !redirect_location.is_empty() {
1386                            redirect_resp.headers.0["Location"] =
1387                                serde_json::Value::String(redirect_location);
1388                        }
1389
1390                        redirect_resp
1391                    } else {
1392                        redirect_resp.clone()
1393                    },
1394                );
1395
1396                redirect_chain = std::mem::take(&mut request.redirect_chain);
1397                redirect_chain.push(request);
1398            }
1399        }
1400
1401        let request = HttpRequest::new(
1402            event.request_id.clone(),
1403            event.frame_id.clone(),
1404            interception_id,
1405            self.user_request_interception_enabled,
1406            redirect_chain,
1407        );
1408
1409        self.requests.insert(event.request_id.clone(), request);
1410        self.queued_events
1411            .push_back(NetworkEvent::Request(event.request_id.clone()));
1412    }
1413
1414    /// Handle request redirect.
1415    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1416        request.set_response(response);
1417        if let Some(interception_id) = request.interception_id.as_ref() {
1418            self.attempted_authentications
1419                .remove(interception_id.as_ref());
1420        }
1421    }
1422}
1423
/// Events produced by the network manager and queued in `queued_events`.
#[derive(Debug)]
pub enum NetworkEvent {
    /// Send a CDP request.
    // NOTE(review): presumably the method id plus its serialized parameters — confirm at the producer.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A new request started being tracked (queued by `on_request`).
    Request(RequestId),
    /// Response
    Response(RequestId),
    /// Request failed: loading failed, or the byte budget was exhausted when the
    /// response arrived.
    RequestFailed(HttpRequest),
    /// Request finished: a response was received or loading finished.
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the byte budget by a single response event.
    BytesConsumed(u64),
}
1439
/// Unit tests for the allow/block matchers of the network manager.
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    // Covers the third-party allow-list. An identical copy of this test
    // (`test_allowed_matcher_3rd_party_sanity`) was removed as a duplicate.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // A random script that should not match your block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        // Same URL in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        // In strict mode, it should still be considered blocked at decision time.
        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
        assert!(nm.blacklist_strict);
    }

    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs