Skip to main content

chromiumoxide/handler/
network.rs

1#[cfg(any(feature = "adblock", feature = "firewall"))]
2use super::blockers::block_websites::block_ads;
3use super::blockers::{
4    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
5    xhr::IGNORE_XHR_ASSETS,
6};
7use crate::auth::Credentials;
8#[cfg(feature = "_cache")]
9use crate::cache::BasicCachePolicy;
10use crate::cmd::CommandChain;
11use crate::handler::http::HttpRequest;
12use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
13use aho_corasick::AhoCorasick;
14use case_insensitive_string::CaseInsensitiveString;
15use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
16use chromiumoxide_cdp::cdp::browser_protocol::network::{
17    EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
18    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
19    InterceptionId, NetworkConditions, RequestId, ResourceType, Response, SetCacheDisabledParams,
20    SetExtraHttpHeadersParams,
21};
22use chromiumoxide_cdp::cdp::browser_protocol::{
23    fetch::{
24        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
25        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
26    },
27    network::SetBypassServiceWorkerParams,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::{
30    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
31};
32use chromiumoxide_types::{Command, Method, MethodId};
33use hashbrown::{HashMap, HashSet};
34use lazy_static::lazy_static;
35use reqwest::header::PROXY_AUTHORIZATION;
36use spider_network_blocker::intercept_manager::NetworkInterceptManager;
37pub use spider_network_blocker::scripts::{
38    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
39};
40use std::borrow::Cow;
41use std::collections::VecDeque;
42use std::time::Duration;
43
lazy_static! {
    /// Substrings that identify popular libraries and resources: framework names, common
    /// bundle/file names, and verified third-party script hosts.
    ///
    /// Compiled into [`ALLOWED_MATCHER`].
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",  // Covers Wordpress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/",
        "https://google.com/recaptcha/api.js",
        "https://www.gstatic.com/recaptcha/",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://api.leminnow.com/captcha/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://client-api.arkoselabs.com/",
        "https://www.capy.me/puzzle/",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];

    /// Substring matcher built from `JS_FRAMEWORK_ALLOW`, used to decide by name whether a
    /// script should be rendered in the browser.
    ///
    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
    /// but we keep it for compatibility and other call sites.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// Substrings that identify allowed third-party scripts (captcha / challenge providers,
    /// payment scripts, and CDNs).
    ///
    /// Compiled into [`ALLOWED_MATCHER_3RD_PARTY`].
    ///
    /// NOTE(review): "https://captcha.px-cloud.net/" is listed twice below; the second entry
    /// adds nothing to a substring match.
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.capy.me/puzzle/",
        "https://www.gstatic.com/recaptcha/",
        "https://google.com/recaptcha/",
        "https://www.google.com/recaptcha/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://newassets.hcaptcha.com/",
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://api.leminnow.com/captcha/",
        "https://client-api.arkoselabs.com/",
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "discourse-cdn.com/",
        "hcaptcha.com",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];

    /// Substring matcher built from `JS_FRAMEWORK_ALLOW_3RD_PARTY`, used to decide by name
    /// whether a third-party script should be rendered in the browser.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path fragments that identify JS framework build output.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
            "_astro/", "_app/immutable"
        }
    };

    /// MIME content types to ignore (documents, archives, images, video and audio).
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names treated as visual content (images, media, fonts).
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names treated as background networking (CSP violation reports, pings).
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Ping",
    };

    /// The `css` extension as a case-insensitive string, for stylesheet matching.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// The initial command chain: the network `EnableParams` command with default
    /// parameters. Empty if the parameters fail to serialize.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// The initial command chain that also ignores certificate errors: the network
    /// `EnableParams` command followed by `SetIgnoreCertificateErrorsParams::new(true)`.
    /// A command whose parameters fail to serialize is left out.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// The fetch-intercept enable command: handles auth requests and pauses every URL
    /// (`*`) at the request stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
        .build()
    };
}
229
/// Returns `true` when `status` is one of the HTTP redirect codes
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
234
#[derive(Debug)]
/// The base network manager.
///
/// Tracks in-flight requests, queues CDP commands and request lifecycle events, and holds
/// the resource-blocking configuration.
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    // TODO put event in an Arc?
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; `update_protocol_request_interception()` combines
    /// this flag with `credentials` to decide whether to queue `Fetch.enable` / `Fetch.disable`.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is considered enabled in CDP.
    ///
    /// `update_protocol_request_interception()` compares the desired state against this flag
    /// and returns early when they already agree; it does not write the flag itself. In the
    /// code visible here the flag is written by `enable_request_intercept()`,
    /// `disable_request_intercept()` and `authenticate()`.
    pub(crate) protocol_request_interception_enabled: bool,
    /// The network is offline.
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    /// Ignore visuals (no pings, prefetching, and etc).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics from rendering.
    pub block_analytics: bool,
    /// Block pre-fetch requests.
    pub block_prefetch: bool,
    /// Restrict loading to HTML only.
    pub only_html: bool,
    /// Whether the document is an XML document.
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the amount of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    #[cfg(feature = "_cache")]
    /// The cache site_key to use.
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for `whitelist_patterns` (rebuilt when patterns change).
    whitelist_matcher: Option<AhoCorasick>,
    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
    blacklist_patterns: Vec<String>,
    /// Compiled matcher for `blacklist_patterns` (rebuilt when patterns change).
    blacklist_matcher: Option<AhoCorasick>,
    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
    blacklist_strict: bool,
}
359
360impl NetworkManager {
361    /// A new network manager.
362    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
363        Self {
364            queued_events: Default::default(),
365            ignore_httpserrors,
366            requests: Default::default(),
367            requests_will_be_sent: Default::default(),
368            extra_headers: Default::default(),
369            request_id_to_interception_id: Default::default(),
370            user_cache_disabled: false,
371            attempted_authentications: Default::default(),
372            credentials: None,
373            block_all: false,
374            user_request_interception_enabled: false,
375            protocol_request_interception_enabled: false,
376            offline: false,
377            request_timeout,
378            ignore_visuals: false,
379            block_javascript: false,
380            block_stylesheets: false,
381            block_prefetch: true,
382            block_analytics: true,
383            only_html: false,
384            xml_document: false,
385            intercept_manager: NetworkInterceptManager::Unknown,
386            document_reload_tracker: 0,
387            document_target_url: String::new(),
388            document_target_domain: String::new(),
389            whitelist_patterns: Vec::new(),
390            whitelist_matcher: None,
391            blacklist_patterns: Vec::new(),
392            blacklist_matcher: None,
393            blacklist_strict: true,
394            max_bytes_allowed: None,
395            #[cfg(feature = "_cache")]
396            cache_site_key: None,
397            #[cfg(feature = "_cache")]
398            cache_policy: None,
399        }
400    }
401
402    /// Replace the whitelist patterns (compiled once).
403    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
404    where
405        I: IntoIterator<Item = S>,
406        S: Into<String>,
407    {
408        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
409        self.rebuild_whitelist_matcher();
410    }
411
412    /// Replace the blacklist patterns (compiled once).
413    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
414    where
415        I: IntoIterator<Item = S>,
416        S: Into<String>,
417    {
418        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
419        self.rebuild_blacklist_matcher();
420    }
421
422    /// Add one pattern (cheap) and rebuild (call this sparingly).
423    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
424        self.blacklist_patterns.push(pattern.into());
425        self.rebuild_blacklist_matcher();
426    }
427
428    /// Add many patterns and rebuild once.
429    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
430    where
431        I: IntoIterator<Item = S>,
432        S: Into<String>,
433    {
434        self.blacklist_patterns
435            .extend(patterns.into_iter().map(Into::into));
436        self.rebuild_blacklist_matcher();
437    }
438
439    /// Clear blacklist entirely.
440    pub fn clear_blacklist(&mut self) {
441        self.blacklist_patterns.clear();
442        self.blacklist_matcher = None;
443    }
444
445    /// Control precedence: when true, blacklist always wins.
446    pub fn set_blacklist_strict(&mut self, strict: bool) {
447        self.blacklist_strict = strict;
448    }
449
450    #[inline]
451    fn rebuild_blacklist_matcher(&mut self) {
452        if self.blacklist_patterns.is_empty() {
453            self.blacklist_matcher = None;
454            return;
455        }
456
457        let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
458        self.blacklist_matcher = AhoCorasick::new(refs).ok();
459    }
460
461    #[inline]
462    fn is_blacklisted(&self, url: &str) -> bool {
463        self.blacklist_matcher
464            .as_ref()
465            .map(|m| m.is_match(url))
466            .unwrap_or(false)
467    }
468
469    /// Add one pattern (cheap) and rebuild (call this sparingly).
470    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
471        self.whitelist_patterns.push(pattern.into());
472        self.rebuild_whitelist_matcher();
473    }
474
475    /// Add many patterns and rebuild once.
476    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
477    where
478        I: IntoIterator<Item = S>,
479        S: Into<String>,
480    {
481        self.whitelist_patterns
482            .extend(patterns.into_iter().map(Into::into));
483        self.rebuild_whitelist_matcher();
484    }
485
486    #[inline]
487    fn rebuild_whitelist_matcher(&mut self) {
488        if self.whitelist_patterns.is_empty() {
489            self.whitelist_matcher = None;
490            return;
491        }
492
493        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
494
495        // If building fails (shouldn’t for simple patterns), just disable matcher.
496        self.whitelist_matcher = AhoCorasick::new(refs).ok();
497    }
498
499    #[inline]
500    fn is_whitelisted(&self, url: &str) -> bool {
501        self.whitelist_matcher
502            .as_ref()
503            .map(|m| m.is_match(url))
504            .unwrap_or(false)
505    }
506
507    /// Commands to init the chain with.
508    pub fn init_commands(&self) -> CommandChain {
509        let cmds = if self.ignore_httpserrors {
510            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
511        } else {
512            INIT_CHAIN.clone()
513        };
514        CommandChain::new(cmds, self.request_timeout)
515    }
516
517    /// Push the CDP request.
518    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
519        let method = cmd.identifier();
520        if let Ok(params) = serde_json::to_value(cmd) {
521            self.queued_events
522                .push_back(NetworkEvent::SendCdpRequest((method, params)));
523        }
524    }
525
526    /// The next event to handle.
527    pub fn poll(&mut self) -> Option<NetworkEvent> {
528        self.queued_events.pop_front()
529    }
530
531    /// Get the extra headers.
532    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
533        &self.extra_headers
534    }
535
536    /// Set extra HTTP headers.
537    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
538        self.extra_headers = headers;
539        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
540        self.extra_headers.remove("Proxy-Authorization");
541        if !self.extra_headers.is_empty() {
542            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
543                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
544            }
545        }
546    }
547
548    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
549        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
550    }
551
552    pub fn set_block_all(&mut self, block_all: bool) {
553        self.block_all = block_all;
554    }
555
556    pub fn set_request_interception(&mut self, enabled: bool) {
557        self.user_request_interception_enabled = enabled;
558        self.update_protocol_request_interception();
559    }
560
561    pub fn set_cache_enabled(&mut self, enabled: bool) {
562        let run = self.user_cache_disabled == enabled;
563        self.user_cache_disabled = !enabled;
564        if run {
565            self.update_protocol_cache_disabled();
566        }
567    }
568
569    /// Enable fetch interception.
570    pub fn enable_request_intercept(&mut self) {
571        self.protocol_request_interception_enabled = true;
572    }
573
574    /// Disable fetch interception.
575    pub fn disable_request_intercept(&mut self) {
576        self.protocol_request_interception_enabled = false;
577    }
578
579    /// Set the cache site key.
580    #[cfg(feature = "_cache")]
581    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
582        self.cache_site_key = cache_site_key;
583    }
584
585    /// Set the cache policy.
586    #[cfg(feature = "_cache")]
587    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
588        self.cache_policy = cache_policy;
589    }
590
591    pub fn update_protocol_cache_disabled(&mut self) {
592        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
593    }
594
595    pub fn authenticate(&mut self, credentials: Credentials) {
596        self.credentials = Some(credentials);
597        self.update_protocol_request_interception();
598        self.protocol_request_interception_enabled = true;
599    }
600
601    fn update_protocol_request_interception(&mut self) {
602        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
603
604        if enabled == self.protocol_request_interception_enabled {
605            return;
606        }
607
608        if enabled {
609            self.push_cdp_request(ENABLE_FETCH.clone())
610        } else {
611            self.push_cdp_request(DisableParams::default())
612        }
613    }
614
615    /// Blocklist-only script blocking.
616    /// Returns true only when the URL matches an explicit blocklist condition.
617    #[inline]
618    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
619        // If analytics blocking is off, skip all analytics tries.
620        let block_analytics = self.block_analytics;
621
622        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
623        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
624        {
625            return true;
626        }
627
628        // 2) Custom website block list (explicit).
629        if crate::handler::blockers::block_websites::block_website(url) {
630            return true;
631        }
632
633        // 3) Path-based explicit tries / fallbacks.
634        //
635        // We run these on:
636        // - path with leading slash ("/js/app.js")
637        // - path without leading slash ("js/app.js")
638        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
639        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
640            // Remove query/fragment so matching stays stable.
641            let p_slash = Self::strip_query_fragment(path_with_slash);
642            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
643
644            // Basename for filename-only lists.
645            let base = match p_slash.rsplit('/').next() {
646                Some(b) => b,
647                None => p_slash,
648            };
649
650            // ---- Trie checks ----
651            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
652            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
653                return true;
654            }
655            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
656                return true;
657            }
658            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
659                return true;
660            }
661
662            // Base-path ignore tries (framework noise / known ignorable script paths).
663            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
664            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
665                return true;
666            }
667
668            // Style path ignores only when visuals are ignored.
669            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
670                return true;
671            }
672        }
673
674        false
675    }
676
677    /// Extract the absolute URL path portion WITH the leading slash.
678    ///
679    /// Example:
680    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
681    #[inline]
682    fn url_path_with_leading_slash(url: &str) -> Option<&str> {
683        // find scheme separator
684        let idx = url.find("//")?;
685        let after_slashes = idx + 2;
686
687        // find first slash after host
688        let slash_rel = url[after_slashes..].find('/')?;
689        let slash_idx = after_slashes + slash_rel;
690
691        if slash_idx < url.len() {
692            Some(&url[slash_idx..])
693        } else {
694            None
695        }
696    }
697
698    /// Strip query string and fragment from a path-ish string.
699    ///
700    /// Example:
701    /// - "/a/b.js?x=1#y" -> "/a/b.js"
702    #[inline]
703    fn strip_query_fragment(s: &str) -> &str {
704        let q = s.find('?');
705        let h = s.find('#');
706
707        match (q, h) {
708            (None, None) => s,
709            (Some(i), None) => &s[..i],
710            (None, Some(i)) => &s[..i],
711            (Some(i), Some(j)) => &s[..i.min(j)],
712        }
713    }
714
    /// Determine if the request should be skipped.
    ///
    /// When `skip_networking` is already `true`, or `network_event` is `false`, the
    /// incoming `skip_networking` value is returned unchanged. Otherwise the request URL
    /// is checked in this order:
    /// 1. analytics rules (`ignore_script_xhr` / `block_xhr`), only when `block_analytics`
    ///    is set;
    /// 2. when `block_stylesheets` or `ignore_visuals` is set, the text after the last `.`
    ///    in the URL is tested as an asset extension, and `ignore_script_xhr_media` is the
    ///    fallback if that does not block the request;
    /// 3. otherwise `skip_networking` is returned (which is `false` on this path).
    #[inline]
    fn skip_xhr(
        &self,
        skip_networking: bool,
        event: &EventRequestPaused,
        network_event: bool,
    ) -> bool {
        // Only evaluate when nothing has decided to skip yet and the caller flagged this
        // as a network event.
        if !skip_networking && network_event {
            let request_url = event.request.url.as_str();

            // Analytics rules apply only when analytics blocking is enabled.
            let skip_analytics =
                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));

            if skip_analytics {
                true
            } else if self.block_stylesheets || self.ignore_visuals {
                let block_css = self.block_stylesheets;
                let block_media = self.ignore_visuals;

                let mut block_request = false;

                // NOTE(review): the suffix is taken from the whole URL, so a `.` inside a
                // query string or fragment changes what is treated as the extension.
                if let Some(position) = request_url.rfind('.') {
                    let hlen = request_url.len();
                    // Length of the suffix including the dot itself.
                    let has_asset = hlen - position;

                    // Require at least two characters after the dot.
                    if has_asset >= 3 {
                        let next_position = position + 1;

                        // Known asset extensions are blocked when visuals are ignored.
                        if block_media
                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
                                &request_url[next_position..].into(),
                            )
                        {
                            block_request = true;
                        } else if block_css {
                            // Stylesheets: block when the suffix contains `CSS_EXTENSION`.
                            block_request = CaseInsensitiveString::from(
                                &request_url.as_bytes()[next_position..],
                            )
                            .contains(&**CSS_EXTENSION)
                        }
                    }
                }

                // Fall back to the media ignore list when the extension check did not block.
                if !block_request {
                    block_request = ignore_script_xhr_media(request_url);
                }

                block_request
            } else {
                // No blocking option applies; `skip_networking` is `false` here.
                skip_networking
            }
        } else {
            skip_networking
        }
    }
773
774    #[cfg(feature = "adblock")]
775    #[inline]
776    /// Detect if ad enabled.
777    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
778        if skip_networking {
779            true
780        } else {
781            block_ads(&event.request.url) || self.detect_ad(event)
782        }
783    }
784
785    /// When adblock feature is disabled, this is a no-op.
786    #[cfg(not(feature = "adblock"))]
787    #[inline]
788    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
789        use crate::handler::blockers::block_websites::block_ads;
790        if skip_networking {
791            true
792        } else {
793            block_ads(&event.request.url)
794        }
795    }
796
797    #[inline]
798    /// Fail request
799    fn fail_request_blocked(
800        &mut self,
801        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
802    ) {
803        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
804            request_id.clone(),
805            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
806        );
807        self.push_cdp_request(params);
808    }
809
810    #[inline]
811    /// Fulfill request
812    fn fulfill_request_empty_200(
813        &mut self,
814        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
815    ) {
816        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
817            request_id.clone(),
818            200,
819        );
820        self.push_cdp_request(params);
821    }
822
823    #[cfg(feature = "_cache")]
824    #[inline]
825    /// Fulfill a paused Fetch request from cached bytes + header map.
826    ///
827    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
828    fn fulfill_request_from_cache(
829        &mut self,
830        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
831        body: &[u8],
832        headers: &std::collections::HashMap<String, String>,
833        status: i64,
834    ) {
835        use crate::cdp::browser_protocol::fetch::HeaderEntry;
836        use crate::handler::network::fetch::FulfillRequestParams;
837        use base64::Engine;
838
839        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
840
841        for (k, v) in headers.iter() {
842            resp_headers.push(HeaderEntry {
843                name: k.clone().into(),
844                value: v.clone().into(),
845            });
846        }
847
848        let mut params = FulfillRequestParams::new(request_id.clone(), status);
849
850        // TODO: have this already encoded prior.
851        params.body = Some(
852            base64::engine::general_purpose::STANDARD
853                .encode(body)
854                .into(),
855        );
856
857        params.response_headers = Some(resp_headers);
858
859        self.push_cdp_request(params);
860    }
861
862    #[inline]
863    /// Continue the request url.
864    fn continue_request_with_url(
865        &mut self,
866        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
867        url: Option<&str>,
868        intercept_response: bool,
869    ) {
870        let mut params = ContinueRequestParams::new(request_id.clone());
871        if let Some(url) = url {
872            params.url = Some(url.to_string());
873            params.intercept_response = Some(intercept_response);
874        }
875        self.push_cdp_request(params);
876    }
877
878    /// On fetch request paused interception.
879    #[inline]
880    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
881        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
882            return;
883        }
884
885        if self.block_all {
886            tracing::debug!(
887                "Blocked (block_all): {:?} - {}",
888                event.resource_type,
889                event.request.url
890            );
891            return self.fail_request_blocked(&event.request_id);
892        }
893
894        if let Some(network_id) = event.network_id.as_ref() {
895            if let Some(request_will_be_sent) =
896                self.requests_will_be_sent.remove(network_id.as_ref())
897            {
898                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
899            } else {
900                self.request_id_to_interception_id
901                    .insert(network_id.clone(), event.request_id.clone().into());
902            }
903        }
904
905        // From here on, we handle the full decision tree.
906        let javascript_resource = event.resource_type == ResourceType::Script;
907        let document_resource = event.resource_type == ResourceType::Document;
908        let network_resource =
909            !document_resource && crate::utils::is_data_resource(&event.resource_type);
910
911        // Start with static / cheap skip checks.
912        let mut skip_networking =
913            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
914
915        if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
916            skip_networking = true;
917        }
918
919        // Also short-circuit if we've reloaded this document too many times.
920        if !skip_networking {
921            skip_networking = self.document_reload_tracker >= 3;
922        }
923
924        // Handle document redirect / masking and track xml documents.
925        let (current_url_cow, had_replacer) =
926            self.handle_document_replacement_and_tracking(event, document_resource);
927
928        let current_url: &str = current_url_cow.as_ref();
929
930        let blacklisted = self.is_blacklisted(current_url);
931
932        if !self.blacklist_strict && blacklisted {
933            skip_networking = true;
934        }
935
936        if !skip_networking {
937            // Allow XSL for sitemap XML.
938            if self.xml_document && current_url.ends_with(".xsl") {
939                skip_networking = false;
940            } else {
941                skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
942            }
943        }
944
945        skip_networking = self.detect_ad_if_enabled(event, skip_networking);
946
947        // Ignore embedded scripts when only_html or ignore_visuals is set.
948        if !skip_networking
949            && self.block_javascript
950            && (self.only_html || self.ignore_visuals)
951            && (javascript_resource || document_resource)
952        {
953            skip_networking = ignore_script_embedded(current_url);
954        }
955
956        // Script policy: allow-by-default.
957        // Block only if explicit block list patterns match.
958        if !skip_networking && javascript_resource {
959            skip_networking = self.should_block_script_blocklist_only(current_url);
960        }
961
962        // XHR / data resources.
963        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
964
965        // Custom interception layer.
966        if !skip_networking && (javascript_resource || network_resource || document_resource) {
967            skip_networking = self.intercept_manager.intercept_detection(
968                current_url,
969                self.ignore_visuals,
970                network_resource,
971            );
972        }
973
974        // Custom website block list.
975        if !skip_networking && (javascript_resource || network_resource) {
976            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
977        }
978
979        // whitelist 3rd party
980        // not required unless explicit blocking.
981        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
982        {
983            skip_networking = false;
984        }
985
986        // check if the url is in the whitelist.
987        if skip_networking && self.is_whitelisted(current_url) {
988            skip_networking = false;
989        }
990
991        if self.blacklist_strict && blacklisted {
992            skip_networking = true;
993        }
994
995        if skip_networking {
996            tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
997            self.fulfill_request_empty_200(&event.request_id);
998        } else {
999            #[cfg(feature = "_cache")]
1000            {
1001                if let (Some(policy), Some(cache_site_key)) =
1002                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
1003                {
1004                    let current_url = format!("{}:{}", event.request.method, &current_url);
1005
1006                    if let Some((res, cache_policy)) =
1007                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
1008                    {
1009                        if policy.allows_cached(&cache_policy) {
1010                            tracing::debug!(
1011                                "Remote Cached: {:?} - {}",
1012                                &event.resource_type,
1013                                &current_url
1014                            );
1015                            let flat_headers = crate::http::headers_from_multi(&res.headers);
1016                            return self.fulfill_request_from_cache(
1017                                &event.request_id,
1018                                &res.body,
1019                                &flat_headers,
1020                                res.status as i64,
1021                            );
1022                        }
1023                    }
1024                }
1025            }
1026
1027            // check our frame cache for the run.
1028            tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
1029            self.continue_request_with_url(
1030                &event.request_id,
1031                if had_replacer {
1032                    Some(current_url)
1033                } else {
1034                    None
1035                },
1036                !had_replacer,
1037            );
1038        }
1039    }
1040
1041    /// Shared "visuals + basic blocking" logic.
1042    ///
1043    /// IMPORTANT: Scripts are NOT blocked here anymore.
1044    /// Scripts are allowed by default and only blocked via explicit blocklists
1045    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1046    #[inline]
1047    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1048        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1049            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1050    }
1051
1052    /// Does the network manager have a target domain?
1053    pub fn has_target_domain(&self) -> bool {
1054        !self.document_target_url.is_empty()
1055    }
1056
1057    /// Set the target page url for tracking.
1058    pub fn set_page_url(&mut self, page_target_url: String) {
1059        let host_base = host_and_rest(&page_target_url)
1060            .map(|(h, _)| base_domain_from_host(h))
1061            .unwrap_or("");
1062
1063        self.document_target_domain = host_base.to_string();
1064        self.document_target_url = page_target_url;
1065    }
1066
1067    /// Clear the initial target domain on every navigation.
1068    pub fn clear_target_domain(&mut self) {
1069        self.document_reload_tracker = 0;
1070        self.document_target_url = Default::default();
1071        self.document_target_domain = Default::default();
1072    }
1073
1074    /// Handles:
1075    /// - document reload tracking (`document_reload_tracker`)
1076    /// - redirect masking / replacement
1077    /// - xml document detection (`xml_document`)
1078    /// - `document_target_url` updates
1079    ///
1080    /// Returns (current_url, had_replacer).
1081    #[inline]
1082    fn handle_document_replacement_and_tracking<'a>(
1083        &mut self,
1084        event: &'a EventRequestPaused,
1085        document_resource: bool,
1086    ) -> (Cow<'a, str>, bool) {
1087        let mut replacer: Option<String> = None;
1088        let current_url = event.request.url.as_str();
1089
1090        if document_resource {
1091            if self.document_target_url == current_url {
1092                self.document_reload_tracker += 1;
1093            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1094            {
1095                let (http_document_replacement, mut https_document_replacement) =
1096                    if self.document_target_url.starts_with("http://") {
1097                        (
1098                            self.document_target_url.replacen("http://", "http//", 1),
1099                            self.document_target_url.replacen("http://", "https://", 1),
1100                        )
1101                    } else {
1102                        (
1103                            self.document_target_url.replacen("https://", "https//", 1),
1104                            self.document_target_url.replacen("https://", "http://", 1),
1105                        )
1106                    };
1107
1108                // Track trailing slash to restore later.
1109                let trailing = https_document_replacement.ends_with('/');
1110                if trailing {
1111                    https_document_replacement.pop();
1112                }
1113                if https_document_replacement.ends_with('/') {
1114                    https_document_replacement.pop();
1115                }
1116
1117                let redirect_mask = format!(
1118                    "{}{}",
1119                    https_document_replacement, http_document_replacement
1120                );
1121
1122                if current_url == redirect_mask {
1123                    replacer = Some(if trailing {
1124                        format!("{}/", https_document_replacement)
1125                    } else {
1126                        https_document_replacement
1127                    });
1128                }
1129            }
1130
1131            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1132                self.xml_document = true;
1133            }
1134
1135            // Track last seen document URL.
1136            self.document_target_url = event.request.url.clone();
1137            self.document_target_domain = host_and_rest(&self.document_target_url)
1138                .map(|(h, _)| base_domain_from_host(h).to_string())
1139                .unwrap_or_default();
1140        }
1141
1142        let current_url_cow = match replacer {
1143            Some(r) => Cow::Owned(r),
1144            None => Cow::Borrowed(event.request.url.as_str()),
1145        };
1146
1147        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1148        (current_url_cow, had_replacer)
1149    }
1150
1151    /// Perform a page intercept for chrome
1152    #[cfg(feature = "adblock")]
1153    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1154        use adblock::{
1155            lists::{FilterSet, ParseOptions, RuleTypes},
1156            Engine,
1157        };
1158
1159        lazy_static::lazy_static! {
1160            static ref AD_ENGINE: Engine = {
1161                let mut filter_set = FilterSet::new(false);
1162                let mut rules = ParseOptions::default();
1163                rules.rule_types = RuleTypes::All;
1164
1165                filter_set.add_filters(
1166                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1167                    rules,
1168                );
1169
1170                Engine::from_filter_set(filter_set, true)
1171            };
1172        };
1173
1174        let blockable = ResourceType::Image == event.resource_type
1175            || event.resource_type == ResourceType::Media
1176            || event.resource_type == ResourceType::Stylesheet
1177            || event.resource_type == ResourceType::Document
1178            || event.resource_type == ResourceType::Fetch
1179            || event.resource_type == ResourceType::Xhr;
1180
1181        let u = &event.request.url;
1182
1183        let block_request = blockable
1184            // set it to example.com for 3rd party handling is_same_site
1185        && {
1186            let request = adblock::request::Request::preparsed(
1187                 &u,
1188                 "example.com",
1189                 "example.com",
1190                 &event.resource_type.as_ref().to_lowercase(),
1191                 !event.request.is_same_site.unwrap_or_default());
1192
1193            AD_ENGINE.check_network_request(&request).matched
1194        };
1195
1196        block_request
1197    }
1198
1199    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1200        let response = if self
1201            .attempted_authentications
1202            .contains(event.request_id.as_ref())
1203        {
1204            AuthChallengeResponseResponse::CancelAuth
1205        } else if self.credentials.is_some() {
1206            self.attempted_authentications
1207                .insert(event.request_id.clone().into());
1208            AuthChallengeResponseResponse::ProvideCredentials
1209        } else {
1210            AuthChallengeResponseResponse::Default
1211        };
1212
1213        let mut auth = AuthChallengeResponse::new(response);
1214        if let Some(creds) = self.credentials.clone() {
1215            auth.username = Some(creds.username);
1216            auth.password = Some(creds.password);
1217        }
1218        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1219    }
1220
1221    /// Set the page offline network emulation condition.
1222    pub fn set_offline_mode(&mut self, value: bool) {
1223        if self.offline == value {
1224            return;
1225        }
1226        self.offline = value;
1227        if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1228            .offline(self.offline)
1229            .matched_network_condition(
1230                NetworkConditions::builder()
1231                    .url_pattern("")
1232                    .latency(0)
1233                    .download_throughput(-1.)
1234                    .upload_throughput(-1.)
1235                    .build()
1236                    .unwrap(),
1237            )
1238            .build()
1239        {
1240            self.push_cdp_request(network);
1241        }
1242    }
1243
1244    /// Request interception doesn't happen for data URLs with Network Service.
1245    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1246        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1247            if let Some(interception_id) = self
1248                .request_id_to_interception_id
1249                .remove(event.request_id.as_ref())
1250            {
1251                self.on_request(event, Some(interception_id));
1252            } else {
1253                // TODO remove the clone for event
1254                self.requests_will_be_sent
1255                    .insert(event.request_id.clone(), event.clone());
1256            }
1257        } else {
1258            self.on_request(event, None);
1259        }
1260    }
1261
1262    /// The request was served from the cache.
1263    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1264        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1265            request.from_memory_cache = true;
1266        }
1267    }
1268
1269    /// On network response received.
1270    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1271        let mut request_failed = false;
1272
1273        // Track how many bytes we actually deducted from this target.
1274        let mut deducted: u64 = 0;
1275
1276        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1277            let before = *max_bytes;
1278
1279            // encoded_data_length -> saturating cast to u64
1280            let received_bytes: u64 = event.response.encoded_data_length as u64;
1281
1282            // Safe parse of Content-Length
1283            let content_length: Option<u64> = event
1284                .response
1285                .headers
1286                .inner()
1287                .get("content-length")
1288                .and_then(|v| v.as_str())
1289                .and_then(|s| s.trim().parse::<u64>().ok());
1290
1291            // Deduct what we actually received
1292            *max_bytes = max_bytes.saturating_sub(received_bytes);
1293
1294            // If the declared size can't fit, zero out now
1295            if let Some(cl) = content_length {
1296                if cl > *max_bytes {
1297                    *max_bytes = 0;
1298                }
1299            }
1300
1301            request_failed = *max_bytes == 0;
1302
1303            // Compute exact delta deducted on this event
1304            deducted = before.saturating_sub(*max_bytes);
1305        }
1306
1307        // Bubble up the deduction (even if request continues)
1308        if deducted > 0 {
1309            self.queued_events
1310                .push_back(NetworkEvent::BytesConsumed(deducted));
1311        }
1312
1313        // block all network request moving forward.
1314        if request_failed && self.max_bytes_allowed.is_some() {
1315            self.set_block_all(true);
1316        }
1317
1318        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1319            request.set_response(event.response.clone());
1320            self.queued_events.push_back(if request_failed {
1321                NetworkEvent::RequestFailed(request)
1322            } else {
1323                NetworkEvent::RequestFinished(request)
1324            });
1325        }
1326    }
1327
1328    /// On network loading finished.
1329    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1330        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1331            if let Some(interception_id) = request.interception_id.as_ref() {
1332                self.attempted_authentications
1333                    .remove(interception_id.as_ref());
1334            }
1335            self.queued_events
1336                .push_back(NetworkEvent::RequestFinished(request));
1337        }
1338    }
1339
1340    /// On network loading failed.
1341    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1342        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1343            request.failure_text = Some(event.error_text.clone());
1344            if let Some(interception_id) = request.interception_id.as_ref() {
1345                self.attempted_authentications
1346                    .remove(interception_id.as_ref());
1347            }
1348            self.queued_events
1349                .push_back(NetworkEvent::RequestFailed(request));
1350        }
1351    }
1352
1353    /// On request will be sent.
1354    fn on_request(
1355        &mut self,
1356        event: &EventRequestWillBeSent,
1357        interception_id: Option<InterceptionId>,
1358    ) {
1359        let mut redirect_chain = Vec::new();
1360        let mut redirect_location = None;
1361
1362        if let Some(redirect_resp) = &event.redirect_response {
1363            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1364                if is_redirect_status(redirect_resp.status) {
1365                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1366                        if redirect_resp.url != location {
1367                            let fixed_location = location.replace(&redirect_resp.url, "");
1368
1369                            if !fixed_location.is_empty() {
1370                                if let Some(resp) = request.response.as_mut() {
1371                                    resp.headers.0["Location"] =
1372                                        serde_json::Value::String(fixed_location.clone());
1373                                }
1374                            }
1375
1376                            redirect_location = Some(fixed_location);
1377                        }
1378                    }
1379                }
1380
1381                self.handle_request_redirect(
1382                    &mut request,
1383                    if let Some(redirect_location) = redirect_location {
1384                        let mut redirect_resp = redirect_resp.clone();
1385
1386                        if !redirect_location.is_empty() {
1387                            redirect_resp.headers.0["Location"] =
1388                                serde_json::Value::String(redirect_location);
1389                        }
1390
1391                        redirect_resp
1392                    } else {
1393                        redirect_resp.clone()
1394                    },
1395                );
1396
1397                redirect_chain = std::mem::take(&mut request.redirect_chain);
1398                redirect_chain.push(request);
1399            }
1400        }
1401
1402        let request = HttpRequest::new(
1403            event.request_id.clone(),
1404            event.frame_id.clone(),
1405            interception_id,
1406            self.user_request_interception_enabled,
1407            redirect_chain,
1408        );
1409
1410        self.requests.insert(event.request_id.clone(), request);
1411        self.queued_events
1412            .push_back(NetworkEvent::Request(event.request_id.clone()));
1413    }
1414
1415    /// Handle request redirect.
1416    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1417        request.set_response(response);
1418        if let Some(interception_id) = request.interception_id.as_ref() {
1419            self.attempted_authentications
1420                .remove(interception_id.as_ref());
1421        }
1422    }
1423}
1424
/// Events the network manager pushes onto its `queued_events` queue.
#[derive(Debug)]
pub enum NetworkEvent {
    /// Send a CDP request: the method id and its JSON parameters.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A new request was registered.
    Request(RequestId),
    /// Response
    Response(RequestId),
    /// Request failed: loading failed, or the byte budget was exhausted when
    /// its response arrived.
    RequestFailed(HttpRequest),
    /// Request finished.
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the byte budget by a response.
    BytesConsumed(u64),
}
1440
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// The 3rd party allow list matches the expected script URLs and leaves
    /// everything else alone.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// Scripts that match no block list are allowed.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // A random script that should not match your block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    /// Scripts matching the ignore trie / block list are blocked.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    /// Configured blacklist patterns match the URLs that contain them.
    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    /// Strict mode: a URL present in both lists matches both matchers.
    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        // Same URL in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        // In strict mode, it should still be considered blocked at decision time.
        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
        assert!(nm.blacklist_strict);
    }

    /// Non-strict mode: both matchers still match and the strict flag is off.
    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}