chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18    SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21    fetch::{
22        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24    },
25    network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
lazy_static! {
    /// Substrings and URL prefixes used to recognise scripts that are usually
    /// needed for rendering: common framework/bundle names plus a set of
    /// verified third-party origins.
    ///
    /// This list is only consumed by [`ALLOWED_MATCHER`].
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",  // Covers Wordpress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/",
        "https://google.com/recaptcha/api.js",
        "https://www.gstatic.com/recaptcha/",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://api.leminnow.com/captcha/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://client-api.arkoselabs.com/",
        "https://www.capy.me/puzzle/",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];

    /// Substring matcher built from `JS_FRAMEWORK_ALLOW`; a hit means the script
    /// name/URL looks like one that should be rendered in the browser.
    ///
    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
    /// but we keep it for compatibility and other call sites.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// Third-party script origins and paths (captcha, bot-challenge, payment and
    /// CDN providers) that are explicitly allowed.
    ///
    /// This list is only consumed by [`ALLOWED_MATCHER_3RD_PARTY`]. Entries are
    /// matched as substrings, so the shorter entries (e.g. `"hcaptcha.com"`)
    /// also cover the longer URLs that contain them.
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.capy.me/puzzle/",
        "https://www.gstatic.com/recaptcha/",
        "https://google.com/recaptcha/",
        "https://www.google.com/recaptcha/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://newassets.hcaptcha.com/",
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://api.leminnow.com/captcha/",
        "https://client-api.arkoselabs.com/",
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "discourse-cdn.com/",
        "hcaptcha.com",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];

    /// Substring matcher built from `JS_FRAMEWORK_ALLOW_3RD_PARTY`; a hit means
    /// the URL belongs to an explicitly allowed third party.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path fragments that identify framework-generated script directories.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
            "_astro/", "_app/immutable"
        }
    };

    /// MIME content types to ignore: archives, images, video, audio and
    /// office/document formats.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource-type names to ignore for visual content (images, media, fonts).
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource-type names to ignore for background networking noise
    /// (CSP reports, prefetches, pings and uncategorised requests).
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Other",
        "Prefetch",
        "Ping",
    };

    /// Case-insensitive `"css"` token used when matching stylesheet extensions.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// The default init command chain: the serialized `network::EnableParams`
    /// command, or an empty chain if serialization fails.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// The init command chain that additionally ignores certificate errors:
    /// `network::EnableParams` followed by `SetIgnoreCertificateErrorsParams::new(true)`.
    /// A command that fails to serialize is left out of the chain.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// The fetch-interception enable command: handles auth requests and pauses
    /// every URL (`"*"`) at the request stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
        .build()
    };
}
229
/// Report whether `status` is one of the HTTP redirect codes this crate follows.
///
/// Only 301, 302, 303, 307 and 308 count; every other value (including 300
/// and 304) yields `false`.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
234
#[derive(Debug)]
/// The base network manager.
///
/// Holds request bookkeeping, interception/auth state, resource-blocking flags
/// and the queue of `NetworkEvent`s that callers drain through `poll()`.
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    // TODO put event in an Arc?
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; interception is actually enabled/disabled by
    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is considered enabled in CDP.
    ///
    /// Within this impl the flag is written by `enable_request_intercept()`,
    /// `disable_request_intercept()` and `authenticate()`. Note that
    /// `update_protocol_request_interception()` only *reads* it to decide whether a
    /// fetch enable/disable command has to be queued; it does not store the new state.
    /// Being `pub(crate)`, it may also be written elsewhere in the crate.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Whether the network is flagged as offline (initialised to `false` by `new()`).
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    /// Ignore visuals (no pings, prefetching, and etc).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics from rendering (defaults to `true` in `new()`).
    pub block_analytics: bool,
    /// Only html from loading.
    pub only_html: bool,
    /// Is xml document?
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the amount of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    #[cfg(feature = "_cache")]
    /// The cache site_key to use.
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
    /// `None` when there are no patterns or the matcher failed to build.
    whitelist_matcher: Option<AhoCorasick>,
    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
    blacklist_patterns: Vec<String>,
    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
    /// `None` when there are no patterns or the matcher failed to build.
    blacklist_matcher: Option<AhoCorasick>,
    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
    /// Defaults to `true` in `new()`.
    blacklist_strict: bool,
}
357
358impl NetworkManager {
359    /// A new network manager.
360    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
361        Self {
362            queued_events: Default::default(),
363            ignore_httpserrors,
364            requests: Default::default(),
365            requests_will_be_sent: Default::default(),
366            extra_headers: Default::default(),
367            request_id_to_interception_id: Default::default(),
368            user_cache_disabled: false,
369            attempted_authentications: Default::default(),
370            credentials: None,
371            block_all: false,
372            user_request_interception_enabled: false,
373            protocol_request_interception_enabled: false,
374            offline: false,
375            request_timeout,
376            ignore_visuals: false,
377            block_javascript: false,
378            block_stylesheets: false,
379            block_analytics: true,
380            only_html: false,
381            xml_document: false,
382            intercept_manager: NetworkInterceptManager::Unknown,
383            document_reload_tracker: 0,
384            document_target_url: String::new(),
385            document_target_domain: String::new(),
386            whitelist_patterns: Vec::new(),
387            whitelist_matcher: None,
388            blacklist_patterns: Vec::new(),
389            blacklist_matcher: None,
390            blacklist_strict: true,
391            max_bytes_allowed: None,
392            #[cfg(feature = "_cache")]
393            cache_site_key: None,
394            #[cfg(feature = "_cache")]
395            cache_policy: None,
396        }
397    }
398
399    /// Replace the whitelist patterns (compiled once).
400    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
401    where
402        I: IntoIterator<Item = S>,
403        S: Into<String>,
404    {
405        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
406        self.rebuild_whitelist_matcher();
407    }
408
409    /// Replace the blacklist patterns (compiled once).
410    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
411    where
412        I: IntoIterator<Item = S>,
413        S: Into<String>,
414    {
415        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
416        self.rebuild_blacklist_matcher();
417    }
418
419    /// Add one pattern (cheap) and rebuild (call this sparingly).
420    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
421        self.blacklist_patterns.push(pattern.into());
422        self.rebuild_blacklist_matcher();
423    }
424
425    /// Add many patterns and rebuild once.
426    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
427    where
428        I: IntoIterator<Item = S>,
429        S: Into<String>,
430    {
431        self.blacklist_patterns
432            .extend(patterns.into_iter().map(Into::into));
433        self.rebuild_blacklist_matcher();
434    }
435
436    /// Clear blacklist entirely.
437    pub fn clear_blacklist(&mut self) {
438        self.blacklist_patterns.clear();
439        self.blacklist_matcher = None;
440    }
441
442    /// Control precedence: when true, blacklist always wins.
443    pub fn set_blacklist_strict(&mut self, strict: bool) {
444        self.blacklist_strict = strict;
445    }
446
447    #[inline]
448    fn rebuild_blacklist_matcher(&mut self) {
449        if self.blacklist_patterns.is_empty() {
450            self.blacklist_matcher = None;
451            return;
452        }
453
454        let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
455        self.blacklist_matcher = AhoCorasick::new(refs).ok();
456    }
457
458    #[inline]
459    fn is_blacklisted(&self, url: &str) -> bool {
460        self.blacklist_matcher
461            .as_ref()
462            .map(|m| m.is_match(url))
463            .unwrap_or(false)
464    }
465
466    /// Add one pattern (cheap) and rebuild (call this sparingly).
467    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
468        self.whitelist_patterns.push(pattern.into());
469        self.rebuild_whitelist_matcher();
470    }
471
472    /// Add many patterns and rebuild once.
473    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
474    where
475        I: IntoIterator<Item = S>,
476        S: Into<String>,
477    {
478        self.whitelist_patterns
479            .extend(patterns.into_iter().map(Into::into));
480        self.rebuild_whitelist_matcher();
481    }
482
483    #[inline]
484    fn rebuild_whitelist_matcher(&mut self) {
485        if self.whitelist_patterns.is_empty() {
486            self.whitelist_matcher = None;
487            return;
488        }
489
490        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
491
492        // If building fails (shouldn’t for simple patterns), just disable matcher.
493        self.whitelist_matcher = AhoCorasick::new(refs).ok();
494    }
495
496    #[inline]
497    fn is_whitelisted(&self, url: &str) -> bool {
498        self.whitelist_matcher
499            .as_ref()
500            .map(|m| m.is_match(url))
501            .unwrap_or(false)
502    }
503
504    /// Commands to init the chain with.
505    pub fn init_commands(&self) -> CommandChain {
506        let cmds = if self.ignore_httpserrors {
507            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
508        } else {
509            INIT_CHAIN.clone()
510        };
511        CommandChain::new(cmds, self.request_timeout)
512    }
513
514    /// Push the CDP request.
515    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
516        let method = cmd.identifier();
517        if let Ok(params) = serde_json::to_value(cmd) {
518            self.queued_events
519                .push_back(NetworkEvent::SendCdpRequest((method, params)));
520        }
521    }
522
523    /// The next event to handle.
524    pub fn poll(&mut self) -> Option<NetworkEvent> {
525        self.queued_events.pop_front()
526    }
527
528    /// Get the extra headers.
529    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
530        &self.extra_headers
531    }
532
533    /// Set extra HTTP headers.
534    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
535        self.extra_headers = headers;
536        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
537        self.extra_headers.remove("Proxy-Authorization");
538        if !self.extra_headers.is_empty() {
539            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
540                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
541            }
542        }
543    }
544
545    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
546        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
547    }
548
549    pub fn set_block_all(&mut self, block_all: bool) {
550        self.block_all = block_all;
551    }
552
553    pub fn set_request_interception(&mut self, enabled: bool) {
554        self.user_request_interception_enabled = enabled;
555        self.update_protocol_request_interception();
556    }
557
558    pub fn set_cache_enabled(&mut self, enabled: bool) {
559        let run = self.user_cache_disabled != !enabled;
560        self.user_cache_disabled = !enabled;
561        if run {
562            self.update_protocol_cache_disabled();
563        }
564    }
565
566    /// Enable fetch interception.
567    pub fn enable_request_intercept(&mut self) {
568        self.protocol_request_interception_enabled = true;
569    }
570
571    /// Disable fetch interception.
572    pub fn disable_request_intercept(&mut self) {
573        self.protocol_request_interception_enabled = false;
574    }
575
576    /// Set the cache site key.
577    #[cfg(feature = "_cache")]
578    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
579        self.cache_site_key = cache_site_key;
580    }
581
582    /// Set the cache policy.
583    #[cfg(feature = "_cache")]
584    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
585        self.cache_policy = cache_policy;
586    }
587
588    pub fn update_protocol_cache_disabled(&mut self) {
589        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
590    }
591
592    pub fn authenticate(&mut self, credentials: Credentials) {
593        self.credentials = Some(credentials);
594        self.update_protocol_request_interception();
595        self.protocol_request_interception_enabled = true;
596    }
597
598    fn update_protocol_request_interception(&mut self) {
599        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
600
601        if enabled == self.protocol_request_interception_enabled {
602            return;
603        }
604
605        if enabled {
606            self.push_cdp_request(ENABLE_FETCH.clone())
607        } else {
608            self.push_cdp_request(DisableParams::default())
609        }
610    }
611
612    /// Blocklist-only script blocking.
613    /// Returns true only when the URL matches an explicit blocklist condition.
614    #[inline]
615    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
616        // If analytics blocking is off, skip all analytics tries.
617        let block_analytics = self.block_analytics;
618
619        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
620        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
621        {
622            return true;
623        }
624
625        // 2) Custom website block list (explicit).
626        if crate::handler::blockers::block_websites::block_website(url) {
627            return true;
628        }
629
630        // 3) Path-based explicit tries / fallbacks.
631        //
632        // We run these on:
633        // - path with leading slash ("/js/app.js")
634        // - path without leading slash ("js/app.js")
635        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
636        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
637            // Remove query/fragment so matching stays stable.
638            let p_slash = Self::strip_query_fragment(path_with_slash);
639            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
640
641            // Basename for filename-only lists.
642            let base = match p_slash.rsplit('/').next() {
643                Some(b) => b,
644                None => p_slash,
645            };
646
647            // ---- Trie checks ----
648            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
649            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
650                return true;
651            }
652            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
653                return true;
654            }
655            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
656                return true;
657            }
658
659            // Base-path ignore tries (framework noise / known ignorable script paths).
660            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
661            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
662                return true;
663            }
664
665            // Style path ignores only when visuals are ignored.
666            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
667                return true;
668            }
669        }
670
671        false
672    }
673
674    /// Extract the absolute URL path portion WITH the leading slash.
675    ///
676    /// Example:
677    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
678    #[inline]
679    fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
680        // find scheme separator
681        let idx = url.find("//")?;
682        let after_slashes = idx + 2;
683
684        // find first slash after host
685        let slash_rel = url[after_slashes..].find('/')?;
686        let slash_idx = after_slashes + slash_rel;
687
688        if slash_idx < url.len() {
689            Some(&url[slash_idx..])
690        } else {
691            None
692        }
693    }
694
695    /// Strip query string and fragment from a path-ish string.
696    ///
697    /// Example:
698    /// - "/a/b.js?x=1#y" -> "/a/b.js"
699    #[inline]
700    fn strip_query_fragment(s: &str) -> &str {
701        let q = s.find('?');
702        let h = s.find('#');
703
704        match (q, h) {
705            (None, None) => s,
706            (Some(i), None) => &s[..i],
707            (None, Some(i)) => &s[..i],
708            (Some(i), Some(j)) => &s[..i.min(j)],
709        }
710    }
711
712    /// Determine if the request should be skipped.
713    #[inline]
714    fn skip_xhr(
715        &self,
716        skip_networking: bool,
717        event: &EventRequestPaused,
718        network_event: bool,
719    ) -> bool {
720        // XHR check
721        if !skip_networking && network_event {
722            let request_url = event.request.url.as_str();
723
724            // check if part of ignore scripts.
725            let skip_analytics =
726                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
727
728            if skip_analytics {
729                true
730            } else if self.block_stylesheets || self.ignore_visuals {
731                let block_css = self.block_stylesheets;
732                let block_media = self.ignore_visuals;
733
734                let mut block_request = false;
735
736                if let Some(position) = request_url.rfind('.') {
737                    let hlen = request_url.len();
738                    let has_asset = hlen - position;
739
740                    if has_asset >= 3 {
741                        let next_position = position + 1;
742
743                        if block_media
744                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
745                                &request_url[next_position..].into(),
746                            )
747                        {
748                            block_request = true;
749                        } else if block_css {
750                            block_request =
751                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
752                                    .contains(&**CSS_EXTENSION)
753                        }
754                    }
755                }
756
757                if !block_request {
758                    block_request = ignore_script_xhr_media(request_url);
759                }
760
761                block_request
762            } else {
763                skip_networking
764            }
765        } else {
766            skip_networking
767        }
768    }
769
770    #[cfg(feature = "adblock")]
771    #[inline]
772    /// Detect if ad enabled.
773    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
774        if skip_networking {
775            true
776        } else {
777            block_ads(&event.request.url) || self.detect_ad(event)
778        }
779    }
780
781    /// When adblock feature is disabled, this is a no-op.
782    #[cfg(not(feature = "adblock"))]
783    #[inline]
784    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
785        use crate::handler::blockers::block_websites::block_ads;
786        if skip_networking {
787            true
788        } else {
789            block_ads(&event.request.url)
790        }
791    }
792
793    #[inline]
794    /// Fail request
795    fn fail_request_blocked(
796        &mut self,
797        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
798    ) {
799        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
800            request_id.clone(),
801            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
802        );
803        self.push_cdp_request(params);
804    }
805
806    #[inline]
807    /// Fulfill request
808    fn fulfill_request_empty_200(
809        &mut self,
810        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
811    ) {
812        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
813            request_id.clone(),
814            200,
815        );
816        self.push_cdp_request(params);
817    }
818
819    #[cfg(feature = "_cache")]
820    #[inline]
821    /// Fulfill a paused Fetch request from cached bytes + header map.
822    ///
823    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
824    fn fulfill_request_from_cache(
825        &mut self,
826        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
827        body: &[u8],
828        headers: &std::collections::HashMap<String, String>,
829        status: i64,
830    ) {
831        use crate::cdp::browser_protocol::fetch::HeaderEntry;
832        use crate::handler::network::fetch::FulfillRequestParams;
833        use base64::Engine;
834
835        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
836
837        for (k, v) in headers.iter() {
838            resp_headers.push(HeaderEntry {
839                name: k.clone().into(),
840                value: v.clone().into(),
841            });
842        }
843
844        let mut params = FulfillRequestParams::new(request_id.clone(), status);
845
846        // TODO: have this already encoded prior.
847        params.body = Some(
848            base64::engine::general_purpose::STANDARD
849                .encode(body)
850                .into(),
851        );
852
853        params.response_headers = Some(resp_headers);
854
855        self.push_cdp_request(params);
856    }
857
858    #[inline]
859    /// Continue the request url.
860    fn continue_request_with_url(
861        &mut self,
862        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
863        url: Option<&str>,
864        intercept_response: bool,
865    ) {
866        let mut params = ContinueRequestParams::new(request_id.clone());
867        if let Some(url) = url {
868            params.url = Some(url.to_string());
869            params.intercept_response = Some(intercept_response);
870        }
871        self.push_cdp_request(params);
872    }
873
874    /// On fetch request paused interception.
875    #[inline]
876    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
877        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
878            return;
879        }
880
881        let resource_type = &event.resource_type;
882
883        if self.block_all {
884            tracing::debug!(
885                "Blocked (block_all): {:?} - {}",
886                event.resource_type,
887                event.request.url
888            );
889            return self.fail_request_blocked(&event.request_id);
890        }
891
892        if let Some(network_id) = event.network_id.as_ref() {
893            if let Some(request_will_be_sent) =
894                self.requests_will_be_sent.remove(network_id.as_ref())
895            {
896                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
897            } else {
898                self.request_id_to_interception_id
899                    .insert(network_id.clone(), event.request_id.clone().into());
900            }
901        }
902
903        // From here on, we handle the full decision tree.
904        let javascript_resource = *resource_type == ResourceType::Script;
905        let document_resource = *resource_type == ResourceType::Document;
906        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
907
908        // Start with static / cheap skip checks.
909        let mut skip_networking =
910            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
911
912        // Also short-circuit if we've reloaded this document too many times.
913        if !skip_networking {
914            skip_networking = self.document_reload_tracker >= 3;
915        }
916
917        // Handle document redirect / masking and track xml documents.
918        let (current_url_cow, had_replacer) =
919            self.handle_document_replacement_and_tracking(event, document_resource);
920
921        let current_url: &str = current_url_cow.as_ref();
922
923        let blacklisted = self.is_blacklisted(current_url);
924
925        if !self.blacklist_strict && blacklisted {
926            skip_networking = true;
927        }
928
929        if !skip_networking {
930            // Allow XSL for sitemap XML.
931            if self.xml_document && current_url.ends_with(".xsl") {
932                skip_networking = false;
933            } else {
934                skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
935            }
936        }
937
938        skip_networking = self.detect_ad_if_enabled(event, skip_networking);
939
940        // Ignore embedded scripts when only_html or ignore_visuals is set.
941        if !skip_networking
942            && self.block_javascript
943            && (self.only_html || self.ignore_visuals)
944            && (javascript_resource || document_resource)
945        {
946            skip_networking = ignore_script_embedded(current_url);
947        }
948
949        // Script policy: allow-by-default.
950        // Block only if explicit block list patterns match.
951        if !skip_networking && javascript_resource {
952            skip_networking = self.should_block_script_blocklist_only(current_url);
953        }
954
955        // XHR / data resources.
956        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
957
958        // Custom interception layer.
959        if !skip_networking && (javascript_resource || network_resource || document_resource) {
960            skip_networking = self.intercept_manager.intercept_detection(
961                current_url,
962                self.ignore_visuals,
963                network_resource,
964            );
965        }
966
967        // Custom website block list.
968        if !skip_networking && (javascript_resource || network_resource) {
969            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
970        }
971
972        // whitelist 3rd party
973        // not required unless explicit blocking.
974        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
975        {
976            skip_networking = false;
977        }
978
979        // check if the url is in the whitelist.
980        if skip_networking && self.is_whitelisted(current_url) {
981            skip_networking = false;
982        }
983
984        if self.blacklist_strict && blacklisted {
985            skip_networking = true;
986        }
987
988        if skip_networking {
989            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
990            self.fulfill_request_empty_200(&event.request_id);
991        } else {
992            #[cfg(feature = "_cache")]
993            {
994                if let (Some(policy), Some(cache_site_key)) =
995                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
996                {
997                    let current_url = format!("{}:{}", event.request.method, &current_url);
998
999                    if let Some((res, cache_policy)) =
1000                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
1001                    {
1002                        if policy.allows_cached(&cache_policy) {
1003                            tracing::debug!(
1004                                "Remote Cached: {:?} - {}",
1005                                resource_type,
1006                                &current_url
1007                            );
1008                            return self.fulfill_request_from_cache(
1009                                &event.request_id,
1010                                &res.body,
1011                                &res.headers,
1012                                res.status as i64,
1013                            );
1014                        }
1015                    }
1016                }
1017            }
1018
1019            // check our frame cache for the run.
1020            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
1021            self.continue_request_with_url(
1022                &event.request_id,
1023                if had_replacer {
1024                    Some(current_url)
1025                } else {
1026                    None
1027                },
1028                !had_replacer,
1029            );
1030        }
1031    }
1032
1033    /// Shared "visuals + basic blocking" logic.
1034    ///
1035    /// IMPORTANT: Scripts are NOT blocked here anymore.
1036    /// Scripts are allowed by default and only blocked via explicit blocklists
1037    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1038    #[inline]
1039    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1040        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1041            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1042    }
1043
1044    /// Does the network manager have a target domain?
1045    pub fn has_target_domain(&self) -> bool {
1046        !self.document_target_url.is_empty()
1047    }
1048
1049    /// Set the target page url for tracking.
1050    pub fn set_page_url(&mut self, page_target_url: String) {
1051        let host_base = host_and_rest(&page_target_url)
1052            .map(|(h, _)| base_domain_from_host(h))
1053            .unwrap_or("");
1054
1055        self.document_target_domain = host_base.to_string();
1056        self.document_target_url = page_target_url;
1057    }
1058
1059    /// Clear the initial target domain on every navigation.
1060    pub fn clear_target_domain(&mut self) {
1061        self.document_reload_tracker = 0;
1062        self.document_target_url = Default::default();
1063        self.document_target_domain = Default::default();
1064    }
1065
1066    /// Handles:
1067    /// - document reload tracking (`document_reload_tracker`)
1068    /// - redirect masking / replacement
1069    /// - xml document detection (`xml_document`)
1070    /// - `document_target_url` updates
1071    ///
1072    /// Returns (current_url, had_replacer).
1073    #[inline]
1074    fn handle_document_replacement_and_tracking<'a>(
1075        &mut self,
1076        event: &'a EventRequestPaused,
1077        document_resource: bool,
1078    ) -> (Cow<'a, str>, bool) {
1079        let mut replacer: Option<String> = None;
1080        let current_url = event.request.url.as_str();
1081
1082        if document_resource {
1083            if self.document_target_url == current_url {
1084                self.document_reload_tracker += 1;
1085            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1086            {
1087                let (http_document_replacement, mut https_document_replacement) =
1088                    if self.document_target_url.starts_with("http://") {
1089                        (
1090                            self.document_target_url.replacen("http://", "http//", 1),
1091                            self.document_target_url.replacen("http://", "https://", 1),
1092                        )
1093                    } else {
1094                        (
1095                            self.document_target_url.replacen("https://", "https//", 1),
1096                            self.document_target_url.replacen("https://", "http://", 1),
1097                        )
1098                    };
1099
1100                // Track trailing slash to restore later.
1101                let trailing = https_document_replacement.ends_with('/');
1102                if trailing {
1103                    https_document_replacement.pop();
1104                }
1105                if https_document_replacement.ends_with('/') {
1106                    https_document_replacement.pop();
1107                }
1108
1109                let redirect_mask = format!(
1110                    "{}{}",
1111                    https_document_replacement, http_document_replacement
1112                );
1113
1114                if current_url == redirect_mask {
1115                    replacer = Some(if trailing {
1116                        format!("{}/", https_document_replacement)
1117                    } else {
1118                        https_document_replacement
1119                    });
1120                }
1121            }
1122
1123            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1124                self.xml_document = true;
1125            }
1126
1127            // Track last seen document URL.
1128            self.document_target_url = event.request.url.clone();
1129            self.document_target_domain = host_and_rest(&self.document_target_url)
1130                .map(|(h, _)| base_domain_from_host(h).to_string())
1131                .unwrap_or_default();
1132        }
1133
1134        let current_url_cow = match replacer {
1135            Some(r) => Cow::Owned(r),
1136            None => Cow::Borrowed(event.request.url.as_str()),
1137        };
1138
1139        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1140        (current_url_cow, had_replacer)
1141    }
1142
1143    /// Perform a page intercept for chrome
1144    #[cfg(feature = "adblock")]
1145    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1146        use adblock::{
1147            lists::{FilterSet, ParseOptions, RuleTypes},
1148            Engine,
1149        };
1150
1151        lazy_static::lazy_static! {
1152            static ref AD_ENGINE: Engine = {
1153                let mut filter_set = FilterSet::new(false);
1154                let mut rules = ParseOptions::default();
1155                rules.rule_types = RuleTypes::All;
1156
1157                filter_set.add_filters(
1158                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1159                    rules,
1160                );
1161
1162                Engine::from_filter_set(filter_set, true)
1163            };
1164        };
1165
1166        let blockable = ResourceType::Image == event.resource_type
1167            || event.resource_type == ResourceType::Media
1168            || event.resource_type == ResourceType::Stylesheet
1169            || event.resource_type == ResourceType::Document
1170            || event.resource_type == ResourceType::Fetch
1171            || event.resource_type == ResourceType::Xhr;
1172
1173        let u = &event.request.url;
1174
1175        let block_request = blockable
1176            // set it to example.com for 3rd party handling is_same_site
1177        && {
1178            let request = adblock::request::Request::preparsed(
1179                 &u,
1180                 "example.com",
1181                 "example.com",
1182                 &event.resource_type.as_ref().to_lowercase(),
1183                 !event.request.is_same_site.unwrap_or_default());
1184
1185            AD_ENGINE.check_network_request(&request).matched
1186        };
1187
1188        block_request
1189    }
1190
1191    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1192        let response = if self
1193            .attempted_authentications
1194            .contains(event.request_id.as_ref())
1195        {
1196            AuthChallengeResponseResponse::CancelAuth
1197        } else if self.credentials.is_some() {
1198            self.attempted_authentications
1199                .insert(event.request_id.clone().into());
1200            AuthChallengeResponseResponse::ProvideCredentials
1201        } else {
1202            AuthChallengeResponseResponse::Default
1203        };
1204
1205        let mut auth = AuthChallengeResponse::new(response);
1206        if let Some(creds) = self.credentials.clone() {
1207            auth.username = Some(creds.username);
1208            auth.password = Some(creds.password);
1209        }
1210        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1211    }
1212
1213    /// Set the page offline network emulation condition.
1214    pub fn set_offline_mode(&mut self, value: bool) {
1215        if self.offline == value {
1216            return;
1217        }
1218        self.offline = value;
1219        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1220            .offline(self.offline)
1221            .latency(0)
1222            .download_throughput(-1.)
1223            .upload_throughput(-1.)
1224            .build()
1225        {
1226            self.push_cdp_request(network);
1227        }
1228    }
1229
1230    /// Request interception doesn't happen for data URLs with Network Service.
1231    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1232        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1233            if let Some(interception_id) = self
1234                .request_id_to_interception_id
1235                .remove(event.request_id.as_ref())
1236            {
1237                self.on_request(event, Some(interception_id));
1238            } else {
1239                // TODO remove the clone for event
1240                self.requests_will_be_sent
1241                    .insert(event.request_id.clone(), event.clone());
1242            }
1243        } else {
1244            self.on_request(event, None);
1245        }
1246    }
1247
1248    /// The request was served from the cache.
1249    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1250        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1251            request.from_memory_cache = true;
1252        }
1253    }
1254
1255    /// On network response received.
1256    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1257        let mut request_failed = false;
1258
1259        // Track how many bytes we actually deducted from this target.
1260        let mut deducted: u64 = 0;
1261
1262        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1263            let before = *max_bytes;
1264
1265            // encoded_data_length -> saturating cast to u64
1266            let received_bytes: u64 = event.response.encoded_data_length as u64;
1267
1268            // Safe parse of Content-Length
1269            let content_length: Option<u64> = event
1270                .response
1271                .headers
1272                .inner()
1273                .get("content-length")
1274                .and_then(|v| v.as_str())
1275                .and_then(|s| s.trim().parse::<u64>().ok());
1276
1277            // Deduct what we actually received
1278            *max_bytes = max_bytes.saturating_sub(received_bytes);
1279
1280            // If the declared size can't fit, zero out now
1281            if let Some(cl) = content_length {
1282                if cl > *max_bytes {
1283                    *max_bytes = 0;
1284                }
1285            }
1286
1287            request_failed = *max_bytes == 0;
1288
1289            // Compute exact delta deducted on this event
1290            deducted = before.saturating_sub(*max_bytes);
1291        }
1292
1293        // Bubble up the deduction (even if request continues)
1294        if deducted > 0 {
1295            self.queued_events
1296                .push_back(NetworkEvent::BytesConsumed(deducted));
1297        }
1298
1299        // block all network request moving forward.
1300        if request_failed && self.max_bytes_allowed.is_some() {
1301            self.set_block_all(true);
1302        }
1303
1304        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1305            request.set_response(event.response.clone());
1306            self.queued_events.push_back(if request_failed {
1307                NetworkEvent::RequestFailed(request)
1308            } else {
1309                NetworkEvent::RequestFinished(request)
1310            });
1311        }
1312    }
1313
1314    /// On network loading finished.
1315    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1316        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1317            if let Some(interception_id) = request.interception_id.as_ref() {
1318                self.attempted_authentications
1319                    .remove(interception_id.as_ref());
1320            }
1321            self.queued_events
1322                .push_back(NetworkEvent::RequestFinished(request));
1323        }
1324    }
1325
1326    /// On network loading failed.
1327    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1328        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1329            request.failure_text = Some(event.error_text.clone());
1330            if let Some(interception_id) = request.interception_id.as_ref() {
1331                self.attempted_authentications
1332                    .remove(interception_id.as_ref());
1333            }
1334            self.queued_events
1335                .push_back(NetworkEvent::RequestFailed(request));
1336        }
1337    }
1338
1339    /// On request will be sent.
1340    fn on_request(
1341        &mut self,
1342        event: &EventRequestWillBeSent,
1343        interception_id: Option<InterceptionId>,
1344    ) {
1345        let mut redirect_chain = Vec::new();
1346        let mut redirect_location = None;
1347
1348        if let Some(redirect_resp) = &event.redirect_response {
1349            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1350                if is_redirect_status(redirect_resp.status) {
1351                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1352                        if redirect_resp.url != location {
1353                            let fixed_location = location.replace(&redirect_resp.url, "");
1354
1355                            if !fixed_location.is_empty() {
1356                                request.response.as_mut().map(|resp| {
1357                                    resp.headers.0["Location"] =
1358                                        serde_json::Value::String(fixed_location.clone());
1359                                });
1360                            }
1361
1362                            redirect_location = Some(fixed_location);
1363                        }
1364                    }
1365                }
1366
1367                self.handle_request_redirect(
1368                    &mut request,
1369                    if let Some(redirect_location) = redirect_location {
1370                        let mut redirect_resp = redirect_resp.clone();
1371
1372                        if !redirect_location.is_empty() {
1373                            redirect_resp.headers.0["Location"] =
1374                                serde_json::Value::String(redirect_location);
1375                        }
1376
1377                        redirect_resp
1378                    } else {
1379                        redirect_resp.clone()
1380                    },
1381                );
1382
1383                redirect_chain = std::mem::take(&mut request.redirect_chain);
1384                redirect_chain.push(request);
1385            }
1386        }
1387
1388        let request = HttpRequest::new(
1389            event.request_id.clone(),
1390            event.frame_id.clone(),
1391            interception_id,
1392            self.user_request_interception_enabled,
1393            redirect_chain,
1394        );
1395
1396        self.requests.insert(event.request_id.clone(), request);
1397        self.queued_events
1398            .push_back(NetworkEvent::Request(event.request_id.clone()));
1399    }
1400
1401    /// Handle request redirect.
1402    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1403        request.set_response(response);
1404        if let Some(interception_id) = request.interception_id.as_ref() {
1405            self.attempted_authentications
1406                .remove(interception_id.as_ref());
1407        }
1408    }
1409}
1410
1411#[derive(Debug)]
1412pub enum NetworkEvent {
1413    /// Send a CDP request.
1414    SendCdpRequest((MethodId, serde_json::Value)),
1415    /// Request.
1416    Request(RequestId),
1417    /// Response
1418    Response(RequestId),
1419    /// Request failed.
1420    RequestFailed(HttpRequest),
1421    /// Request finished.
1422    RequestFinished(HttpRequest),
1423    /// Bytes consumed.
1424    BytesConsumed(u64),
1425}
1426
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// Page used by the script policy tests.
    const FORUM_PAGE: &str = "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085";
    /// Page used by the blacklist / whitelist tests.
    const EXAMPLE_PAGE: &str = "https://example.com/";
    /// URL matched by the `beacon.min.js` pattern in the list tests.
    const BEACON_URL: &str = "https://static.cloudflareinsights.com/beacon.min.js";

    /// Build a manager the same way every test does and point it at `page_url`.
    fn manager_for(page_url: &str) -> NetworkManager {
        let mut manager = NetworkManager::new(false, Duration::from_secs(30));
        manager.set_page_url(page_url.to_string());
        manager
    }

    /// Assertions shared by both 3rd-party allow-list tests.
    fn assert_third_party_allow_list() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // Sanity checks for existing allow patterns.
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    #[test]
    fn test_allowed_matcher_3rd_party() {
        assert_third_party_allow_list();
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        // A random script that should not match the block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !manager_for(FORUM_PAGE).should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            manager_for(FORUM_PAGE).should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    #[test]
    fn test_allowed_matcher_3rd_party_sanity() {
        assert_third_party_allow_list();
    }

    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = manager_for(EXAMPLE_PAGE);
        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);

        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = manager_for(EXAMPLE_PAGE);

        // Same pattern in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);
        nm.set_blacklist_strict(true);

        assert!(nm.is_whitelisted(BEACON_URL));
        assert!(nm.is_blacklisted(BEACON_URL));

        // In strict mode the URL should still be blocked at decision time.
        // Only the matchers and the flag can be asserted directly here; the
        // decision logic itself is exercised in integration.
        assert!(nm.blacklist_strict);
    }

    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = manager_for(EXAMPLE_PAGE);

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);
        nm.set_blacklist_strict(false);

        assert!(nm.is_blacklisted(BEACON_URL));
        assert!(nm.is_whitelisted(BEACON_URL));
        assert!(!nm.blacklist_strict);
    }
}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs