chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr,
3    ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
4    xhr::IGNORE_XHR_ASSETS,
5};
6#[cfg(any(feature = "adblock", feature = "firewall"))]
7use super::blockers::block_websites::block_ads;
8use crate::auth::Credentials;
9#[cfg(feature = "_cache")]
10use crate::cache::BasicCachePolicy;
11use crate::cmd::CommandChain;
12use crate::handler::http::HttpRequest;
13use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
14use aho_corasick::AhoCorasick;
15use case_insensitive_string::CaseInsensitiveString;
16use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
17use chromiumoxide_cdp::cdp::browser_protocol::network::{
18    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
19    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
20    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
21    SetExtraHttpHeadersParams,
22};
23use chromiumoxide_cdp::cdp::browser_protocol::{
24    fetch::{
25        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
26        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
27    },
28    network::SetBypassServiceWorkerParams,
29};
30use chromiumoxide_cdp::cdp::browser_protocol::{
31    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
32};
33use chromiumoxide_types::{Command, Method, MethodId};
34use hashbrown::{HashMap, HashSet};
35use lazy_static::lazy_static;
36use reqwest::header::PROXY_AUTHORIZATION;
37use spider_network_blocker::intercept_manager::NetworkInterceptManager;
38pub use spider_network_blocker::scripts::{
39    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
40};
41use std::borrow::Cow;
42use std::collections::VecDeque;
43use std::time::Duration;
44
lazy_static! {
    /// Substring patterns for popular JS libraries, common bundle names and verified
    /// third-party script hosts. Compiled into [`ALLOWED_MATCHER`].
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",  // Covers WordPress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/",
        "https://google.com/recaptcha/api.js",
        "https://www.gstatic.com/recaptcha/",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://api.leminnow.com/captcha/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://client-api.arkoselabs.com/",
        "https://www.capy.me/puzzle/",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];

    /// Determine if a script should be rendered in the browser by name.
    ///
    /// Built from `JS_FRAMEWORK_ALLOW`; panics on first access if the matcher cannot be built.
    ///
    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
    /// but we keep it for compatibility and other call sites.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// Third-party script URL patterns (captcha, bot-challenge, payment and CDN hosts).
    /// Compiled into [`ALLOWED_MATCHER_3RD_PARTY`].
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.capy.me/puzzle/",
        "https://www.gstatic.com/recaptcha/",
        "https://google.com/recaptcha/",
        "https://www.google.com/recaptcha/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://newassets.hcaptcha.com/",
        // NOTE(review): duplicate of the "https://captcha.px-cloud.net/" entry earlier in this list.
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://api.leminnow.com/captcha/",
        "https://client-api.arkoselabs.com/",
        // NOTE(review): already covered by the broader "https://static.geetest.com/" entry below.
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "discourse-cdn.com/",
        "hcaptcha.com",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];

    /// Substring matcher built from `JS_FRAMEWORK_ALLOW_3RD_PARTY`.
    ///
    /// Panics on first access if the matcher cannot be built.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path fragments that identify a JS framework asset directory.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
            "_astro/", "_app/immutable"
        }
    };

    /// Content types (MIME strings) to ignore: archives, documents, images, video and audio.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names to ignore for visual content.
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names to ignore for networking-only traffic (reports and pings).
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Ping",
    };

    /// Case-insensitive CSS extension used for matching.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// The default init command chain: the serialized network `EnableParams` command.
    ///
    /// The chain is empty if the params fail to serialize.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// The init command chain that also ignores certificate errors.
    ///
    /// Same as `INIT_CHAIN` followed by `SetIgnoreCertificateErrorsParams::new(true)`;
    /// a command that fails to serialize is skipped.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// Enable the fetch intercept command.
    ///
    /// Auth requests are handled, and every URL (`"*"`) is paused at the request stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
        .build()
    };
}
230
/// Report whether an HTTP status code is one of the redirect codes handled here.
///
/// Only 301, 302, 303, 307 and 308 count; other 3xx codes (300, 304, ...) do not.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    match status {
        301..=303 | 307 | 308 => true,
        _ => false,
    }
}
235
#[derive(Debug)]
/// The base network manager.
///
/// Holds the queue of outgoing `NetworkEvent`s, per-request bookkeeping, and the
/// blocking / interception configuration for a page.
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    // TODO put event in an Arc?
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; `update_protocol_request_interception()` combines
    /// this flag with `credentials` to decide whether to queue `Fetch.enable` / `Fetch.disable`.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is considered enabled in CDP.
    ///
    /// `update_protocol_request_interception()` compares the desired state against this flag
    /// to decide whether a `Fetch.enable` / `Fetch.disable` command needs to be queued.
    ///
    /// NOTE(review): in the code reviewed here the flag is only written by
    /// `enable_request_intercept()`, `disable_request_intercept()` and `authenticate()`;
    /// `update_protocol_request_interception()` reads it but does not update it.
    pub(crate) protocol_request_interception_enabled: bool,
    /// The network is offline.
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    // made_request: bool,
    /// Ignore visuals (no pings, prefetching, etc.).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics from rendering.
    pub block_analytics: bool,
    /// Block pre-fetch requests.
    pub block_prefetch: bool,
    /// Restrict loading to HTML only.
    pub only_html: bool,
    /// Is xml document?
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the number of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    #[cfg(feature = "_cache")]
    /// The cache site_key to use.
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
    whitelist_matcher: Option<AhoCorasick>,
    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
    blacklist_patterns: Vec<String>,
    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
    blacklist_matcher: Option<AhoCorasick>,
    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
    blacklist_strict: bool,
}
360
361impl NetworkManager {
362    /// A new network manager.
363    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
364        Self {
365            queued_events: Default::default(),
366            ignore_httpserrors,
367            requests: Default::default(),
368            requests_will_be_sent: Default::default(),
369            extra_headers: Default::default(),
370            request_id_to_interception_id: Default::default(),
371            user_cache_disabled: false,
372            attempted_authentications: Default::default(),
373            credentials: None,
374            block_all: false,
375            user_request_interception_enabled: false,
376            protocol_request_interception_enabled: false,
377            offline: false,
378            request_timeout,
379            ignore_visuals: false,
380            block_javascript: false,
381            block_stylesheets: false,
382            block_prefetch: true,
383            block_analytics: true,
384            only_html: false,
385            xml_document: false,
386            intercept_manager: NetworkInterceptManager::Unknown,
387            document_reload_tracker: 0,
388            document_target_url: String::new(),
389            document_target_domain: String::new(),
390            whitelist_patterns: Vec::new(),
391            whitelist_matcher: None,
392            blacklist_patterns: Vec::new(),
393            blacklist_matcher: None,
394            blacklist_strict: true,
395            max_bytes_allowed: None,
396            #[cfg(feature = "_cache")]
397            cache_site_key: None,
398            #[cfg(feature = "_cache")]
399            cache_policy: None,
400        }
401    }
402
403    /// Replace the whitelist patterns (compiled once).
404    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
405    where
406        I: IntoIterator<Item = S>,
407        S: Into<String>,
408    {
409        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
410        self.rebuild_whitelist_matcher();
411    }
412
413    /// Replace the blacklist patterns (compiled once).
414    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
415    where
416        I: IntoIterator<Item = S>,
417        S: Into<String>,
418    {
419        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
420        self.rebuild_blacklist_matcher();
421    }
422
423    /// Add one pattern (cheap) and rebuild (call this sparingly).
424    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
425        self.blacklist_patterns.push(pattern.into());
426        self.rebuild_blacklist_matcher();
427    }
428
429    /// Add many patterns and rebuild once.
430    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
431    where
432        I: IntoIterator<Item = S>,
433        S: Into<String>,
434    {
435        self.blacklist_patterns
436            .extend(patterns.into_iter().map(Into::into));
437        self.rebuild_blacklist_matcher();
438    }
439
440    /// Clear blacklist entirely.
441    pub fn clear_blacklist(&mut self) {
442        self.blacklist_patterns.clear();
443        self.blacklist_matcher = None;
444    }
445
    /// Control precedence: when true, blacklist always wins.
    ///
    /// Only stores the flag; the matcher is not rebuilt.
    pub fn set_blacklist_strict(&mut self, strict: bool) {
        self.blacklist_strict = strict;
    }
450
451    #[inline]
452    fn rebuild_blacklist_matcher(&mut self) {
453        if self.blacklist_patterns.is_empty() {
454            self.blacklist_matcher = None;
455            return;
456        }
457
458        let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
459        self.blacklist_matcher = AhoCorasick::new(refs).ok();
460    }
461
462    #[inline]
463    fn is_blacklisted(&self, url: &str) -> bool {
464        self.blacklist_matcher
465            .as_ref()
466            .map(|m| m.is_match(url))
467            .unwrap_or(false)
468    }
469
470    /// Add one pattern (cheap) and rebuild (call this sparingly).
471    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
472        self.whitelist_patterns.push(pattern.into());
473        self.rebuild_whitelist_matcher();
474    }
475
476    /// Add many patterns and rebuild once.
477    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
478    where
479        I: IntoIterator<Item = S>,
480        S: Into<String>,
481    {
482        self.whitelist_patterns
483            .extend(patterns.into_iter().map(Into::into));
484        self.rebuild_whitelist_matcher();
485    }
486
487    #[inline]
488    fn rebuild_whitelist_matcher(&mut self) {
489        if self.whitelist_patterns.is_empty() {
490            self.whitelist_matcher = None;
491            return;
492        }
493
494        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
495
496        // If building fails (shouldn’t for simple patterns), just disable matcher.
497        self.whitelist_matcher = AhoCorasick::new(refs).ok();
498    }
499
500    #[inline]
501    fn is_whitelisted(&self, url: &str) -> bool {
502        self.whitelist_matcher
503            .as_ref()
504            .map(|m| m.is_match(url))
505            .unwrap_or(false)
506    }
507
508    /// Commands to init the chain with.
509    pub fn init_commands(&self) -> CommandChain {
510        let cmds = if self.ignore_httpserrors {
511            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
512        } else {
513            INIT_CHAIN.clone()
514        };
515        CommandChain::new(cmds, self.request_timeout)
516    }
517
518    /// Push the CDP request.
519    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
520        let method = cmd.identifier();
521        if let Ok(params) = serde_json::to_value(cmd) {
522            self.queued_events
523                .push_back(NetworkEvent::SendCdpRequest((method, params)));
524        }
525    }
526
    /// The next event to handle.
    ///
    /// Pops the oldest queued `NetworkEvent`, or returns `None` when the queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
531
    /// Get the extra headers.
    ///
    /// Returns the map as stored by `set_extra_headers`.
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
536
537    /// Set extra HTTP headers.
538    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
539        self.extra_headers = headers;
540        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
541        self.extra_headers.remove("Proxy-Authorization");
542        if !self.extra_headers.is_empty() {
543            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
544                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
545            }
546        }
547    }
548
549    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
550        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
551    }
552
    /// Set the `block_all` kill-switch flag.
    ///
    /// Only stores the flag; no CDP command is queued here.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
556
    /// Record whether the user wants request interception, then reconcile the protocol state.
    ///
    /// Stores the flag and calls `update_protocol_request_interception()`, which may queue a
    /// Fetch enable/disable command.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        self.update_protocol_request_interception();
    }
561
562    pub fn set_cache_enabled(&mut self, enabled: bool) {
563        let run = self.user_cache_disabled != !enabled;
564        self.user_cache_disabled = !enabled;
565        if run {
566            self.update_protocol_cache_disabled();
567        }
568    }
569
    /// Enable fetch interception.
    ///
    /// Only sets `protocol_request_interception_enabled` to `true`; no CDP command is
    /// queued by this method.
    pub fn enable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
574
    /// Disable fetch interception.
    ///
    /// Only sets `protocol_request_interception_enabled` to `false`; no CDP command is
    /// queued by this method.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = false;
    }
579
    /// Set the cache site key.
    ///
    /// Passing `None` clears any previously stored key.
    #[cfg(feature = "_cache")]
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
585
    /// Set the cache policy.
    ///
    /// Passing `None` clears any previously stored policy.
    #[cfg(feature = "_cache")]
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
591
592    pub fn update_protocol_cache_disabled(&mut self) {
593        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
594    }
595
    /// Store credentials for auth challenges and make sure interception is enabled.
    ///
    /// Statement order matters: `update_protocol_request_interception()` compares the desired
    /// state against `protocol_request_interception_enabled`, so the flag is set to `true`
    /// only after that call, which lets the Fetch enable command be queued if needed.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        self.update_protocol_request_interception();
        self.protocol_request_interception_enabled = true;
    }
601
602    fn update_protocol_request_interception(&mut self) {
603        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
604
605        if enabled == self.protocol_request_interception_enabled {
606            return;
607        }
608
609        if enabled {
610            self.push_cdp_request(ENABLE_FETCH.clone())
611        } else {
612            self.push_cdp_request(DisableParams::default())
613        }
614    }
615
616    /// Blocklist-only script blocking.
617    /// Returns true only when the URL matches an explicit blocklist condition.
618    #[inline]
619    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
620        // If analytics blocking is off, skip all analytics tries.
621        let block_analytics = self.block_analytics;
622
623        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
624        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
625        {
626            return true;
627        }
628
629        // 2) Custom website block list (explicit).
630        if crate::handler::blockers::block_websites::block_website(url) {
631            return true;
632        }
633
634        // 3) Path-based explicit tries / fallbacks.
635        //
636        // We run these on:
637        // - path with leading slash ("/js/app.js")
638        // - path without leading slash ("js/app.js")
639        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
640        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
641            // Remove query/fragment so matching stays stable.
642            let p_slash = Self::strip_query_fragment(path_with_slash);
643            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
644
645            // Basename for filename-only lists.
646            let base = match p_slash.rsplit('/').next() {
647                Some(b) => b,
648                None => p_slash,
649            };
650
651            // ---- Trie checks ----
652            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
653            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
654                return true;
655            }
656            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
657                return true;
658            }
659            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
660                return true;
661            }
662
663            // Base-path ignore tries (framework noise / known ignorable script paths).
664            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
665            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
666                return true;
667            }
668
669            // Style path ignores only when visuals are ignored.
670            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
671                return true;
672            }
673        }
674
675        false
676    }
677
678    /// Extract the absolute URL path portion WITH the leading slash.
679    ///
680    /// Example:
681    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
682    #[inline]
683    fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
684        // find scheme separator
685        let idx = url.find("//")?;
686        let after_slashes = idx + 2;
687
688        // find first slash after host
689        let slash_rel = url[after_slashes..].find('/')?;
690        let slash_idx = after_slashes + slash_rel;
691
692        if slash_idx < url.len() {
693            Some(&url[slash_idx..])
694        } else {
695            None
696        }
697    }
698
699    /// Strip query string and fragment from a path-ish string.
700    ///
701    /// Example:
702    /// - "/a/b.js?x=1#y" -> "/a/b.js"
703    #[inline]
704    fn strip_query_fragment(s: &str) -> &str {
705        let q = s.find('?');
706        let h = s.find('#');
707
708        match (q, h) {
709            (None, None) => s,
710            (Some(i), None) => &s[..i],
711            (None, Some(i)) => &s[..i],
712            (Some(i), Some(j)) => &s[..i.min(j)],
713        }
714    }
715
716    /// Determine if the request should be skipped.
717    #[inline]
718    fn skip_xhr(
719        &self,
720        skip_networking: bool,
721        event: &EventRequestPaused,
722        network_event: bool,
723    ) -> bool {
724        // XHR check
725        if !skip_networking && network_event {
726            let request_url = event.request.url.as_str();
727
728            // check if part of ignore scripts.
729            let skip_analytics =
730                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
731
732            if skip_analytics {
733                true
734            } else if self.block_stylesheets || self.ignore_visuals {
735                let block_css = self.block_stylesheets;
736                let block_media = self.ignore_visuals;
737
738                let mut block_request = false;
739
740                if let Some(position) = request_url.rfind('.') {
741                    let hlen = request_url.len();
742                    let has_asset = hlen - position;
743
744                    if has_asset >= 3 {
745                        let next_position = position + 1;
746
747                        if block_media
748                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
749                                &request_url[next_position..].into(),
750                            )
751                        {
752                            block_request = true;
753                        } else if block_css {
754                            block_request =
755                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
756                                    .contains(&**CSS_EXTENSION)
757                        }
758                    }
759                }
760
761                if !block_request {
762                    block_request = ignore_script_xhr_media(request_url);
763                }
764
765                block_request
766            } else {
767                skip_networking
768            }
769        } else {
770            skip_networking
771        }
772    }
773
774    #[cfg(feature = "adblock")]
775    #[inline]
776    /// Detect if ad enabled.
777    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
778        if skip_networking {
779            true
780        } else {
781            block_ads(&event.request.url) || self.detect_ad(event)
782        }
783    }
784
785    /// When adblock feature is disabled, this is a no-op.
786    #[cfg(not(feature = "adblock"))]
787    #[inline]
788    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
789        use crate::handler::blockers::block_websites::block_ads;
790        if skip_networking {
791            true
792        } else {
793            block_ads(&event.request.url)
794        }
795    }
796
797    #[inline]
798    /// Fail request
799    fn fail_request_blocked(
800        &mut self,
801        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
802    ) {
803        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
804            request_id.clone(),
805            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
806        );
807        self.push_cdp_request(params);
808    }
809
810    #[inline]
811    /// Fulfill request
812    fn fulfill_request_empty_200(
813        &mut self,
814        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
815    ) {
816        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
817            request_id.clone(),
818            200,
819        );
820        self.push_cdp_request(params);
821    }
822
823    #[cfg(feature = "_cache")]
824    #[inline]
825    /// Fulfill a paused Fetch request from cached bytes + header map.
826    ///
827    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
828    fn fulfill_request_from_cache(
829        &mut self,
830        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
831        body: &[u8],
832        headers: &std::collections::HashMap<String, String>,
833        status: i64,
834    ) {
835        use crate::cdp::browser_protocol::fetch::HeaderEntry;
836        use crate::handler::network::fetch::FulfillRequestParams;
837        use base64::Engine;
838
839        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
840
841        for (k, v) in headers.iter() {
842            resp_headers.push(HeaderEntry {
843                name: k.clone().into(),
844                value: v.clone().into(),
845            });
846        }
847
848        let mut params = FulfillRequestParams::new(request_id.clone(), status);
849
850        // TODO: have this already encoded prior.
851        params.body = Some(
852            base64::engine::general_purpose::STANDARD
853                .encode(body)
854                .into(),
855        );
856
857        params.response_headers = Some(resp_headers);
858
859        self.push_cdp_request(params);
860    }
861
862    #[inline]
863    /// Continue the request url.
864    fn continue_request_with_url(
865        &mut self,
866        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
867        url: Option<&str>,
868        intercept_response: bool,
869    ) {
870        let mut params = ContinueRequestParams::new(request_id.clone());
871        if let Some(url) = url {
872            params.url = Some(url.to_string());
873            params.intercept_response = Some(intercept_response);
874        }
875        self.push_cdp_request(params);
876    }
877
878    /// On fetch request paused interception.
879    #[inline]
880    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
881        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
882            return;
883        }
884
885        if self.block_all {
886            tracing::debug!(
887                "Blocked (block_all): {:?} - {}",
888                event.resource_type,
889                event.request.url
890            );
891            return self.fail_request_blocked(&event.request_id);
892        }
893
894        if let Some(network_id) = event.network_id.as_ref() {
895            if let Some(request_will_be_sent) =
896                self.requests_will_be_sent.remove(network_id.as_ref())
897            {
898                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
899            } else {
900                self.request_id_to_interception_id
901                    .insert(network_id.clone(), event.request_id.clone().into());
902            }
903        }
904
905        // From here on, we handle the full decision tree.
906        let javascript_resource = event.resource_type == ResourceType::Script;
907        let document_resource = event.resource_type == ResourceType::Document;
908        let network_resource =
909            !document_resource && crate::utils::is_data_resource(&event.resource_type);
910
911        // Start with static / cheap skip checks.
912        let mut skip_networking =
913            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
914
915        if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
916            skip_networking = true;
917        }
918
919        // Also short-circuit if we've reloaded this document too many times.
920        if !skip_networking {
921            skip_networking = self.document_reload_tracker >= 3;
922        }
923
924        // Handle document redirect / masking and track xml documents.
925        let (current_url_cow, had_replacer) =
926            self.handle_document_replacement_and_tracking(event, document_resource);
927
928        let current_url: &str = current_url_cow.as_ref();
929
930        let blacklisted = self.is_blacklisted(current_url);
931
932        if !self.blacklist_strict && blacklisted {
933            skip_networking = true;
934        }
935
936        if !skip_networking {
937            // Allow XSL for sitemap XML.
938            if self.xml_document && current_url.ends_with(".xsl") {
939                skip_networking = false;
940            } else {
941                skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
942            }
943        }
944
945        skip_networking = self.detect_ad_if_enabled(event, skip_networking);
946
947        // Ignore embedded scripts when only_html or ignore_visuals is set.
948        if !skip_networking
949            && self.block_javascript
950            && (self.only_html || self.ignore_visuals)
951            && (javascript_resource || document_resource)
952        {
953            skip_networking = ignore_script_embedded(current_url);
954        }
955
956        // Script policy: allow-by-default.
957        // Block only if explicit block list patterns match.
958        if !skip_networking && javascript_resource {
959            skip_networking = self.should_block_script_blocklist_only(current_url);
960        }
961
962        // XHR / data resources.
963        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
964
965        // Custom interception layer.
966        if !skip_networking && (javascript_resource || network_resource || document_resource) {
967            skip_networking = self.intercept_manager.intercept_detection(
968                current_url,
969                self.ignore_visuals,
970                network_resource,
971            );
972        }
973
974        // Custom website block list.
975        if !skip_networking && (javascript_resource || network_resource) {
976            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
977        }
978
979        // whitelist 3rd party
980        // not required unless explicit blocking.
981        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
982        {
983            skip_networking = false;
984        }
985
986        // check if the url is in the whitelist.
987        if skip_networking && self.is_whitelisted(current_url) {
988            skip_networking = false;
989        }
990
991        if self.blacklist_strict && blacklisted {
992            skip_networking = true;
993        }
994
995        if skip_networking {
996            tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
997            self.fulfill_request_empty_200(&event.request_id);
998        } else {
999            #[cfg(feature = "_cache")]
1000            {
1001                if let (Some(policy), Some(cache_site_key)) =
1002                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
1003                {
1004                    let current_url = format!("{}:{}", event.request.method, &current_url);
1005
1006                    if let Some((res, cache_policy)) =
1007                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
1008                    {
1009                        if policy.allows_cached(&cache_policy) {
1010                            tracing::debug!(
1011                                "Remote Cached: {:?} - {}",
1012                                &event.resource_type,
1013                                &current_url
1014                            );
1015                            return self.fulfill_request_from_cache(
1016                                &event.request_id,
1017                                &res.body,
1018                                &res.headers,
1019                                res.status as i64,
1020                            );
1021                        }
1022                    }
1023                }
1024            }
1025
1026            // check our frame cache for the run.
1027            tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
1028            self.continue_request_with_url(
1029                &event.request_id,
1030                if had_replacer {
1031                    Some(current_url)
1032                } else {
1033                    None
1034                },
1035                !had_replacer,
1036            );
1037        }
1038    }
1039
1040    /// Shared "visuals + basic blocking" logic.
1041    ///
1042    /// IMPORTANT: Scripts are NOT blocked here anymore.
1043    /// Scripts are allowed by default and only blocked via explicit blocklists
1044    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1045    #[inline]
1046    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1047        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1048            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1049    }
1050
1051    /// Does the network manager have a target domain?
1052    pub fn has_target_domain(&self) -> bool {
1053        !self.document_target_url.is_empty()
1054    }
1055
1056    /// Set the target page url for tracking.
1057    pub fn set_page_url(&mut self, page_target_url: String) {
1058        let host_base = host_and_rest(&page_target_url)
1059            .map(|(h, _)| base_domain_from_host(h))
1060            .unwrap_or("");
1061
1062        self.document_target_domain = host_base.to_string();
1063        self.document_target_url = page_target_url;
1064    }
1065
1066    /// Clear the initial target domain on every navigation.
1067    pub fn clear_target_domain(&mut self) {
1068        self.document_reload_tracker = 0;
1069        self.document_target_url = Default::default();
1070        self.document_target_domain = Default::default();
1071    }
1072
1073    /// Handles:
1074    /// - document reload tracking (`document_reload_tracker`)
1075    /// - redirect masking / replacement
1076    /// - xml document detection (`xml_document`)
1077    /// - `document_target_url` updates
1078    ///
1079    /// Returns (current_url, had_replacer).
1080    #[inline]
1081    fn handle_document_replacement_and_tracking<'a>(
1082        &mut self,
1083        event: &'a EventRequestPaused,
1084        document_resource: bool,
1085    ) -> (Cow<'a, str>, bool) {
1086        let mut replacer: Option<String> = None;
1087        let current_url = event.request.url.as_str();
1088
1089        if document_resource {
1090            if self.document_target_url == current_url {
1091                self.document_reload_tracker += 1;
1092            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1093            {
1094                let (http_document_replacement, mut https_document_replacement) =
1095                    if self.document_target_url.starts_with("http://") {
1096                        (
1097                            self.document_target_url.replacen("http://", "http//", 1),
1098                            self.document_target_url.replacen("http://", "https://", 1),
1099                        )
1100                    } else {
1101                        (
1102                            self.document_target_url.replacen("https://", "https//", 1),
1103                            self.document_target_url.replacen("https://", "http://", 1),
1104                        )
1105                    };
1106
1107                // Track trailing slash to restore later.
1108                let trailing = https_document_replacement.ends_with('/');
1109                if trailing {
1110                    https_document_replacement.pop();
1111                }
1112                if https_document_replacement.ends_with('/') {
1113                    https_document_replacement.pop();
1114                }
1115
1116                let redirect_mask = format!(
1117                    "{}{}",
1118                    https_document_replacement, http_document_replacement
1119                );
1120
1121                if current_url == redirect_mask {
1122                    replacer = Some(if trailing {
1123                        format!("{}/", https_document_replacement)
1124                    } else {
1125                        https_document_replacement
1126                    });
1127                }
1128            }
1129
1130            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1131                self.xml_document = true;
1132            }
1133
1134            // Track last seen document URL.
1135            self.document_target_url = event.request.url.clone();
1136            self.document_target_domain = host_and_rest(&self.document_target_url)
1137                .map(|(h, _)| base_domain_from_host(h).to_string())
1138                .unwrap_or_default();
1139        }
1140
1141        let current_url_cow = match replacer {
1142            Some(r) => Cow::Owned(r),
1143            None => Cow::Borrowed(event.request.url.as_str()),
1144        };
1145
1146        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1147        (current_url_cow, had_replacer)
1148    }
1149
1150    /// Perform a page intercept for chrome
1151    #[cfg(feature = "adblock")]
1152    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1153        use adblock::{
1154            lists::{FilterSet, ParseOptions, RuleTypes},
1155            Engine,
1156        };
1157
1158        lazy_static::lazy_static! {
1159            static ref AD_ENGINE: Engine = {
1160                let mut filter_set = FilterSet::new(false);
1161                let mut rules = ParseOptions::default();
1162                rules.rule_types = RuleTypes::All;
1163
1164                filter_set.add_filters(
1165                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1166                    rules,
1167                );
1168
1169                Engine::from_filter_set(filter_set, true)
1170            };
1171        };
1172
1173        let blockable = ResourceType::Image == event.resource_type
1174            || event.resource_type == ResourceType::Media
1175            || event.resource_type == ResourceType::Stylesheet
1176            || event.resource_type == ResourceType::Document
1177            || event.resource_type == ResourceType::Fetch
1178            || event.resource_type == ResourceType::Xhr;
1179
1180        let u = &event.request.url;
1181
1182        let block_request = blockable
1183            // set it to example.com for 3rd party handling is_same_site
1184        && {
1185            let request = adblock::request::Request::preparsed(
1186                 &u,
1187                 "example.com",
1188                 "example.com",
1189                 &event.resource_type.as_ref().to_lowercase(),
1190                 !event.request.is_same_site.unwrap_or_default());
1191
1192            AD_ENGINE.check_network_request(&request).matched
1193        };
1194
1195        block_request
1196    }
1197
1198    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1199        let response = if self
1200            .attempted_authentications
1201            .contains(event.request_id.as_ref())
1202        {
1203            AuthChallengeResponseResponse::CancelAuth
1204        } else if self.credentials.is_some() {
1205            self.attempted_authentications
1206                .insert(event.request_id.clone().into());
1207            AuthChallengeResponseResponse::ProvideCredentials
1208        } else {
1209            AuthChallengeResponseResponse::Default
1210        };
1211
1212        let mut auth = AuthChallengeResponse::new(response);
1213        if let Some(creds) = self.credentials.clone() {
1214            auth.username = Some(creds.username);
1215            auth.password = Some(creds.password);
1216        }
1217        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1218    }
1219
1220    /// Set the page offline network emulation condition.
1221    pub fn set_offline_mode(&mut self, value: bool) {
1222        if self.offline == value {
1223            return;
1224        }
1225        self.offline = value;
1226        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1227            .offline(self.offline)
1228            .latency(0)
1229            .download_throughput(-1.)
1230            .upload_throughput(-1.)
1231            .build()
1232        {
1233            self.push_cdp_request(network);
1234        }
1235    }
1236
1237    /// Request interception doesn't happen for data URLs with Network Service.
1238    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1239        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1240            if let Some(interception_id) = self
1241                .request_id_to_interception_id
1242                .remove(event.request_id.as_ref())
1243            {
1244                self.on_request(event, Some(interception_id));
1245            } else {
1246                // TODO remove the clone for event
1247                self.requests_will_be_sent
1248                    .insert(event.request_id.clone(), event.clone());
1249            }
1250        } else {
1251            self.on_request(event, None);
1252        }
1253    }
1254
1255    /// The request was served from the cache.
1256    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1257        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1258            request.from_memory_cache = true;
1259        }
1260    }
1261
1262    /// On network response received.
1263    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1264        let mut request_failed = false;
1265
1266        // Track how many bytes we actually deducted from this target.
1267        let mut deducted: u64 = 0;
1268
1269        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1270            let before = *max_bytes;
1271
1272            // encoded_data_length -> saturating cast to u64
1273            let received_bytes: u64 = event.response.encoded_data_length as u64;
1274
1275            // Safe parse of Content-Length
1276            let content_length: Option<u64> = event
1277                .response
1278                .headers
1279                .inner()
1280                .get("content-length")
1281                .and_then(|v| v.as_str())
1282                .and_then(|s| s.trim().parse::<u64>().ok());
1283
1284            // Deduct what we actually received
1285            *max_bytes = max_bytes.saturating_sub(received_bytes);
1286
1287            // If the declared size can't fit, zero out now
1288            if let Some(cl) = content_length {
1289                if cl > *max_bytes {
1290                    *max_bytes = 0;
1291                }
1292            }
1293
1294            request_failed = *max_bytes == 0;
1295
1296            // Compute exact delta deducted on this event
1297            deducted = before.saturating_sub(*max_bytes);
1298        }
1299
1300        // Bubble up the deduction (even if request continues)
1301        if deducted > 0 {
1302            self.queued_events
1303                .push_back(NetworkEvent::BytesConsumed(deducted));
1304        }
1305
1306        // block all network request moving forward.
1307        if request_failed && self.max_bytes_allowed.is_some() {
1308            self.set_block_all(true);
1309        }
1310
1311        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1312            request.set_response(event.response.clone());
1313            self.queued_events.push_back(if request_failed {
1314                NetworkEvent::RequestFailed(request)
1315            } else {
1316                NetworkEvent::RequestFinished(request)
1317            });
1318        }
1319    }
1320
1321    /// On network loading finished.
1322    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1323        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1324            if let Some(interception_id) = request.interception_id.as_ref() {
1325                self.attempted_authentications
1326                    .remove(interception_id.as_ref());
1327            }
1328            self.queued_events
1329                .push_back(NetworkEvent::RequestFinished(request));
1330        }
1331    }
1332
1333    /// On network loading failed.
1334    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1335        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1336            request.failure_text = Some(event.error_text.clone());
1337            if let Some(interception_id) = request.interception_id.as_ref() {
1338                self.attempted_authentications
1339                    .remove(interception_id.as_ref());
1340            }
1341            self.queued_events
1342                .push_back(NetworkEvent::RequestFailed(request));
1343        }
1344    }
1345
1346    /// On request will be sent.
1347    fn on_request(
1348        &mut self,
1349        event: &EventRequestWillBeSent,
1350        interception_id: Option<InterceptionId>,
1351    ) {
1352        let mut redirect_chain = Vec::new();
1353        let mut redirect_location = None;
1354
1355        if let Some(redirect_resp) = &event.redirect_response {
1356            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1357                if is_redirect_status(redirect_resp.status) {
1358                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1359                        if redirect_resp.url != location {
1360                            let fixed_location = location.replace(&redirect_resp.url, "");
1361
1362                            if !fixed_location.is_empty() {
1363                                request.response.as_mut().map(|resp| {
1364                                    resp.headers.0["Location"] =
1365                                        serde_json::Value::String(fixed_location.clone());
1366                                });
1367                            }
1368
1369                            redirect_location = Some(fixed_location);
1370                        }
1371                    }
1372                }
1373
1374                self.handle_request_redirect(
1375                    &mut request,
1376                    if let Some(redirect_location) = redirect_location {
1377                        let mut redirect_resp = redirect_resp.clone();
1378
1379                        if !redirect_location.is_empty() {
1380                            redirect_resp.headers.0["Location"] =
1381                                serde_json::Value::String(redirect_location);
1382                        }
1383
1384                        redirect_resp
1385                    } else {
1386                        redirect_resp.clone()
1387                    },
1388                );
1389
1390                redirect_chain = std::mem::take(&mut request.redirect_chain);
1391                redirect_chain.push(request);
1392            }
1393        }
1394
1395        let request = HttpRequest::new(
1396            event.request_id.clone(),
1397            event.frame_id.clone(),
1398            interception_id,
1399            self.user_request_interception_enabled,
1400            redirect_chain,
1401        );
1402
1403        self.requests.insert(event.request_id.clone(), request);
1404        self.queued_events
1405            .push_back(NetworkEvent::Request(event.request_id.clone()));
1406    }
1407
1408    /// Handle request redirect.
1409    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1410        request.set_response(response);
1411        if let Some(interception_id) = request.interception_id.as_ref() {
1412            self.attempted_authentications
1413                .remove(interception_id.as_ref());
1414        }
1415    }
1416}
1417
/// Events produced by the network manager.
///
/// The handlers above push these onto `queued_events`.
#[derive(Debug)]
pub enum NetworkEvent {
    /// Send a CDP request (method id plus its JSON parameters).
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request was registered under this id.
    Request(RequestId),
    /// Response
    Response(RequestId),
    /// Request failed (loading failed, or the byte budget was exhausted).
    RequestFailed(HttpRequest),
    /// Request finished.
    RequestFinished(HttpRequest),
    /// Bytes consumed: the amount deducted from the byte budget by one response.
    BytesConsumed(u64),
}
1433
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// The 3rd-party allow-list matches known-good scripts and nothing else.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// Scripts that match no block list are allowed.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // A random script that should not match your block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    /// Scripts that match the ignore trie / block list are blocked.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    /// Runtime blacklist patterns match the URLs that contain them.
    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    /// With strict mode on, a URL can be in both lists; the blacklist must win at decision time.
    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        // Same pattern in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        // In strict mode, it should still be considered blocked at decision time.
        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
        assert!(nm.blacklist_strict);
    }

    /// With strict mode off, both matchers still report the URL; the whitelist may override.
    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs