chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr,
3    ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
4    xhr::IGNORE_XHR_ASSETS,
5};
6#[cfg(any(feature = "adblock", feature = "firewall"))]
7use super::blockers::block_websites::block_ads;
8use crate::auth::Credentials;
9#[cfg(feature = "_cache")]
10use crate::cache::BasicCachePolicy;
11use crate::cmd::CommandChain;
12use crate::handler::http::HttpRequest;
13use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
14use aho_corasick::AhoCorasick;
15use case_insensitive_string::CaseInsensitiveString;
16use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
17use chromiumoxide_cdp::cdp::browser_protocol::network::{
18    EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
19    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
20    InterceptionId, NetworkConditions, RequestId, ResourceType, Response,
21    SetCacheDisabledParams, SetExtraHttpHeadersParams,
22};
23use chromiumoxide_cdp::cdp::browser_protocol::{
24    fetch::{
25        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
26        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
27    },
28    network::SetBypassServiceWorkerParams,
29};
30use chromiumoxide_cdp::cdp::browser_protocol::{
31    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
32};
33use chromiumoxide_types::{Command, Method, MethodId};
34use hashbrown::{HashMap, HashSet};
35use lazy_static::lazy_static;
36use reqwest::header::PROXY_AUTHORIZATION;
37use spider_network_blocker::intercept_manager::NetworkInterceptManager;
38pub use spider_network_blocker::scripts::{
39    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
40};
41use std::borrow::Cow;
42use std::collections::VecDeque;
43use std::time::Duration;
44
45lazy_static! {
46    /// General patterns for popular libraries and resources
47    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
48        "jquery",           // Covers jquery.min.js, jquery.js, etc.
49        "angular",
50        "react",            // Covers all React-related patterns
51        "vue",              // Covers all Vue-related patterns
52        "bootstrap",
53        "d3",
54        "lodash",
55        "ajax",
56        "application",
57        "app",              // Covers general app scripts like app.js
58        "main",
59        "index",
60        "bundle",
61        "vendor",
62        "runtime",
63        "polyfill",
64        "scripts",
65        "es2015.",
66        "es2020.",
67        "webpack",
68        "captcha",
69        "client",
70        "/cdn-cgi/challenge-platform/",
71        "/wp-content/js/",  // Covers Wordpress content
72        // Verified 3rd parties for request
73        "https://m.stripe.network/",
74        "https://challenges.cloudflare.com/",
75        "https://www.google.com/recaptcha/",
76        "https://google.com/recaptcha/api.js",
77        "https://www.gstatic.com/recaptcha/",
78        "https://captcha.px-cloud.net/",
79        "https://geo.captcha-delivery.com/",
80        "https://api.leminnow.com/captcha/",
81        "https://cdn.auth0.com/js/lock/",
82        "https://captcha.gtimg.com",
83        "https://client-api.arkoselabs.com/",
84        "https://www.capy.me/puzzle/",
85        "https://newassets.hcaptcha.com/",
86        "https://cdn.auth0.com/client",
87        "https://js.stripe.com/",
88        "https://cdn.prod.website-files.com/", // webflow cdn scripts
89        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
90        "https://code.jquery.com/jquery-"
91    ];
92
93    /// Determine if a script should be rendered in the browser by name.
94    ///
95    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
96    /// but we keep it for compatibility and other call sites.
97    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
98
99    /// General patterns for popular libraries and resources
100    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
101        // Verified 3rd parties for request
102        "https://m.stripe.network/",
103        "https://challenges.cloudflare.com/",
104        "https://js.stripe.com/",
105        "https://cdn.prod.website-files.com/", // webflow cdn scripts
106        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
107        "https://code.jquery.com/jquery-",
108        "https://ct.captcha-delivery.com/",
109        "https://geo.captcha-delivery.com/",
110        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
111        "https://cdn.auth0.com/client",
112        "https://captcha.px-cloud.net/",
113        "https://www.capy.me/puzzle/",
114        "https://www.gstatic.com/recaptcha/",
115        "https://google.com/recaptcha/",
116        "https://www.google.com/recaptcha/",
117        "https://www.recaptcha.net/recaptcha/",
118        "https://js.hcaptcha.com/1/api.js",
119        "https://hcaptcha.com/1/api.js",
120        "https://js.datadome.co/tags.js",
121        "https://api-js.datadome.co/",
122        "https://client.perimeterx.net/",
123        "https://captcha.px-cdn.net/",
124        "https://newassets.hcaptcha.com/",
125        "https://captcha.px-cloud.net/",
126        "https://s.perimeterx.net/",
127        "https://api.leminnow.com/captcha/",
128        "https://client-api.arkoselabs.com/",
129        "https://static.geetest.com/v4/gt4.js",
130        "https://static.geetest.com/",
131        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
132        "https://cdn.perfdrive.com/aperture/",
133        "https://assets.queue-it.net/",
134        "discourse-cdn.com/",
135        "hcaptcha.com",
136        "/cdn-cgi/challenge-platform/",
137        "/_Incapsula_Resource"
138    ];
139
140    /// Determine if a script should be rendered in the browser by name.
141    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
142
143    /// path of a js framework
144    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
145        phf::phf_set! {
146            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
147            "_astro/", "_app/immutable"
148        }
149    };
150
151    /// Ignore the content types.
152    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
153        "application/pdf",
154        "application/zip",
155        "application/x-rar-compressed",
156        "application/x-tar",
157        "image/png",
158        "image/jpeg",
159        "image/gif",
160        "image/bmp",
161        "image/webp",
162        "image/svg+xml",
163        "video/mp4",
164        "video/x-msvideo",
165        "video/x-matroska",
166        "video/webm",
167        "audio/mpeg",
168        "audio/ogg",
169        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
170        "application/vnd.ms-excel",
171        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
172        "application/vnd.ms-powerpoint",
173        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
174        "application/x-7z-compressed",
175        "application/x-rpm",
176        "application/x-shockwave-flash",
177        "application/rtf",
178    };
179
180    /// Ignore the resources for visual content types.
181    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
182        "Image",
183        "Media",
184        "Font"
185    };
186
187    /// Ignore the resources for visual content types.
188    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
189        "CspViolationReport",
190        "Ping",
191    };
192
193    /// Case insenstive css matching
194    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
195
196    /// The command chain.
197    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
198        let enable = EnableParams::default();
199
200        if let Ok(c) = serde_json::to_value(&enable) {
201            vec![(enable.identifier(), c)]
202        } else {
203            vec![]
204        }
205    };
206
207    /// The command chain with https ignore.
208    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
209        let enable = EnableParams::default();
210        let mut v = vec![];
211        if let Ok(c) = serde_json::to_value(&enable) {
212            v.push((enable.identifier(), c));
213        }
214        let ignore = SetIgnoreCertificateErrorsParams::new(true);
215        if let Ok(ignored) = serde_json::to_value(&ignore) {
216            v.push((ignore.identifier(), ignored));
217        }
218
219        v
220    };
221
222    /// Enable the fetch intercept command
223    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
224        fetch::EnableParams::builder()
225        .handle_auth_requests(true)
226        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
227        .build()
228    };
229}
230
/// Whether `status` is one of the HTTP redirect codes 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    match status {
        301 | 302 | 303 | 307 | 308 => true,
        _ => false,
    }
}
235
236#[derive(Debug)]
237/// The base network manager.
238pub struct NetworkManager {
239    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
240    ///
241    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
242    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
243    /// Consumers pull from this queue via `poll()`.
244    queued_events: VecDeque<NetworkEvent>,
245    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
246    ///
247    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
248    /// certificates (self-signed, expired, MITM proxies, etc.).
249    ignore_httpserrors: bool,
250    /// Active in-flight requests keyed by CDP `RequestId`.
251    ///
252    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
253    /// and final state used to emit `RequestFinished` / `RequestFailed`.
254    requests: HashMap<RequestId, HttpRequest>,
255    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
256    /// `Fetch.requestPaused` arrives later (or vice versa).
257    ///
258    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
259    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
260    // TODO put event in an Arc?
261    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
262    /// Extra HTTP headers to apply to subsequent network requests via CDP.
263    ///
264    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
265    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
266    extra_headers: std::collections::HashMap<String, String>,
267    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
268    ///
269    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
270    /// store the interception id here so it can be attached to the `HttpRequest` once the
271    /// network request is observed.
272    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
273    /// Whether the user has disabled the browser cache.
274    ///
275    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
276    /// `set_cache_enabled()`. Internally the field is stored as “disabled” to match the CDP API.
277    user_cache_disabled: bool,
278    /// Tracks which requests have already attempted authentication.
279    ///
280    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
281    /// authentication challenges (407/401). Once a request id is present here, subsequent
282    /// challenges for the same request are canceled.
283    attempted_authentications: HashSet<RequestId>,
284    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
285    ///
286    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
287    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
288    credentials: Option<Credentials>,
289    /// User-facing toggle indicating whether request interception is desired.
290    ///
291    /// This is the “intent” flag controlled by `set_request_interception()`. On its own it does
292    /// not guarantee interception is active; interception is actually enabled/disabled by
293    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
294    ///
295    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
296    /// enabled to satisfy auth challenges.
297    pub(crate) user_request_interception_enabled: bool,
298    /// Hard kill-switch to block all network traffic.
299    ///
300    /// When `true`, the manager immediately blocks requests (typically via
301    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
302    /// and short-circuits most decision logic. This is used for safety conditions such as
303    /// exceeding `max_bytes_allowed` or other runtime protections.
304    block_all: bool,
305    /// Tracks whether the Fetch interception protocol is currently enabled in CDP.
306    ///
307    /// This is the “actual state” flag that reflects whether we have sent `Fetch.enable` or
308    /// `Fetch.disable` to the browser. It is updated by `update_protocol_request_interception()`
309    /// when `user_request_interception_enabled` or `credentials` change.
310    pub(crate) protocol_request_interception_enabled: bool,
311    /// The network is offline.
312    offline: bool,
313    /// The page request timeout.
314    pub request_timeout: Duration,
315    // made_request: bool,
316    /// Ignore visuals (no pings, prefetching, and etc).
317    pub ignore_visuals: bool,
318    /// Block CSS stylesheets.
319    pub block_stylesheets: bool,
320    /// Block javascript that is not critical to rendering.
321    ///
322    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
323    /// by itself (it remains for config compatibility).
324    pub block_javascript: bool,
325    /// Block analytics from rendering
326    pub block_analytics: bool,
327    /// Block pre-fetch request
328    pub block_prefetch: bool,
329    /// Only html from loading.
330    pub only_html: bool,
331    /// Is xml document?
332    pub xml_document: bool,
333    /// The custom intercept handle logic to run on the website.
334    pub intercept_manager: NetworkInterceptManager,
335    /// Track the amount of times the document reloaded.
336    pub document_reload_tracker: u8,
337    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
338    pub document_target_url: String,
339    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
340    pub document_target_domain: String,
341    /// The max bytes to receive.
342    pub max_bytes_allowed: Option<u64>,
343    #[cfg(feature = "_cache")]
344    /// The cache site_key to use.
345    pub cache_site_key: Option<String>,
346    /// The cache policy to use.
347    #[cfg(feature = "_cache")]
348    pub cache_policy: Option<BasicCachePolicy>,
349    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
350    whitelist_patterns: Vec<String>,
351    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
352    whitelist_matcher: Option<AhoCorasick>,
353    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
354    blacklist_patterns: Vec<String>,
355    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
356    blacklist_matcher: Option<AhoCorasick>,
357    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
358    blacklist_strict: bool,
359}
360
361impl NetworkManager {
362    /// A new network manager.
363    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
364        Self {
365            queued_events: Default::default(),
366            ignore_httpserrors,
367            requests: Default::default(),
368            requests_will_be_sent: Default::default(),
369            extra_headers: Default::default(),
370            request_id_to_interception_id: Default::default(),
371            user_cache_disabled: false,
372            attempted_authentications: Default::default(),
373            credentials: None,
374            block_all: false,
375            user_request_interception_enabled: false,
376            protocol_request_interception_enabled: false,
377            offline: false,
378            request_timeout,
379            ignore_visuals: false,
380            block_javascript: false,
381            block_stylesheets: false,
382            block_prefetch: true,
383            block_analytics: true,
384            only_html: false,
385            xml_document: false,
386            intercept_manager: NetworkInterceptManager::Unknown,
387            document_reload_tracker: 0,
388            document_target_url: String::new(),
389            document_target_domain: String::new(),
390            whitelist_patterns: Vec::new(),
391            whitelist_matcher: None,
392            blacklist_patterns: Vec::new(),
393            blacklist_matcher: None,
394            blacklist_strict: true,
395            max_bytes_allowed: None,
396            #[cfg(feature = "_cache")]
397            cache_site_key: None,
398            #[cfg(feature = "_cache")]
399            cache_policy: None,
400        }
401    }
402
403    /// Replace the whitelist patterns (compiled once).
404    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
405    where
406        I: IntoIterator<Item = S>,
407        S: Into<String>,
408    {
409        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
410        self.rebuild_whitelist_matcher();
411    }
412
413    /// Replace the blacklist patterns (compiled once).
414    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
415    where
416        I: IntoIterator<Item = S>,
417        S: Into<String>,
418    {
419        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
420        self.rebuild_blacklist_matcher();
421    }
422
423    /// Add one pattern (cheap) and rebuild (call this sparingly).
424    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
425        self.blacklist_patterns.push(pattern.into());
426        self.rebuild_blacklist_matcher();
427    }
428
429    /// Add many patterns and rebuild once.
430    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
431    where
432        I: IntoIterator<Item = S>,
433        S: Into<String>,
434    {
435        self.blacklist_patterns
436            .extend(patterns.into_iter().map(Into::into));
437        self.rebuild_blacklist_matcher();
438    }
439
440    /// Clear blacklist entirely.
441    pub fn clear_blacklist(&mut self) {
442        self.blacklist_patterns.clear();
443        self.blacklist_matcher = None;
444    }
445
446    /// Control precedence: when true, blacklist always wins.
447    pub fn set_blacklist_strict(&mut self, strict: bool) {
448        self.blacklist_strict = strict;
449    }
450
451    #[inline]
452    fn rebuild_blacklist_matcher(&mut self) {
453        if self.blacklist_patterns.is_empty() {
454            self.blacklist_matcher = None;
455            return;
456        }
457
458        let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
459        self.blacklist_matcher = AhoCorasick::new(refs).ok();
460    }
461
462    #[inline]
463    fn is_blacklisted(&self, url: &str) -> bool {
464        self.blacklist_matcher
465            .as_ref()
466            .map(|m| m.is_match(url))
467            .unwrap_or(false)
468    }
469
470    /// Add one pattern (cheap) and rebuild (call this sparingly).
471    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
472        self.whitelist_patterns.push(pattern.into());
473        self.rebuild_whitelist_matcher();
474    }
475
476    /// Add many patterns and rebuild once.
477    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
478    where
479        I: IntoIterator<Item = S>,
480        S: Into<String>,
481    {
482        self.whitelist_patterns
483            .extend(patterns.into_iter().map(Into::into));
484        self.rebuild_whitelist_matcher();
485    }
486
487    #[inline]
488    fn rebuild_whitelist_matcher(&mut self) {
489        if self.whitelist_patterns.is_empty() {
490            self.whitelist_matcher = None;
491            return;
492        }
493
494        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
495
496        // If building fails (shouldn’t for simple patterns), just disable matcher.
497        self.whitelist_matcher = AhoCorasick::new(refs).ok();
498    }
499
500    #[inline]
501    fn is_whitelisted(&self, url: &str) -> bool {
502        self.whitelist_matcher
503            .as_ref()
504            .map(|m| m.is_match(url))
505            .unwrap_or(false)
506    }
507
508    /// Commands to init the chain with.
509    pub fn init_commands(&self) -> CommandChain {
510        let cmds = if self.ignore_httpserrors {
511            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
512        } else {
513            INIT_CHAIN.clone()
514        };
515        CommandChain::new(cmds, self.request_timeout)
516    }
517
518    /// Push the CDP request.
519    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
520        let method = cmd.identifier();
521        if let Ok(params) = serde_json::to_value(cmd) {
522            self.queued_events
523                .push_back(NetworkEvent::SendCdpRequest((method, params)));
524        }
525    }
526
527    /// The next event to handle.
528    pub fn poll(&mut self) -> Option<NetworkEvent> {
529        self.queued_events.pop_front()
530    }
531
532    /// Get the extra headers.
533    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
534        &self.extra_headers
535    }
536
537    /// Set extra HTTP headers.
538    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
539        self.extra_headers = headers;
540        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
541        self.extra_headers.remove("Proxy-Authorization");
542        if !self.extra_headers.is_empty() {
543            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
544                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
545            }
546        }
547    }
548
549    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
550        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
551    }
552
553    pub fn set_block_all(&mut self, block_all: bool) {
554        self.block_all = block_all;
555    }
556
557    pub fn set_request_interception(&mut self, enabled: bool) {
558        self.user_request_interception_enabled = enabled;
559        self.update_protocol_request_interception();
560    }
561
562    pub fn set_cache_enabled(&mut self, enabled: bool) {
563        let run = self.user_cache_disabled != !enabled;
564        self.user_cache_disabled = !enabled;
565        if run {
566            self.update_protocol_cache_disabled();
567        }
568    }
569
570    /// Enable fetch interception.
571    pub fn enable_request_intercept(&mut self) {
572        self.protocol_request_interception_enabled = true;
573    }
574
575    /// Disable fetch interception.
576    pub fn disable_request_intercept(&mut self) {
577        self.protocol_request_interception_enabled = false;
578    }
579
/// Store the cache site key (`None` clears it).
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    self.cache_site_key = cache_site_key;
}
585
/// Store the cache policy (`None` clears it).
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    self.cache_policy = cache_policy;
}
591
592    pub fn update_protocol_cache_disabled(&mut self) {
593        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
594    }
595
596    pub fn authenticate(&mut self, credentials: Credentials) {
597        self.credentials = Some(credentials);
598        self.update_protocol_request_interception();
599        self.protocol_request_interception_enabled = true;
600    }
601
602    fn update_protocol_request_interception(&mut self) {
603        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
604
605        if enabled == self.protocol_request_interception_enabled {
606            return;
607        }
608
609        if enabled {
610            self.push_cdp_request(ENABLE_FETCH.clone())
611        } else {
612            self.push_cdp_request(DisableParams::default())
613        }
614    }
615
616    /// Blocklist-only script blocking.
617    /// Returns true only when the URL matches an explicit blocklist condition.
618    #[inline]
619    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
620        // If analytics blocking is off, skip all analytics tries.
621        let block_analytics = self.block_analytics;
622
623        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
624        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
625        {
626            return true;
627        }
628
629        // 2) Custom website block list (explicit).
630        if crate::handler::blockers::block_websites::block_website(url) {
631            return true;
632        }
633
634        // 3) Path-based explicit tries / fallbacks.
635        //
636        // We run these on:
637        // - path with leading slash ("/js/app.js")
638        // - path without leading slash ("js/app.js")
639        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
640        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
641            // Remove query/fragment so matching stays stable.
642            let p_slash = Self::strip_query_fragment(path_with_slash);
643            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
644
645            // Basename for filename-only lists.
646            let base = match p_slash.rsplit('/').next() {
647                Some(b) => b,
648                None => p_slash,
649            };
650
651            // ---- Trie checks ----
652            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
653            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
654                return true;
655            }
656            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
657                return true;
658            }
659            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
660                return true;
661            }
662
663            // Base-path ignore tries (framework noise / known ignorable script paths).
664            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
665            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
666                return true;
667            }
668
669            // Style path ignores only when visuals are ignored.
670            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
671                return true;
672            }
673        }
674
675        false
676    }
677
/// Extract the absolute URL path portion WITH the leading slash.
///
/// The authority is taken to start after the first `//`. The path is everything from the
/// first `/` that follows it, including any query string or fragment after the path.
///
/// Returns `None` when there is no `//`, or when the URL has no path. A `/` that only
/// appears inside a query string or fragment (e.g. `https://host?next=/a.js`) is not a path
/// and also yields `None`.
///
/// Examples:
/// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
/// - "https://example.com?next=/a.js" -> None
#[inline]
fn url_path_with_leading_slash(url: &str) -> Option<&str> {
    // Skip past the scheme separator ("//" also covers protocol-relative URLs).
    let authority_start = url.find("//")? + 2;
    let after_scheme = &url[authority_start..];

    // The first of '/', '?' or '#' ends the authority; only '/' starts a path.
    let delimiter = after_scheme.find(|c: char| matches!(c, '/' | '?' | '#'))?;
    let tail = &after_scheme[delimiter..];

    if tail.starts_with('/') {
        Some(tail)
    } else {
        None
    }
}
698
/// Strip query string and fragment from a path-ish string.
///
/// Cuts at the first `?` or `#`, whichever comes first, in a single scan; returns `s`
/// unchanged when neither is present.
///
/// Example:
/// - "/a/b.js?x=1#y" -> "/a/b.js"
#[inline]
fn strip_query_fragment(s: &str) -> &str {
    match s.find(|c: char| c == '?' || c == '#') {
        Some(end) => &s[..end],
        None => s,
    }
}
715
716    /// Determine if the request should be skipped.
717    #[inline]
718    fn skip_xhr(
719        &self,
720        skip_networking: bool,
721        event: &EventRequestPaused,
722        network_event: bool,
723    ) -> bool {
724        // XHR check
725        if !skip_networking && network_event {
726            let request_url = event.request.url.as_str();
727
728            // check if part of ignore scripts.
729            let skip_analytics =
730                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
731
732            if skip_analytics {
733                true
734            } else if self.block_stylesheets || self.ignore_visuals {
735                let block_css = self.block_stylesheets;
736                let block_media = self.ignore_visuals;
737
738                let mut block_request = false;
739
740                if let Some(position) = request_url.rfind('.') {
741                    let hlen = request_url.len();
742                    let has_asset = hlen - position;
743
744                    if has_asset >= 3 {
745                        let next_position = position + 1;
746
747                        if block_media
748                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
749                                &request_url[next_position..].into(),
750                            )
751                        {
752                            block_request = true;
753                        } else if block_css {
754                            block_request =
755                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
756                                    .contains(&**CSS_EXTENSION)
757                        }
758                    }
759                }
760
761                if !block_request {
762                    block_request = ignore_script_xhr_media(request_url);
763                }
764
765                block_request
766            } else {
767                skip_networking
768            }
769        } else {
770            skip_networking
771        }
772    }
773
774    #[cfg(feature = "adblock")]
775    #[inline]
776    /// Detect if ad enabled.
777    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
778        if skip_networking {
779            true
780        } else {
781            block_ads(&event.request.url) || self.detect_ad(event)
782        }
783    }
784
785    /// When adblock feature is disabled, this is a no-op.
786    #[cfg(not(feature = "adblock"))]
787    #[inline]
788    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
789        use crate::handler::blockers::block_websites::block_ads;
790        if skip_networking {
791            true
792        } else {
793            block_ads(&event.request.url)
794        }
795    }
796
797    #[inline]
798    /// Fail request
799    fn fail_request_blocked(
800        &mut self,
801        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
802    ) {
803        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
804            request_id.clone(),
805            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
806        );
807        self.push_cdp_request(params);
808    }
809
810    #[inline]
811    /// Fulfill request
812    fn fulfill_request_empty_200(
813        &mut self,
814        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
815    ) {
816        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
817            request_id.clone(),
818            200,
819        );
820        self.push_cdp_request(params);
821    }
822
823    #[cfg(feature = "_cache")]
824    #[inline]
825    /// Fulfill a paused Fetch request from cached bytes + header map.
826    ///
827    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
828    fn fulfill_request_from_cache(
829        &mut self,
830        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
831        body: &[u8],
832        headers: &std::collections::HashMap<String, String>,
833        status: i64,
834    ) {
835        use crate::cdp::browser_protocol::fetch::HeaderEntry;
836        use crate::handler::network::fetch::FulfillRequestParams;
837        use base64::Engine;
838
839        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
840
841        for (k, v) in headers.iter() {
842            resp_headers.push(HeaderEntry {
843                name: k.clone().into(),
844                value: v.clone().into(),
845            });
846        }
847
848        let mut params = FulfillRequestParams::new(request_id.clone(), status);
849
850        // TODO: have this already encoded prior.
851        params.body = Some(
852            base64::engine::general_purpose::STANDARD
853                .encode(body)
854                .into(),
855        );
856
857        params.response_headers = Some(resp_headers);
858
859        self.push_cdp_request(params);
860    }
861
862    #[inline]
863    /// Continue the request url.
864    fn continue_request_with_url(
865        &mut self,
866        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
867        url: Option<&str>,
868        intercept_response: bool,
869    ) {
870        let mut params = ContinueRequestParams::new(request_id.clone());
871        if let Some(url) = url {
872            params.url = Some(url.to_string());
873            params.intercept_response = Some(intercept_response);
874        }
875        self.push_cdp_request(params);
876    }
877
    /// On fetch request paused interception.
    ///
    /// Decides how a paused Fetch request is resolved. Exactly one of the
    /// following happens:
    /// - nothing is queued when both user and protocol interception are enabled,
    /// - the request is failed (`fail_request_blocked`) when `block_all` is set,
    /// - the request is fulfilled with an empty 200 when any blocking rule matches,
    /// - the request is fulfilled from the session cache (`_cache` feature only),
    /// - the request is continued, optionally with a replacement URL.
    ///
    /// The blocking rules are evaluated in a fixed order; later rules only run
    /// while `skip_networking` is still `false`, and the allow-lists at the end
    /// can flip a block back to allow unless strict blacklisting applies.
    #[inline]
    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
        // Both interception modes enabled: this handler issues no command at all.
        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
            return;
        }

        if self.block_all {
            tracing::debug!(
                "Blocked (block_all): {:?} - {}",
                event.resource_type,
                event.request.url
            );
            return self.fail_request_blocked(&event.request_id);
        }

        // Pair this Fetch event with its Network.requestWillBeSent counterpart.
        // If that event already arrived, register the request now; otherwise
        // remember the interception id so `on_request_will_be_sent` can pick it up.
        if let Some(network_id) = event.network_id.as_ref() {
            if let Some(request_will_be_sent) =
                self.requests_will_be_sent.remove(network_id.as_ref())
            {
                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
            } else {
                self.request_id_to_interception_id
                    .insert(network_id.clone(), event.request_id.clone().into());
            }
        }

        // From here on, we handle the full decision tree.
        let javascript_resource = event.resource_type == ResourceType::Script;
        let document_resource = event.resource_type == ResourceType::Document;
        let network_resource =
            !document_resource && crate::utils::is_data_resource(&event.resource_type);

        // Start with static / cheap skip checks.
        // NOTE(review): `self.block_all` is always false here because of the
        // early return above, so only the resource map lookup matters.
        let mut skip_networking =
            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());

        // NOTE(review): prefetch requests are skipped when `block_prefetch` is
        // NOT set, which reads inverted relative to the flag name — confirm the
        // intended polarity against where `block_prefetch` is configured.
        if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
            skip_networking = true;
        }

        // Also short-circuit if we've reloaded this document too many times.
        if !skip_networking {
            skip_networking = self.document_reload_tracker >= 3;
        }

        // Handle document redirect / masking and track xml documents.
        // This runs unconditionally because it also updates the tracked
        // document URL / domain on `self`.
        let (current_url_cow, had_replacer) =
            self.handle_document_replacement_and_tracking(event, document_resource);

        let current_url: &str = current_url_cow.as_ref();

        let blacklisted = self.is_blacklisted(current_url);

        // Non-strict blacklist: block now, but the allow-lists below may still
        // override it.
        if !self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if !skip_networking {
            // Allow XSL for sitemap XML.
            // (The assignment is a no-op since `skip_networking` is already
            // false; the branch exists to bypass the visuals check below.)
            if self.xml_document && current_url.ends_with(".xsl") {
                skip_networking = false;
            } else {
                skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
            }
        }

        // Returns `true` unchanged when the request is already skipped.
        skip_networking = self.detect_ad_if_enabled(event, skip_networking);

        // Ignore embedded scripts when only_html or ignore_visuals is set.
        if !skip_networking
            && self.block_javascript
            && (self.only_html || self.ignore_visuals)
            && (javascript_resource || document_resource)
        {
            skip_networking = ignore_script_embedded(current_url);
        }

        // Script policy: allow-by-default.
        // Block only if explicit block list patterns match.
        if !skip_networking && javascript_resource {
            skip_networking = self.should_block_script_blocklist_only(current_url);
        }

        // XHR / data resources.
        skip_networking = self.skip_xhr(skip_networking, event, network_resource);

        // Custom interception layer.
        if !skip_networking && (javascript_resource || network_resource || document_resource) {
            skip_networking = self.intercept_manager.intercept_detection(
                current_url,
                self.ignore_visuals,
                network_resource,
            );
        }

        // Custom website block list.
        if !skip_networking && (javascript_resource || network_resource) {
            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
        }

        // whitelist 3rd party
        // not required unless explicit blocking.
        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
        {
            skip_networking = false;
        }

        // check if the url is in the whitelist.
        if skip_networking && self.is_whitelisted(current_url) {
            skip_networking = false;
        }

        // Strict blacklist: applied last so no allow-list can override it.
        if self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if skip_networking {
            // Blocked requests are answered with an empty 200 rather than failed.
            tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
            self.fulfill_request_empty_200(&event.request_id);
        } else {
            #[cfg(feature = "_cache")]
            {
                if let (Some(policy), Some(cache_site_key)) =
                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
                {
                    // Cache entries are keyed by "<method>:<url>"; this shadows
                    // `current_url` only inside the cache lookup.
                    let current_url = format!("{}:{}", event.request.method, &current_url);

                    if let Some((res, cache_policy)) =
                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
                    {
                        if policy.allows_cached(&cache_policy) {
                            tracing::debug!(
                                "Remote Cached: {:?} - {}",
                                &event.resource_type,
                                &current_url
                            );
                            let flat_headers = crate::http::headers_from_multi(&res.headers);
                            return self.fulfill_request_from_cache(
                                &event.request_id,
                                &res.body,
                                &flat_headers,
                                res.status as i64,
                            );
                        }
                    }
                }
            }

            // check our frame cache for the run.
            tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
            // The replacement URL is only sent when a redirect mask was
            // rewritten; `continue_request_with_url` ignores the last argument
            // when no URL is passed.
            self.continue_request_with_url(
                &event.request_id,
                if had_replacer {
                    Some(current_url)
                } else {
                    None
                },
                !had_replacer,
            );
        }
    }
1040
1041    /// Shared "visuals + basic blocking" logic.
1042    ///
1043    /// IMPORTANT: Scripts are NOT blocked here anymore.
1044    /// Scripts are allowed by default and only blocked via explicit blocklists
1045    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1046    #[inline]
1047    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1048        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1049            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1050    }
1051
1052    /// Does the network manager have a target domain?
1053    pub fn has_target_domain(&self) -> bool {
1054        !self.document_target_url.is_empty()
1055    }
1056
1057    /// Set the target page url for tracking.
1058    pub fn set_page_url(&mut self, page_target_url: String) {
1059        let host_base = host_and_rest(&page_target_url)
1060            .map(|(h, _)| base_domain_from_host(h))
1061            .unwrap_or("");
1062
1063        self.document_target_domain = host_base.to_string();
1064        self.document_target_url = page_target_url;
1065    }
1066
1067    /// Clear the initial target domain on every navigation.
1068    pub fn clear_target_domain(&mut self) {
1069        self.document_reload_tracker = 0;
1070        self.document_target_url = Default::default();
1071        self.document_target_domain = Default::default();
1072    }
1073
1074    /// Handles:
1075    /// - document reload tracking (`document_reload_tracker`)
1076    /// - redirect masking / replacement
1077    /// - xml document detection (`xml_document`)
1078    /// - `document_target_url` updates
1079    ///
1080    /// Returns (current_url, had_replacer).
1081    #[inline]
1082    fn handle_document_replacement_and_tracking<'a>(
1083        &mut self,
1084        event: &'a EventRequestPaused,
1085        document_resource: bool,
1086    ) -> (Cow<'a, str>, bool) {
1087        let mut replacer: Option<String> = None;
1088        let current_url = event.request.url.as_str();
1089
1090        if document_resource {
1091            if self.document_target_url == current_url {
1092                self.document_reload_tracker += 1;
1093            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1094            {
1095                let (http_document_replacement, mut https_document_replacement) =
1096                    if self.document_target_url.starts_with("http://") {
1097                        (
1098                            self.document_target_url.replacen("http://", "http//", 1),
1099                            self.document_target_url.replacen("http://", "https://", 1),
1100                        )
1101                    } else {
1102                        (
1103                            self.document_target_url.replacen("https://", "https//", 1),
1104                            self.document_target_url.replacen("https://", "http://", 1),
1105                        )
1106                    };
1107
1108                // Track trailing slash to restore later.
1109                let trailing = https_document_replacement.ends_with('/');
1110                if trailing {
1111                    https_document_replacement.pop();
1112                }
1113                if https_document_replacement.ends_with('/') {
1114                    https_document_replacement.pop();
1115                }
1116
1117                let redirect_mask = format!(
1118                    "{}{}",
1119                    https_document_replacement, http_document_replacement
1120                );
1121
1122                if current_url == redirect_mask {
1123                    replacer = Some(if trailing {
1124                        format!("{}/", https_document_replacement)
1125                    } else {
1126                        https_document_replacement
1127                    });
1128                }
1129            }
1130
1131            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1132                self.xml_document = true;
1133            }
1134
1135            // Track last seen document URL.
1136            self.document_target_url = event.request.url.clone();
1137            self.document_target_domain = host_and_rest(&self.document_target_url)
1138                .map(|(h, _)| base_domain_from_host(h).to_string())
1139                .unwrap_or_default();
1140        }
1141
1142        let current_url_cow = match replacer {
1143            Some(r) => Cow::Owned(r),
1144            None => Cow::Borrowed(event.request.url.as_str()),
1145        };
1146
1147        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1148        (current_url_cow, had_replacer)
1149    }
1150
1151    /// Perform a page intercept for chrome
1152    #[cfg(feature = "adblock")]
1153    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1154        use adblock::{
1155            lists::{FilterSet, ParseOptions, RuleTypes},
1156            Engine,
1157        };
1158
1159        lazy_static::lazy_static! {
1160            static ref AD_ENGINE: Engine = {
1161                let mut filter_set = FilterSet::new(false);
1162                let mut rules = ParseOptions::default();
1163                rules.rule_types = RuleTypes::All;
1164
1165                filter_set.add_filters(
1166                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1167                    rules,
1168                );
1169
1170                Engine::from_filter_set(filter_set, true)
1171            };
1172        };
1173
1174        let blockable = ResourceType::Image == event.resource_type
1175            || event.resource_type == ResourceType::Media
1176            || event.resource_type == ResourceType::Stylesheet
1177            || event.resource_type == ResourceType::Document
1178            || event.resource_type == ResourceType::Fetch
1179            || event.resource_type == ResourceType::Xhr;
1180
1181        let u = &event.request.url;
1182
1183        let block_request = blockable
1184            // set it to example.com for 3rd party handling is_same_site
1185        && {
1186            let request = adblock::request::Request::preparsed(
1187                 &u,
1188                 "example.com",
1189                 "example.com",
1190                 &event.resource_type.as_ref().to_lowercase(),
1191                 !event.request.is_same_site.unwrap_or_default());
1192
1193            AD_ENGINE.check_network_request(&request).matched
1194        };
1195
1196        block_request
1197    }
1198
1199    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1200        let response = if self
1201            .attempted_authentications
1202            .contains(event.request_id.as_ref())
1203        {
1204            AuthChallengeResponseResponse::CancelAuth
1205        } else if self.credentials.is_some() {
1206            self.attempted_authentications
1207                .insert(event.request_id.clone().into());
1208            AuthChallengeResponseResponse::ProvideCredentials
1209        } else {
1210            AuthChallengeResponseResponse::Default
1211        };
1212
1213        let mut auth = AuthChallengeResponse::new(response);
1214        if let Some(creds) = self.credentials.clone() {
1215            auth.username = Some(creds.username);
1216            auth.password = Some(creds.password);
1217        }
1218        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1219    }
1220
1221    /// Set the page offline network emulation condition.
1222    pub fn set_offline_mode(&mut self, value: bool) {
1223        if self.offline == value {
1224            return;
1225        }
1226        self.offline = value;
1227        if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1228            .offline(self.offline)
1229            .matched_network_condition(
1230                NetworkConditions::builder()
1231                    .url_pattern("")
1232                    .latency(0)
1233                    .download_throughput(-1.)
1234                    .upload_throughput(-1.)
1235                    .build()
1236                    .unwrap(),
1237            )
1238            .build()
1239        {
1240            self.push_cdp_request(network);
1241        }
1242    }
1243
1244    /// Request interception doesn't happen for data URLs with Network Service.
1245    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1246        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1247            if let Some(interception_id) = self
1248                .request_id_to_interception_id
1249                .remove(event.request_id.as_ref())
1250            {
1251                self.on_request(event, Some(interception_id));
1252            } else {
1253                // TODO remove the clone for event
1254                self.requests_will_be_sent
1255                    .insert(event.request_id.clone(), event.clone());
1256            }
1257        } else {
1258            self.on_request(event, None);
1259        }
1260    }
1261
1262    /// The request was served from the cache.
1263    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1264        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1265            request.from_memory_cache = true;
1266        }
1267    }
1268
1269    /// On network response received.
1270    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1271        let mut request_failed = false;
1272
1273        // Track how many bytes we actually deducted from this target.
1274        let mut deducted: u64 = 0;
1275
1276        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1277            let before = *max_bytes;
1278
1279            // encoded_data_length -> saturating cast to u64
1280            let received_bytes: u64 = event.response.encoded_data_length as u64;
1281
1282            // Safe parse of Content-Length
1283            let content_length: Option<u64> = event
1284                .response
1285                .headers
1286                .inner()
1287                .get("content-length")
1288                .and_then(|v| v.as_str())
1289                .and_then(|s| s.trim().parse::<u64>().ok());
1290
1291            // Deduct what we actually received
1292            *max_bytes = max_bytes.saturating_sub(received_bytes);
1293
1294            // If the declared size can't fit, zero out now
1295            if let Some(cl) = content_length {
1296                if cl > *max_bytes {
1297                    *max_bytes = 0;
1298                }
1299            }
1300
1301            request_failed = *max_bytes == 0;
1302
1303            // Compute exact delta deducted on this event
1304            deducted = before.saturating_sub(*max_bytes);
1305        }
1306
1307        // Bubble up the deduction (even if request continues)
1308        if deducted > 0 {
1309            self.queued_events
1310                .push_back(NetworkEvent::BytesConsumed(deducted));
1311        }
1312
1313        // block all network request moving forward.
1314        if request_failed && self.max_bytes_allowed.is_some() {
1315            self.set_block_all(true);
1316        }
1317
1318        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1319            request.set_response(event.response.clone());
1320            self.queued_events.push_back(if request_failed {
1321                NetworkEvent::RequestFailed(request)
1322            } else {
1323                NetworkEvent::RequestFinished(request)
1324            });
1325        }
1326    }
1327
1328    /// On network loading finished.
1329    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1330        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1331            if let Some(interception_id) = request.interception_id.as_ref() {
1332                self.attempted_authentications
1333                    .remove(interception_id.as_ref());
1334            }
1335            self.queued_events
1336                .push_back(NetworkEvent::RequestFinished(request));
1337        }
1338    }
1339
1340    /// On network loading failed.
1341    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1342        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1343            request.failure_text = Some(event.error_text.clone());
1344            if let Some(interception_id) = request.interception_id.as_ref() {
1345                self.attempted_authentications
1346                    .remove(interception_id.as_ref());
1347            }
1348            self.queued_events
1349                .push_back(NetworkEvent::RequestFailed(request));
1350        }
1351    }
1352
1353    /// On request will be sent.
1354    fn on_request(
1355        &mut self,
1356        event: &EventRequestWillBeSent,
1357        interception_id: Option<InterceptionId>,
1358    ) {
1359        let mut redirect_chain = Vec::new();
1360        let mut redirect_location = None;
1361
1362        if let Some(redirect_resp) = &event.redirect_response {
1363            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1364                if is_redirect_status(redirect_resp.status) {
1365                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1366                        if redirect_resp.url != location {
1367                            let fixed_location = location.replace(&redirect_resp.url, "");
1368
1369                            if !fixed_location.is_empty() {
1370                                request.response.as_mut().map(|resp| {
1371                                    resp.headers.0["Location"] =
1372                                        serde_json::Value::String(fixed_location.clone());
1373                                });
1374                            }
1375
1376                            redirect_location = Some(fixed_location);
1377                        }
1378                    }
1379                }
1380
1381                self.handle_request_redirect(
1382                    &mut request,
1383                    if let Some(redirect_location) = redirect_location {
1384                        let mut redirect_resp = redirect_resp.clone();
1385
1386                        if !redirect_location.is_empty() {
1387                            redirect_resp.headers.0["Location"] =
1388                                serde_json::Value::String(redirect_location);
1389                        }
1390
1391                        redirect_resp
1392                    } else {
1393                        redirect_resp.clone()
1394                    },
1395                );
1396
1397                redirect_chain = std::mem::take(&mut request.redirect_chain);
1398                redirect_chain.push(request);
1399            }
1400        }
1401
1402        let request = HttpRequest::new(
1403            event.request_id.clone(),
1404            event.frame_id.clone(),
1405            interception_id,
1406            self.user_request_interception_enabled,
1407            redirect_chain,
1408        );
1409
1410        self.requests.insert(event.request_id.clone(), request);
1411        self.queued_events
1412            .push_back(NetworkEvent::Request(event.request_id.clone()));
1413    }
1414
1415    /// Handle request redirect.
1416    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1417        request.set_response(response);
1418        if let Some(interception_id) = request.interception_id.as_ref() {
1419            self.attempted_authentications
1420                .remove(interception_id.as_ref());
1421        }
1422    }
1423}
1424
/// Events produced by the network manager and queued for its owner to consume.
#[derive(Debug)]
pub enum NetworkEvent {
    /// Send a CDP request: the method id together with its JSON parameters.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request with this id was registered.
    Request(RequestId),
    /// Response for the request with this id.
    Response(RequestId),
    /// Request failed; carries the tracked request.
    RequestFailed(HttpRequest),
    /// Request finished; carries the tracked request.
    RequestFinished(HttpRequest),
    /// Bytes consumed: the amount deducted from the byte budget.
    BytesConsumed(u64),
}
1440
/// Unit tests for the allow-list matcher, the script blocklist policy and the
/// dynamic blacklist / whitelist matchers.
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // A random script that should not match your block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        // Same URL in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        // In strict mode, it should still be considered blocked at decision time.
        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
        assert!(nm.blacklist_strict);
    }

    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs