chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18    SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21    fetch::{
22        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24    },
25    network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
42lazy_static! {
43    /// General patterns for popular libraries and resources
44    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
45        "jquery",           // Covers jquery.min.js, jquery.js, etc.
46        "angular",
47        "react",            // Covers all React-related patterns
48        "vue",              // Covers all Vue-related patterns
49        "bootstrap",
50        "d3",
51        "lodash",
52        "ajax",
53        "application",
54        "app",              // Covers general app scripts like app.js
55        "main",
56        "index",
57        "bundle",
58        "vendor",
59        "runtime",
60        "polyfill",
61        "scripts",
62        "es2015.",
63        "es2020.",
64        "webpack",
65        "captcha",
66        "client",
67        "/cdn-cgi/challenge-platform/",
68        "/wp-content/js/",  // Covers Wordpress content
69        // Verified 3rd parties for request
70        "https://m.stripe.network/",
71        "https://challenges.cloudflare.com/",
72        "https://www.google.com/recaptcha/enterprise.js",
73        "https://www.google.com/recaptcha/api.js",
74        "https://google.com/recaptcha/api.js",
75        "https://captcha.px-cloud.net/",
76        "https://cdn.auth0.com/js/lock/",
77        "https://captcha.gtimg.com",
78        "https://cdn.auth0.com/client",
79        "https://js.stripe.com/",
80        "https://cdn.prod.website-files.com/", // webflow cdn scripts
81        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
82        "https://code.jquery.com/jquery-"
83    ];
84
85    /// Determine if a script should be rendered in the browser by name.
86    ///
87    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
88    /// but we keep it for compatibility and other call sites.
89    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
90
91    /// General patterns for popular libraries and resources
92    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
93        // Verified 3rd parties for request
94        "https://m.stripe.network/",
95        "https://challenges.cloudflare.com/",
96        "https://www.google.com/recaptcha/api.js",
97        "https://google.com/recaptcha/api.js",
98        "https://www.google.com/recaptcha/enterprise.js",
99        "https://js.stripe.com/",
100        "https://cdn.prod.website-files.com/", // webflow cdn scripts
101        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
102        "https://code.jquery.com/jquery-",
103        "https://ct.captcha-delivery.com/",
104        "https://geo.captcha-delivery.com/captcha/",
105        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
106        "https://ct.captcha-delivery.com/",
107        "https://cdn.auth0.com/client",
108        "https://captcha.px-cloud.net/",
109        "https://www.gstatic.com/recaptcha/",
110        "https://www.google.com/recaptcha/api2/",
111        "https://www.recaptcha.net/recaptcha/",
112        "https://js.hcaptcha.com/1/api.js",
113        "https://hcaptcha.com/1/api.js",
114        "https://js.datadome.co/tags.js",
115        "https://api-js.datadome.co/",
116        "https://client.perimeterx.net/",
117        "https://captcha.px-cdn.net/",
118        "https://captcha.px-cloud.net/",
119        "https://s.perimeterx.net/",
120        "https://client-api.arkoselabs.com/v2/",
121        "https://static.geetest.com/v4/gt4.js",
122        "https://static.geetest.com/",
123        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
124        "https://cdn.perfdrive.com/aperture/",
125        "https://assets.queue-it.net/",
126        "/cdn-cgi/challenge-platform/",
127        "/_Incapsula_Resource",
128        "discourse-cdn.com/"
129    ];
130
131    /// Determine if a script should be rendered in the browser by name.
132    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
133
134    /// path of a js framework
135    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
136        phf::phf_set! {
137            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
138            "_astro/", "_app/immutable"
139        }
140    };
141
142    /// Ignore the content types.
143    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
144        "application/pdf",
145        "application/zip",
146        "application/x-rar-compressed",
147        "application/x-tar",
148        "image/png",
149        "image/jpeg",
150        "image/gif",
151        "image/bmp",
152        "image/webp",
153        "image/svg+xml",
154        "video/mp4",
155        "video/x-msvideo",
156        "video/x-matroska",
157        "video/webm",
158        "audio/mpeg",
159        "audio/ogg",
160        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
161        "application/vnd.ms-excel",
162        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
163        "application/vnd.ms-powerpoint",
164        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
165        "application/x-7z-compressed",
166        "application/x-rpm",
167        "application/x-shockwave-flash",
168        "application/rtf",
169    };
170
171    /// Ignore the resources for visual content types.
172    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
173        "Image",
174        "Media",
175        "Font"
176    };
177
178    /// Ignore the resources for visual content types.
179    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
180        "CspViolationReport",
181        "Manifest",
182        "Other",
183        "Prefetch",
184        "Ping",
185    };
186
187    /// Case insenstive css matching
188    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
189
190    /// The command chain.
191    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
192        let enable = EnableParams::default();
193
194        if let Ok(c) = serde_json::to_value(&enable) {
195            vec![(enable.identifier(), c)]
196        } else {
197            vec![]
198        }
199    };
200
201    /// The command chain with https ignore.
202    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
203        let enable = EnableParams::default();
204        let mut v = vec![];
205        if let Ok(c) = serde_json::to_value(&enable) {
206            v.push((enable.identifier(), c));
207        }
208        let ignore = SetIgnoreCertificateErrorsParams::new(true);
209        if let Ok(ignored) = serde_json::to_value(&ignore) {
210            v.push((ignore.identifier(), ignored));
211        }
212
213        v
214    };
215
216    /// Enable the fetch intercept command
217    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
218        fetch::EnableParams::builder()
219        .handle_auth_requests(true)
220        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
221        .build()
222    };
223}
224
/// Keyword taken from a site's base domain, pre-computed in the two byte forms used for
/// matching. ASCII letters are lowercased; all other bytes are kept as-is.
#[derive(Debug, Clone, Default)]
struct SiteKeyword {
    /// First domain label with ASCII letters lowercased, e.g. b"gelcom".
    kw_lower: Box<[u8]>,
    /// The same label wrapped in slashes, e.g. b"/gelcom/".
    kw_slash: Box<[u8]>,
}

impl SiteKeyword {
    /// Build the keyword from a base domain such as `"gelcom.com"`.
    ///
    /// Surrounding whitespace and dots are stripped and the first `.`-separated label is
    /// used. Returns `None` when that label is shorter than four bytes (which also covers
    /// empty input).
    #[inline]
    fn new_from_base_domain(base: &str) -> Option<Self> {
        let domain = base.trim().trim_matches('.');
        let label = domain.split('.').next().unwrap_or(domain).trim();

        // Short labels are rejected; an empty domain ends up here as well.
        if label.len() < 4 {
            return None;
        }

        let lowered: Vec<u8> = label.to_ascii_lowercase().into_bytes();

        // "/" + keyword + "/"
        let slashed: Vec<u8> = std::iter::once(b'/')
            .chain(lowered.iter().copied())
            .chain(std::iter::once(b'/'))
            .collect();

        Some(Self {
            kw_lower: lowered.into_boxed_slice(),
            kw_slash: slashed.into_boxed_slice(),
        })
    }
}
264
/// Allocation-free, ASCII case-insensitive substring test.
///
/// Each haystack byte is ASCII-lowercased on the fly and compared with the needle byte
/// as given, so `needle_lower` must already be lowercase ASCII (an uppercase needle byte
/// can never match). An empty needle matches every haystack.
#[inline]
fn contains_ascii_ci(haystack: &[u8], needle_lower: &[u8]) -> bool {
    // `windows(0)` would panic, and an empty needle is trivially contained.
    if needle_lower.is_empty() {
        return true;
    }

    // A haystack shorter than the needle yields no windows, hence `false`.
    haystack.windows(needle_lower.len()).any(|window| {
        window
            .iter()
            .zip(needle_lower)
            .all(|(hay, needle)| hay.to_ascii_lowercase() == *needle)
    })
}
307
/// Returns `true` when `status` is one of the redirect codes 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
312
#[derive(Debug)]
/// The base network manager.
///
/// Holds request bookkeeping, interception/auth state, blocking toggles and a queue of
/// outgoing [`NetworkEvent`]s that consumers drain via `poll()`.
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    // TODO put event in an Arc?
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; interception is actually enabled/disabled by
    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is considered enabled in CDP.
    ///
    /// This is the "actual state" flag. `update_protocol_request_interception()` compares it
    /// against the desired state (`user_request_interception_enabled` or `credentials`) to
    /// decide whether a `Fetch.enable` / `Fetch.disable` command has to be queued. It is also
    /// set directly by `authenticate()`, `enable_request_intercept()` and
    /// `disable_request_intercept()`.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Whether the network is offline.
    offline: bool,
    /// The page request timeout; also used as the timeout of the init command chain.
    pub request_timeout: Duration,
    /// Ignore visuals (no pings, prefetching, and etc).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics from rendering. Enabled by default in `new()`.
    pub block_analytics: bool,
    /// Only allow HTML to load.
    pub only_html: bool,
    /// Whether the document is XML.
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the amount of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    #[cfg(feature = "_cache")]
    /// The cache site_key to use.
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
    ///
    /// `None` when there are no patterns or the matcher failed to build.
    whitelist_matcher: Option<AhoCorasick>,
    /// Cached site keyword derived from `document_target_domain` for fast related-3p allow.
    /// Computed once per navigation in `set_page_url()` and kept in sync as target domain changes.
    site_keyword: Option<SiteKeyword>,
}
432
433impl NetworkManager {
434    /// A new network manager.
435    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
436        Self {
437            queued_events: Default::default(),
438            ignore_httpserrors,
439            requests: Default::default(),
440            requests_will_be_sent: Default::default(),
441            extra_headers: Default::default(),
442            request_id_to_interception_id: Default::default(),
443            user_cache_disabled: false,
444            attempted_authentications: Default::default(),
445            credentials: None,
446            block_all: false,
447            user_request_interception_enabled: false,
448            protocol_request_interception_enabled: false,
449            offline: false,
450            request_timeout,
451            ignore_visuals: false,
452            block_javascript: false,
453            block_stylesheets: false,
454            block_analytics: true,
455            only_html: false,
456            xml_document: false,
457            intercept_manager: NetworkInterceptManager::Unknown,
458            document_reload_tracker: 0,
459            document_target_url: String::new(),
460            document_target_domain: String::new(),
461            whitelist_patterns: Vec::new(),
462            whitelist_matcher: None,
463            max_bytes_allowed: None,
464            site_keyword: None,
465            #[cfg(feature = "_cache")]
466            cache_site_key: None,
467            #[cfg(feature = "_cache")]
468            cache_policy: None,
469        }
470    }
471
472    /// Fast allow for 3rd-party URLs that embed the current site's keyword.
473    #[inline]
474    fn is_related_3rd_party_by_keyword_fast(&self, url: &str) -> bool {
475        let Some(kw) = self.site_keyword.as_ref() else {
476            return false;
477        };
478
479        let Some((host, rest)) = host_and_rest(url) else {
480            return false;
481        };
482
483        let host_b = host.as_bytes();
484        if contains_ascii_ci(host_b, &kw.kw_lower) {
485            return true;
486        }
487
488        let rest_b = rest.as_bytes();
489        if contains_ascii_ci(rest_b, &kw.kw_slash) {
490            return true;
491        }
492
493        false
494    }
495
496    /// Replace the whitelist patterns (compiled once).
497    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
498    where
499        I: IntoIterator<Item = S>,
500        S: Into<String>,
501    {
502        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
503        self.rebuild_whitelist_matcher();
504    }
505
506    /// Add one pattern (cheap) and rebuild (call this sparingly).
507    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
508        self.whitelist_patterns.push(pattern.into());
509        self.rebuild_whitelist_matcher();
510    }
511
512    /// Add many patterns and rebuild once.
513    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
514    where
515        I: IntoIterator<Item = S>,
516        S: Into<String>,
517    {
518        self.whitelist_patterns
519            .extend(patterns.into_iter().map(Into::into));
520        self.rebuild_whitelist_matcher();
521    }
522
523    #[inline]
524    fn rebuild_whitelist_matcher(&mut self) {
525        if self.whitelist_patterns.is_empty() {
526            self.whitelist_matcher = None;
527            return;
528        }
529
530        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
531
532        // If building fails (shouldn’t for simple patterns), just disable matcher.
533        self.whitelist_matcher = AhoCorasick::new(refs).ok();
534    }
535
536    #[inline]
537    fn is_whitelisted(&self, url: &str) -> bool {
538        self.whitelist_matcher
539            .as_ref()
540            .map(|m| m.is_match(url))
541            .unwrap_or(false)
542    }
543
544    /// Commands to init the chain with.
545    pub fn init_commands(&self) -> CommandChain {
546        let cmds = if self.ignore_httpserrors {
547            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
548        } else {
549            INIT_CHAIN.clone()
550        };
551        CommandChain::new(cmds, self.request_timeout)
552    }
553
554    /// Push the CDP request.
555    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
556        let method = cmd.identifier();
557        if let Ok(params) = serde_json::to_value(cmd) {
558            self.queued_events
559                .push_back(NetworkEvent::SendCdpRequest((method, params)));
560        }
561    }
562
    /// Pop the next queued event to handle.
    ///
    /// Events come out in the order they were queued; returns `None` when the queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
567
    /// Borrow the extra HTTP headers currently stored on the manager.
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
572
573    /// Set extra HTTP headers.
574    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
575        self.extra_headers = headers;
576        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
577        self.extra_headers.remove("Proxy-Authorization");
578        if !self.extra_headers.is_empty() {
579            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
580                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
581            }
582        }
583    }
584
585    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
586        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
587    }
588
    /// Set the `block_all` kill-switch flag. Only the flag is stored; no command is queued.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
592
    /// Record whether the user wants request interception, then reconcile the protocol
    /// state via `update_protocol_request_interception()`.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        self.update_protocol_request_interception();
    }
597
598    pub fn set_cache_enabled(&mut self, enabled: bool) {
599        let run = self.user_cache_disabled != !enabled;
600        self.user_cache_disabled = !enabled;
601        if run {
602            self.update_protocol_cache_disabled();
603        }
604    }
605
    /// Mark fetch interception as enabled.
    ///
    /// Only sets the `protocol_request_interception_enabled` flag; no CDP command is
    /// queued here.
    pub fn enable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
610
    /// Mark fetch interception as disabled.
    ///
    /// Only clears the `protocol_request_interception_enabled` flag; no CDP command is
    /// queued here.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = false;
    }
615
    /// Set the cache site key (`None` clears it).
    #[cfg(feature = "_cache")]
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
621
    /// Set the cache policy (`None` clears it).
    #[cfg(feature = "_cache")]
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
627
628    pub fn update_protocol_cache_disabled(&mut self) {
629        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
630    }
631
    /// Store `credentials` and turn on request interception.
    ///
    /// Reconciles the protocol interception state (which may queue `ENABLE_FETCH`) and
    /// then marks protocol interception as enabled.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        self.update_protocol_request_interception();
        self.protocol_request_interception_enabled = true;
    }
637
638    fn update_protocol_request_interception(&mut self) {
639        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
640
641        if enabled == self.protocol_request_interception_enabled {
642            return;
643        }
644
645        if enabled {
646            self.push_cdp_request(ENABLE_FETCH.clone())
647        } else {
648            self.push_cdp_request(DisableParams::default())
649        }
650    }
651
652    /// Blocklist-only script blocking.
653    /// Returns true only when the URL matches an explicit blocklist condition.
654    ///
655    /// IMPORTANT:
656    /// - Scripts are ALLOW-BY-DEFAULT.
657    /// - We only block when explicit blocklist signals match.
658    /// - We do NOT call ignore_script() here because ignore_script() treats absolute URLs as
659    ///   "ignored by default", which is the opposite of what we want.
660    #[inline]
661    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
662        // If analytics blocking is off, skip all analytics tries.
663        let block_analytics = self.block_analytics;
664
665        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
666        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
667        {
668            return true;
669        }
670
671        // 2) Custom website block list (explicit).
672        if crate::handler::blockers::block_websites::block_website(url) {
673            return true;
674        }
675
676        // 3) Path-based explicit tries / fallbacks.
677        //
678        // We run these on:
679        // - path with leading slash ("/js/app.js")
680        // - path without leading slash ("js/app.js")
681        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
682        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
683            // Remove query/fragment so matching stays stable.
684            let p_slash = Self::strip_query_fragment(path_with_slash);
685            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
686
687            // Basename for filename-only lists.
688            let base = match p_slash.rsplit('/').next() {
689                Some(b) => b,
690                None => p_slash,
691            };
692
693            // ---- Filename fallback (VERY fast) ----
694            // This is the behavior your test expects: block "analytics.js" anywhere in the path.
695            // (You can add more filename-only fallbacks here if needed.)
696            if block_analytics && (base == "analytics.js" || p_noslash.ends_with("/analytics.js")) {
697                return true;
698            }
699
700            // ---- Trie checks ----
701            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
702            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
703                return true;
704            }
705            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
706                return true;
707            }
708            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
709                return true;
710            }
711
712            // Base-path ignore tries (framework noise / known ignorable script paths).
713            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
714            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
715                return true;
716            }
717
718            // Style path ignores only when visuals are ignored.
719            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
720                return true;
721            }
722        }
723
724        false
725    }
726
727    /// Extract the absolute URL path portion WITH the leading slash.
728    ///
729    /// Example:
730    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
731    #[inline]
732    fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
733        // find scheme separator
734        let idx = url.find("//")?;
735        let after_slashes = idx + 2;
736
737        // find first slash after host
738        let slash_rel = url[after_slashes..].find('/')?;
739        let slash_idx = after_slashes + slash_rel;
740
741        if slash_idx < url.len() {
742            Some(&url[slash_idx..])
743        } else {
744            None
745        }
746    }
747
748    /// Strip query string and fragment from a path-ish string.
749    ///
750    /// Example:
751    /// - "/a/b.js?x=1#y" -> "/a/b.js"
752    #[inline]
753    fn strip_query_fragment(s: &str) -> &str {
754        let q = s.find('?');
755        let h = s.find('#');
756
757        match (q, h) {
758            (None, None) => s,
759            (Some(i), None) => &s[..i],
760            (None, Some(i)) => &s[..i],
761            (Some(i), Some(j)) => &s[..i.min(j)],
762        }
763    }
764
765    /// Determine if the request should be skipped.
766    #[inline]
767    fn skip_xhr(
768        &self,
769        skip_networking: bool,
770        event: &EventRequestPaused,
771        network_event: bool,
772    ) -> bool {
773        // XHR check
774        if !skip_networking && network_event {
775            let request_url = event.request.url.as_str();
776
777            // check if part of ignore scripts.
778            let skip_analytics =
779                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
780
781            if skip_analytics {
782                true
783            } else if self.block_stylesheets || self.ignore_visuals {
784                let block_css = self.block_stylesheets;
785                let block_media = self.ignore_visuals;
786
787                let mut block_request = false;
788
789                if let Some(position) = request_url.rfind('.') {
790                    let hlen = request_url.len();
791                    let has_asset = hlen - position;
792
793                    if has_asset >= 3 {
794                        let next_position = position + 1;
795
796                        if block_media
797                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
798                                &request_url[next_position..].into(),
799                            )
800                        {
801                            block_request = true;
802                        } else if block_css {
803                            block_request =
804                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
805                                    .contains(&**CSS_EXTENSION)
806                        }
807                    }
808                }
809
810                if !block_request {
811                    block_request = ignore_script_xhr_media(request_url);
812                }
813
814                block_request
815            } else {
816                skip_networking
817            }
818        } else {
819            skip_networking
820        }
821    }
822
823    #[cfg(feature = "adblock")]
824    #[inline]
825    /// Detect if ad enabled.
826    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
827        if skip_networking {
828            true
829        } else {
830            self.detect_ad(event)
831        }
832    }
833
834    /// When adblock feature is disabled, this is a no-op.
835    #[cfg(not(feature = "adblock"))]
836    #[inline]
837    fn detect_ad_if_enabled(&mut self, _event: &EventRequestPaused, skip_networking: bool) -> bool {
838        skip_networking
839    }
840
841    #[inline]
842    /// Fail request
843    fn fail_request_blocked(
844        &mut self,
845        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
846    ) {
847        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
848            request_id.clone(),
849            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
850        );
851        self.push_cdp_request(params);
852    }
853
854    #[inline]
855    /// Fulfill request
856    fn fulfill_request_empty_200(
857        &mut self,
858        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
859    ) {
860        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
861            request_id.clone(),
862            200,
863        );
864        self.push_cdp_request(params);
865    }
866
867    #[cfg(feature = "_cache")]
868    #[inline]
869    /// Fulfill a paused Fetch request from cached bytes + header map.
870    ///
871    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
872    fn fulfill_request_from_cache(
873        &mut self,
874        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
875        body: &[u8],
876        headers: &std::collections::HashMap<String, String>,
877        status: i64,
878    ) {
879        use crate::cdp::browser_protocol::fetch::HeaderEntry;
880        use crate::handler::network::fetch::FulfillRequestParams;
881        use base64::Engine;
882
883        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
884
885        for (k, v) in headers.iter() {
886            resp_headers.push(HeaderEntry {
887                name: k.clone().into(),
888                value: v.clone().into(),
889            });
890        }
891
892        let mut params = FulfillRequestParams::new(request_id.clone(), status);
893
894        // TODO: have this already encoded prior.
895        params.body = Some(
896            base64::engine::general_purpose::STANDARD
897                .encode(body)
898                .into(),
899        );
900
901        params.response_headers = Some(resp_headers);
902
903        self.push_cdp_request(params);
904    }
905
906    #[inline]
907    /// Continue the request url.
908    fn continue_request_with_url(
909        &mut self,
910        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
911        url: Option<&str>,
912        intercept_response: bool,
913    ) {
914        let mut params = ContinueRequestParams::new(request_id.clone());
915        if let Some(url) = url {
916            params.url = Some(url.to_string());
917            params.intercept_response = Some(intercept_response);
918        }
919        self.push_cdp_request(params);
920    }
921
922    /// On fetch request paused interception.
923    #[inline]
924    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
925        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
926            return;
927        }
928
929        let resource_type = &event.resource_type;
930
931        if self.block_all {
932            tracing::debug!(
933                "Blocked (block_all): {:?} - {}",
934                event.resource_type,
935                event.request.url
936            );
937            return self.fail_request_blocked(&event.request_id);
938        }
939
940        if let Some(network_id) = event.network_id.as_ref() {
941            if let Some(request_will_be_sent) =
942                self.requests_will_be_sent.remove(network_id.as_ref())
943            {
944                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
945            } else {
946                self.request_id_to_interception_id
947                    .insert(network_id.clone(), event.request_id.clone().into());
948            }
949        }
950
951        // From here on, we handle the full decision tree.
952        let javascript_resource = *resource_type == ResourceType::Script;
953        let document_resource = *resource_type == ResourceType::Document;
954        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
955
956        // Start with static / cheap skip checks.
957        let mut skip_networking =
958            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
959
960        // Also short-circuit if we've reloaded this document too many times.
961        if !skip_networking {
962            skip_networking = self.document_reload_tracker >= 3;
963        }
964
965        // Handle document redirect / masking and track xml documents.
966        let (current_url_cow, had_replacer) =
967            self.handle_document_replacement_and_tracking(event, document_resource);
968
969        let current_url: &str = current_url_cow.as_ref();
970
971        // Main initial check (visuals, stylesheets).
972        //
973        // IMPORTANT: Scripts are NOT blocked here anymore.
974        // Scripts are allowed by default and only blocked via explicit blocklists
975        // (adblock / block_websites / intercept_manager / URL tries).
976        if !skip_networking {
977            // Allow XSL for sitemap XML.
978            if self.xml_document && current_url.ends_with(".xsl") {
979                skip_networking = false;
980            } else {
981                skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
982            }
983        }
984
985        // Ad blocking (only active when feature = "adblock").
986        skip_networking = self.detect_ad_if_enabled(event, skip_networking);
987
988        // Ignore embedded scripts when only_html or ignore_visuals is set.
989        if !skip_networking
990            && self.block_javascript
991            && (self.only_html || self.ignore_visuals)
992            && (javascript_resource || document_resource)
993        {
994            skip_networking = ignore_script_embedded(current_url);
995        }
996
997        // Script policy: allow-by-default.
998        // Block only if explicit block list patterns match.
999        if !skip_networking && javascript_resource {
1000            skip_networking = self.should_block_script_blocklist_only(current_url);
1001        }
1002
1003        // XHR / data resources.
1004        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
1005
1006        // Custom interception layer.
1007        if !skip_networking && (javascript_resource || network_resource || document_resource) {
1008            skip_networking = self.intercept_manager.intercept_detection(
1009                current_url,
1010                self.ignore_visuals,
1011                network_resource,
1012            );
1013        }
1014
1015        // Custom website block list.
1016        if !skip_networking && (javascript_resource || network_resource) {
1017            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
1018        }
1019
1020        // whitelist 3rd party
1021        // not required unless explicit blocking.
1022        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
1023        {
1024            skip_networking = false;
1025        }
1026
1027        // check if the url is in the whitelist.
1028        if skip_networking && self.is_whitelisted(current_url) {
1029            skip_networking = false;
1030        }
1031
1032        // 3rd party match
1033        if skip_networking
1034            && (javascript_resource || *resource_type == ResourceType::Stylesheet)
1035            && self.is_related_3rd_party_by_keyword_fast(current_url)
1036        {
1037            skip_networking = false;
1038        }
1039
1040        if skip_networking {
1041            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
1042            self.fulfill_request_empty_200(&event.request_id);
1043        } else {
1044            #[cfg(feature = "_cache")]
1045            {
1046                if let (Some(policy), Some(cache_site_key)) =
1047                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
1048                {
1049                    let current_url = format!("{}:{}", event.request.method, &current_url);
1050
1051                    if let Some((res, cache_policy)) =
1052                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
1053                    {
1054                        if policy.allows_cached(&cache_policy) {
1055                            tracing::debug!(
1056                                "Remote Cached: {:?} - {}",
1057                                resource_type,
1058                                &current_url
1059                            );
1060                            return self.fulfill_request_from_cache(
1061                                &event.request_id,
1062                                &res.body,
1063                                &res.headers,
1064                                res.status as i64,
1065                            );
1066                        }
1067                    }
1068                }
1069            }
1070
1071            // check our frame cache for the run.
1072            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
1073            self.continue_request_with_url(
1074                &event.request_id,
1075                if had_replacer {
1076                    Some(current_url)
1077                } else {
1078                    None
1079                },
1080                !had_replacer,
1081            );
1082        }
1083    }
1084
1085    /// Shared "visuals + basic blocking" logic.
1086    ///
1087    /// IMPORTANT: Scripts are NOT blocked here anymore.
1088    /// Scripts are allowed by default and only blocked via explicit blocklists
1089    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1090    #[inline]
1091    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1092        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1093            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1094    }
1095
1096    /// Does the network manager have a target domain?
1097    pub fn has_target_domain(&self) -> bool {
1098        !self.document_target_url.is_empty()
1099    }
1100
1101    /// Set the target page url for tracking.
1102    pub fn set_page_url(&mut self, page_target_url: String) {
1103        let host_base = host_and_rest(&page_target_url)
1104            .map(|(h, _)| base_domain_from_host(h))
1105            .unwrap_or("");
1106
1107        self.document_target_domain = host_base.to_string();
1108        self.document_target_url = page_target_url;
1109        // 3rd party guard.
1110        self.site_keyword = SiteKeyword::new_from_base_domain(&self.document_target_domain);
1111    }
1112
1113    /// Clear the initial target domain on every navigation.
1114    pub fn clear_target_domain(&mut self) {
1115        self.document_reload_tracker = 0;
1116        self.document_target_url = Default::default();
1117        self.document_target_domain = Default::default();
1118        self.site_keyword = None;
1119    }
1120
1121    /// Handles:
1122    /// - document reload tracking (`document_reload_tracker`)
1123    /// - redirect masking / replacement
1124    /// - xml document detection (`xml_document`)
1125    /// - `document_target_url` updates
1126    ///
1127    /// Returns (current_url, had_replacer).
1128    #[inline]
1129    fn handle_document_replacement_and_tracking<'a>(
1130        &mut self,
1131        event: &'a EventRequestPaused,
1132        document_resource: bool,
1133    ) -> (Cow<'a, str>, bool) {
1134        let mut replacer: Option<String> = None;
1135        let current_url = event.request.url.as_str();
1136
1137        if document_resource {
1138            if self.document_target_url == current_url {
1139                self.document_reload_tracker += 1;
1140            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1141            {
1142                let (http_document_replacement, mut https_document_replacement) =
1143                    if self.document_target_url.starts_with("http://") {
1144                        (
1145                            self.document_target_url.replacen("http://", "http//", 1),
1146                            self.document_target_url.replacen("http://", "https://", 1),
1147                        )
1148                    } else {
1149                        (
1150                            self.document_target_url.replacen("https://", "https//", 1),
1151                            self.document_target_url.replacen("https://", "http://", 1),
1152                        )
1153                    };
1154
1155                // Track trailing slash to restore later.
1156                let trailing = https_document_replacement.ends_with('/');
1157                if trailing {
1158                    https_document_replacement.pop();
1159                }
1160                if https_document_replacement.ends_with('/') {
1161                    https_document_replacement.pop();
1162                }
1163
1164                let redirect_mask = format!(
1165                    "{}{}",
1166                    https_document_replacement, http_document_replacement
1167                );
1168
1169                if current_url == redirect_mask {
1170                    replacer = Some(if trailing {
1171                        format!("{}/", https_document_replacement)
1172                    } else {
1173                        https_document_replacement
1174                    });
1175                }
1176            }
1177
1178            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1179                self.xml_document = true;
1180            }
1181
1182            // Track last seen document URL.
1183            self.document_target_url = event.request.url.clone();
1184            self.document_target_domain = host_and_rest(&self.document_target_url)
1185                .map(|(h, _)| base_domain_from_host(h).to_string())
1186                .unwrap_or_default();
1187
1188            self.site_keyword = SiteKeyword::new_from_base_domain(&self.document_target_domain);
1189        }
1190
1191        let current_url_cow = match replacer {
1192            Some(r) => Cow::Owned(r),
1193            None => Cow::Borrowed(event.request.url.as_str()),
1194        };
1195
1196        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1197        (current_url_cow, had_replacer)
1198    }
1199
1200    /// Perform a page intercept for chrome
1201    #[cfg(feature = "adblock")]
1202    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1203        use adblock::{
1204            lists::{FilterSet, ParseOptions, RuleTypes},
1205            Engine,
1206        };
1207
1208        lazy_static::lazy_static! {
1209            static ref AD_ENGINE: Engine = {
1210                let mut filter_set = FilterSet::new(false);
1211                let mut rules = ParseOptions::default();
1212                rules.rule_types = RuleTypes::All;
1213
1214                filter_set.add_filters(
1215                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1216                    rules,
1217                );
1218
1219                Engine::from_filter_set(filter_set, true)
1220            };
1221        };
1222
1223        let blockable = ResourceType::Image == event.resource_type
1224            || event.resource_type == ResourceType::Media
1225            || event.resource_type == ResourceType::Stylesheet
1226            || event.resource_type == ResourceType::Document
1227            || event.resource_type == ResourceType::Fetch
1228            || event.resource_type == ResourceType::Xhr;
1229
1230        let u = &event.request.url;
1231
1232        let block_request = blockable
1233            // set it to example.com for 3rd party handling is_same_site
1234        && {
1235            let request = adblock::request::Request::preparsed(
1236                 &u,
1237                 "example.com",
1238                 "example.com",
1239                 &event.resource_type.as_ref().to_lowercase(),
1240                 !event.request.is_same_site.unwrap_or_default());
1241
1242            AD_ENGINE.check_network_request(&request).matched
1243        };
1244
1245        block_request
1246    }
1247
1248    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1249        let response = if self
1250            .attempted_authentications
1251            .contains(event.request_id.as_ref())
1252        {
1253            AuthChallengeResponseResponse::CancelAuth
1254        } else if self.credentials.is_some() {
1255            self.attempted_authentications
1256                .insert(event.request_id.clone().into());
1257            AuthChallengeResponseResponse::ProvideCredentials
1258        } else {
1259            AuthChallengeResponseResponse::Default
1260        };
1261
1262        let mut auth = AuthChallengeResponse::new(response);
1263        if let Some(creds) = self.credentials.clone() {
1264            auth.username = Some(creds.username);
1265            auth.password = Some(creds.password);
1266        }
1267        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1268    }
1269
1270    /// Set the page offline network emulation condition.
1271    pub fn set_offline_mode(&mut self, value: bool) {
1272        if self.offline == value {
1273            return;
1274        }
1275        self.offline = value;
1276        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1277            .offline(self.offline)
1278            .latency(0)
1279            .download_throughput(-1.)
1280            .upload_throughput(-1.)
1281            .build()
1282        {
1283            self.push_cdp_request(network);
1284        }
1285    }
1286
1287    /// Request interception doesn't happen for data URLs with Network Service.
1288    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1289        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1290            if let Some(interception_id) = self
1291                .request_id_to_interception_id
1292                .remove(event.request_id.as_ref())
1293            {
1294                self.on_request(event, Some(interception_id));
1295            } else {
1296                // TODO remove the clone for event
1297                self.requests_will_be_sent
1298                    .insert(event.request_id.clone(), event.clone());
1299            }
1300        } else {
1301            self.on_request(event, None);
1302        }
1303    }
1304
1305    /// The request was served from the cache.
1306    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1307        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1308            request.from_memory_cache = true;
1309        }
1310    }
1311
1312    /// On network response received.
1313    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1314        let mut request_failed = false;
1315
1316        // Track how many bytes we actually deducted from this target.
1317        let mut deducted: u64 = 0;
1318
1319        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1320            let before = *max_bytes;
1321
1322            // encoded_data_length -> saturating cast to u64
1323            let received_bytes: u64 = event.response.encoded_data_length as u64;
1324
1325            // Safe parse of Content-Length
1326            let content_length: Option<u64> = event
1327                .response
1328                .headers
1329                .inner()
1330                .get("content-length")
1331                .and_then(|v| v.as_str())
1332                .and_then(|s| s.trim().parse::<u64>().ok());
1333
1334            // Deduct what we actually received
1335            *max_bytes = max_bytes.saturating_sub(received_bytes);
1336
1337            // If the declared size can't fit, zero out now
1338            if let Some(cl) = content_length {
1339                if cl > *max_bytes {
1340                    *max_bytes = 0;
1341                }
1342            }
1343
1344            request_failed = *max_bytes == 0;
1345
1346            // Compute exact delta deducted on this event
1347            deducted = before.saturating_sub(*max_bytes);
1348        }
1349
1350        // Bubble up the deduction (even if request continues)
1351        if deducted > 0 {
1352            self.queued_events
1353                .push_back(NetworkEvent::BytesConsumed(deducted));
1354        }
1355
1356        // block all network request moving forward.
1357        if request_failed && self.max_bytes_allowed.is_some() {
1358            self.set_block_all(true);
1359        }
1360
1361        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1362            request.set_response(event.response.clone());
1363            self.queued_events.push_back(if request_failed {
1364                NetworkEvent::RequestFailed(request)
1365            } else {
1366                NetworkEvent::RequestFinished(request)
1367            });
1368        }
1369    }
1370
1371    /// On network loading finished.
1372    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1373        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1374            if let Some(interception_id) = request.interception_id.as_ref() {
1375                self.attempted_authentications
1376                    .remove(interception_id.as_ref());
1377            }
1378            self.queued_events
1379                .push_back(NetworkEvent::RequestFinished(request));
1380        }
1381    }
1382
1383    /// On network loading failed.
1384    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1385        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1386            request.failure_text = Some(event.error_text.clone());
1387            if let Some(interception_id) = request.interception_id.as_ref() {
1388                self.attempted_authentications
1389                    .remove(interception_id.as_ref());
1390            }
1391            self.queued_events
1392                .push_back(NetworkEvent::RequestFailed(request));
1393        }
1394    }
1395
1396    /// On request will be sent.
1397    fn on_request(
1398        &mut self,
1399        event: &EventRequestWillBeSent,
1400        interception_id: Option<InterceptionId>,
1401    ) {
1402        let mut redirect_chain = Vec::new();
1403        let mut redirect_location = None;
1404
1405        if let Some(redirect_resp) = &event.redirect_response {
1406            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1407                if is_redirect_status(redirect_resp.status) {
1408                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1409                        if redirect_resp.url != location {
1410                            let fixed_location = location.replace(&redirect_resp.url, "");
1411
1412                            if !fixed_location.is_empty() {
1413                                request.response.as_mut().map(|resp| {
1414                                    resp.headers.0["Location"] =
1415                                        serde_json::Value::String(fixed_location.clone());
1416                                });
1417                            }
1418
1419                            redirect_location = Some(fixed_location);
1420                        }
1421                    }
1422                }
1423
1424                self.handle_request_redirect(
1425                    &mut request,
1426                    if let Some(redirect_location) = redirect_location {
1427                        let mut redirect_resp = redirect_resp.clone();
1428
1429                        if !redirect_location.is_empty() {
1430                            redirect_resp.headers.0["Location"] =
1431                                serde_json::Value::String(redirect_location);
1432                        }
1433
1434                        redirect_resp
1435                    } else {
1436                        redirect_resp.clone()
1437                    },
1438                );
1439
1440                redirect_chain = std::mem::take(&mut request.redirect_chain);
1441                redirect_chain.push(request);
1442            }
1443        }
1444
1445        let request = HttpRequest::new(
1446            event.request_id.clone(),
1447            event.frame_id.clone(),
1448            interception_id,
1449            self.user_request_interception_enabled,
1450            redirect_chain,
1451        );
1452
1453        self.requests.insert(event.request_id.clone(), request);
1454        self.queued_events
1455            .push_back(NetworkEvent::Request(event.request_id.clone()));
1456    }
1457
1458    /// Handle request redirect.
1459    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1460        request.set_response(response);
1461        if let Some(interception_id) = request.interception_id.as_ref() {
1462            self.attempted_authentications
1463                .remove(interception_id.as_ref());
1464        }
1465    }
1466}
1467
/// Events queued by the network handling code in this file (pushed onto
/// `queued_events`).
#[derive(Debug)]
pub enum NetworkEvent {
    /// Send a CDP request, given as a method id and a JSON value
    /// (presumably the serialized parameters; confirm at the producer).
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request was registered; carries its request id.
    Request(RequestId),
    /// Response; carries the request id.
    Response(RequestId),
    /// Request failed; carries the request that was being tracked.
    RequestFailed(HttpRequest),
    /// Request finished; carries the request that was being tracked.
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the byte budget by one response.
    BytesConsumed(u64),
}
1483
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// The 3rd party allow list matches known-good scripts and nothing else.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// Scripts that match no block list are allowed.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // A random script that should not match your block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    /// Scripts that match the ignore trie / block list are blocked.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    /// URLs embedding the site keyword (in host or path) count as related.
    #[test]
    fn test_related_3rd_party_keyword_fast_gelcom() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.gelcom.de/".to_string());

        // path embeds /gelcom/
        let a = "https://tags-eu.tiqcdn.com/utag/gelcom/oneshop-eu/prod/utag.js";
        assert!(nm.is_related_3rd_party_by_keyword_fast(a));

        // host embeds gelcom
        let b = "https://www2.gelcom.de/forward/ablyft-cdn/s/55651514.js";
        assert!(nm.is_related_3rd_party_by_keyword_fast(b));

        // host embeds gelcom
        let c = "https://ebs01.gelcom.de/resout/legalnote-replacer/legalnote-replacer-oneshop.js";
        assert!(nm.is_related_3rd_party_by_keyword_fast(c));

        // unrelated
        let d = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(!nm.is_related_3rd_party_by_keyword_fast(d));
    }
}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs