chromiumoxide/handler/
network.rs

1#[cfg(any(feature = "adblock", feature = "firewall"))]
2use super::blockers::block_websites::block_ads;
3use super::blockers::{
4    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
5    xhr::IGNORE_XHR_ASSETS,
6};
7use crate::auth::Credentials;
8#[cfg(feature = "_cache")]
9use crate::cache::BasicCachePolicy;
10use crate::cmd::CommandChain;
11use crate::handler::http::HttpRequest;
12use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
13use aho_corasick::AhoCorasick;
14use case_insensitive_string::CaseInsensitiveString;
15use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
16use chromiumoxide_cdp::cdp::browser_protocol::network::{
17    EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
18    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
19    InterceptionId, NetworkConditions, RequestId, ResourceType, Response, SetCacheDisabledParams,
20    SetExtraHttpHeadersParams,
21};
22use chromiumoxide_cdp::cdp::browser_protocol::{
23    fetch::{
24        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
25        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
26    },
27    network::SetBypassServiceWorkerParams,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::{
30    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
31};
32use chromiumoxide_types::{Command, Method, MethodId};
33use hashbrown::{HashMap, HashSet};
34use lazy_static::lazy_static;
35use reqwest::header::PROXY_AUTHORIZATION;
36use spider_network_blocker::intercept_manager::NetworkInterceptManager;
37pub use spider_network_blocker::scripts::{
38    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
39};
40use std::borrow::Cow;
41use std::collections::VecDeque;
42use std::time::{Duration, Instant};
43
lazy_static! {
    /// Substring patterns (library names, common bundle names, and a few
    /// verified third-party URL prefixes) used to build [`ALLOWED_MATCHER`].
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",  // Covers Wordpress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/",
        "https://google.com/recaptcha/api.js",
        "https://www.gstatic.com/recaptcha/",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://api.leminnow.com/captcha/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://client-api.arkoselabs.com/",
        "https://www.capy.me/puzzle/",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];

    /// Determine if a script should be rendered in the browser by name.
    ///
    /// Substring matcher compiled from `JS_FRAMEWORK_ALLOW`.
    ///
    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
    /// but we keep it for compatibility and other call sites.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// Third-party URL patterns (captcha / challenge providers, payment and
    /// CDN hosts) used to build [`ALLOWED_MATCHER_3RD_PARTY`].
    ///
    /// NOTE(review): "https://captcha.px-cloud.net/" appears twice in this
    /// list; the duplicate looks unintentional.
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.capy.me/puzzle/",
        "https://www.gstatic.com/recaptcha/",
        "https://google.com/recaptcha/",
        "https://www.google.com/recaptcha/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://newassets.hcaptcha.com/",
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://api.leminnow.com/captcha/",
        "https://client-api.arkoselabs.com/",
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "discourse-cdn.com/",
        "hcaptcha.com",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];

    /// Substring matcher compiled from `JS_FRAMEWORK_ALLOW_3RD_PARTY`.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path fragments that identify a JS framework's asset directory.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
            "_astro/", "_app/immutable"
        }
    };

    /// Ignore the content types.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Ignore the resources for visual content types.
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Ignore the resources for networking-only resource types
    /// (CSP violation reports and pings).
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Ping",
    };

    /// Case-insensitive CSS extension matching.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// The init command chain: just the network `EnableParams` command.
    ///
    /// Empty if the params fail to serialize.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// The init command chain that additionally sends
    /// `SetIgnoreCertificateErrorsParams::new(true)` after the enable command.
    ///
    /// Each command is skipped individually if it fails to serialize.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// Enable the fetch intercept command.
    ///
    /// Handles auth requests and pauses every URL (`"*"`) at the request stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
        .build()
    };
}
229
/// Report whether `status` is one of the HTTP redirect codes this crate
/// follows: 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    match status {
        301..=303 | 307 | 308 => true,
        _ => false,
    }
}
234
/// How long (in seconds) a buffered `requests_will_be_sent` /
/// `request_id_to_interception_id` entry may linger before
/// `NetworkManager::evict_stale_entries` drops it. 30 seconds is generous —
/// the CDP round-trip that reconciles the two racing events normally
/// completes in milliseconds.
const STALE_BUFFER_SECS: u64 = 30;
240
/// How long (in seconds) an in-flight request entry (`requests` map) can live
/// without being resolved by a `loadingFinished` / `loadingFailed` /
/// `loadingCanceled` event before `NetworkManager::evict_stale_entries`
/// considers it orphaned and evicts it. Longer than the race-condition buffer
/// timeout because real requests can legitimately take tens of seconds
/// (streaming, slow origins, etc.).
const STALE_REQUEST_SECS: u64 = 120;
247
/// Wrapper around a shared (`Arc`) `adblock::Engine` that implements `Debug`,
/// so it can be stored in types that derive `Debug`.
#[cfg(feature = "adblock")]
pub struct AdblockEngine(std::sync::Arc<adblock::Engine>);
251
#[cfg(feature = "adblock")]
impl std::fmt::Debug for AdblockEngine {
    /// Print only the type name; the wrapped engine is not rendered.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("AdblockEngine")
    }
}
258
#[cfg(feature = "adblock")]
impl std::ops::Deref for AdblockEngine {
    type Target = adblock::Engine;

    /// Borrow the wrapped engine so its methods can be called directly.
    fn deref(&self) -> &adblock::Engine {
        &*self.0
    }
}
266
#[derive(Debug)]
/// The base network manager.
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    /// Entries older than `STALE_REQUEST_SECS` are evicted by `evict_stale_entries()`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    /// Entries older than `STALE_BUFFER_SECS` are evicted to prevent unbounded growth.
    requests_will_be_sent: HashMap<RequestId, (EventRequestWillBeSent, Instant)>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    /// Entries older than `STALE_BUFFER_SECS` are evicted to prevent unbounded growth.
    request_id_to_interception_id: HashMap<RequestId, (InterceptionId, Instant)>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as “disabled” to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the “intent” flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; interception is actually enabled/disabled by
    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is currently enabled in CDP.
    ///
    /// This is the “actual state” flag. `update_protocol_request_interception()` only *reads*
    /// it to decide whether a `Fetch.enable` / `Fetch.disable` command must be queued; within
    /// this file it is written by `enable_request_intercept()`, `disable_request_intercept()`
    /// and `authenticate()`.
    pub(crate) protocol_request_interception_enabled: bool,
    /// The network is offline.
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    /// Ignore visuals (no pings, prefetching, etc.).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics from rendering.
    pub block_analytics: bool,
    /// Block pre-fetch requests.
    pub block_prefetch: bool,
    /// Restrict loading to HTML only.
    pub only_html: bool,
    /// Is xml document?
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the number of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    /// Cap on main-frame Document redirect hops before the navigation is aborted.
    ///
    /// `None` disables enforcement (default, preserves prior behavior). When `Some(n)`,
    /// the (n+1)th Document redirect short-circuits: a synthetic `RequestFailed` event
    /// is emitted with `failure_text = "net::ERR_TOO_MANY_REDIRECTS"` and
    /// `Page.stopLoading` is dispatched to abort in-flight navigation. The accumulated
    /// `redirect_chain` is preserved on the failed request so consumers can inspect it.
    pub max_redirects: Option<usize>,
    #[cfg(feature = "_cache")]
    /// The cache site_key to use.
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
    whitelist_matcher: Option<AhoCorasick>,
    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
    blacklist_patterns: Vec<String>,
    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
    blacklist_matcher: Option<AhoCorasick>,
    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
    blacklist_strict: bool,
    /// Custom adblock engine built from user-supplied filter rules.
    /// When `Some`, takes precedence over the global default engine.
    #[cfg(feature = "adblock")]
    adblock_engine: Option<AdblockEngine>,
}
404
405impl NetworkManager {
406    /// A new network manager.
407    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
408        Self {
409            queued_events: Default::default(),
410            ignore_httpserrors,
411            requests: Default::default(),
412            requests_will_be_sent: Default::default(),
413            extra_headers: Default::default(),
414            request_id_to_interception_id: Default::default(),
415            user_cache_disabled: false,
416            attempted_authentications: Default::default(),
417            credentials: None,
418            block_all: false,
419            user_request_interception_enabled: false,
420            protocol_request_interception_enabled: false,
421            offline: false,
422            request_timeout,
423            ignore_visuals: false,
424            block_javascript: false,
425            block_stylesheets: false,
426            block_prefetch: true,
427            block_analytics: true,
428            only_html: false,
429            xml_document: false,
430            intercept_manager: NetworkInterceptManager::Unknown,
431            document_reload_tracker: 0,
432            document_target_url: String::new(),
433            document_target_domain: String::new(),
434            whitelist_patterns: Vec::new(),
435            whitelist_matcher: None,
436            blacklist_patterns: Vec::new(),
437            blacklist_matcher: None,
438            blacklist_strict: true,
439            max_bytes_allowed: None,
440            max_redirects: None,
441            #[cfg(feature = "_cache")]
442            cache_site_key: None,
443            #[cfg(feature = "_cache")]
444            cache_policy: None,
445            #[cfg(feature = "adblock")]
446            adblock_engine: None,
447        }
448    }
449
450    /// Set a custom adblock engine built from user-supplied filter rules.
451    #[cfg(feature = "adblock")]
452    pub fn set_adblock_engine(&mut self, engine: std::sync::Arc<adblock::Engine>) {
453        self.adblock_engine = Some(AdblockEngine(engine));
454    }
455
456    /// Replace the whitelist patterns (compiled once).
457    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
458    where
459        I: IntoIterator<Item = S>,
460        S: Into<String>,
461    {
462        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
463        self.rebuild_whitelist_matcher();
464    }
465
466    /// Replace the blacklist patterns (compiled once).
467    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
468    where
469        I: IntoIterator<Item = S>,
470        S: Into<String>,
471    {
472        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
473        self.rebuild_blacklist_matcher();
474    }
475
476    /// Add one pattern (cheap) and rebuild (call this sparingly).
477    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
478        self.blacklist_patterns.push(pattern.into());
479        self.rebuild_blacklist_matcher();
480    }
481
482    /// Add many patterns and rebuild once.
483    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
484    where
485        I: IntoIterator<Item = S>,
486        S: Into<String>,
487    {
488        self.blacklist_patterns
489            .extend(patterns.into_iter().map(Into::into));
490        self.rebuild_blacklist_matcher();
491    }
492
493    /// Clear blacklist entirely.
494    pub fn clear_blacklist(&mut self) {
495        self.blacklist_patterns.clear();
496        self.blacklist_matcher = None;
497    }
498
    /// Control precedence: when `strict` is true, the blacklist always wins.
    ///
    /// Only stores the flag; the matcher is not rebuilt.
    pub fn set_blacklist_strict(&mut self, strict: bool) {
        self.blacklist_strict = strict;
    }
503
504    #[inline]
505    fn rebuild_blacklist_matcher(&mut self) {
506        if self.blacklist_patterns.is_empty() {
507            self.blacklist_matcher = None;
508            return;
509        }
510
511        self.blacklist_matcher =
512            AhoCorasick::new(self.blacklist_patterns.iter().map(|s| s.as_str())).ok();
513    }
514
515    #[inline]
516    fn is_blacklisted(&self, url: &str) -> bool {
517        self.blacklist_matcher
518            .as_ref()
519            .map(|m| m.is_match(url))
520            .unwrap_or(false)
521    }
522
523    /// Add one pattern (cheap) and rebuild (call this sparingly).
524    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
525        self.whitelist_patterns.push(pattern.into());
526        self.rebuild_whitelist_matcher();
527    }
528
529    /// Add many patterns and rebuild once.
530    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
531    where
532        I: IntoIterator<Item = S>,
533        S: Into<String>,
534    {
535        self.whitelist_patterns
536            .extend(patterns.into_iter().map(Into::into));
537        self.rebuild_whitelist_matcher();
538    }
539
540    #[inline]
541    fn rebuild_whitelist_matcher(&mut self) {
542        if self.whitelist_patterns.is_empty() {
543            self.whitelist_matcher = None;
544            return;
545        }
546
547        // If building fails (shouldn’t for simple patterns), just disable matcher.
548        self.whitelist_matcher =
549            AhoCorasick::new(self.whitelist_patterns.iter().map(|s| s.as_str())).ok();
550    }
551
552    #[inline]
553    fn is_whitelisted(&self, url: &str) -> bool {
554        self.whitelist_matcher
555            .as_ref()
556            .map(|m| m.is_match(url))
557            .unwrap_or(false)
558    }
559
560    /// Commands to init the chain with.
561    pub fn init_commands(&self) -> CommandChain {
562        let cmds = if self.ignore_httpserrors {
563            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
564        } else {
565            INIT_CHAIN.clone()
566        };
567        CommandChain::new(cmds, self.request_timeout)
568    }
569
570    /// Push the CDP request.
571    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
572        let method = cmd.identifier();
573        if let Ok(params) = serde_json::to_value(cmd) {
574            self.queued_events
575                .push_back(NetworkEvent::SendCdpRequest((method, params)));
576        }
577    }
578
    /// The next event to handle.
    ///
    /// Pops the oldest queued event; returns `None` when the queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
583
584    /// Evict stale entries from the race-condition buffers and from
585    /// `attempted_authentications`. Call this periodically (e.g. from the
586    /// handler's eviction tick) so that lost CDP events cannot cause unbounded
587    /// map growth.
588    pub fn evict_stale_entries(&mut self, now: Instant) {
589        let cutoff = now - Duration::from_secs(STALE_BUFFER_SECS);
590
591        self.requests_will_be_sent.retain(|_, (_, ts)| *ts > cutoff);
592        self.request_id_to_interception_id
593            .retain(|_, (_, ts)| *ts > cutoff);
594
595        // Evict orphaned in-flight requests whose completion events
596        // (`loadingFinished` / `loadingFailed` / `loadingCanceled`) were
597        // never received.  Uses a longer timeout than the race-condition
598        // buffers since real requests can legitimately be long-lived.
599        let request_cutoff = now - Duration::from_secs(STALE_REQUEST_SECS);
600        self.requests
601            .retain(|_, req| req.created_at > request_cutoff);
602
603        // `attempted_authentications` entries reference interception IDs that
604        // are cleaned up on loading-finished / loading-failed. If those events
605        // are lost, the set grows forever. Cross-reference with `requests`:
606        // any interception ID that no longer appears in a live request is stale.
607        if !self.attempted_authentications.is_empty() {
608            let live: HashSet<&str> = self
609                .requests
610                .values()
611                .filter_map(|r| r.interception_id.as_ref().map(|id| id.as_ref()))
612                .collect();
613            self.attempted_authentications
614                .retain(|id| live.contains(id.as_ref()));
615        }
616    }
617
    /// Get the extra headers currently stored on the manager
    /// (as last set through `set_extra_headers`).
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
622
623    /// Set extra HTTP headers.
624    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
625        self.extra_headers = headers;
626        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
627        self.extra_headers.remove("Proxy-Authorization");
628        if !self.extra_headers.is_empty() {
629            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
630                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
631            }
632        }
633    }
634
    /// Queue a `SetBypassServiceWorkerParams` command carrying `bypass`.
    ///
    /// NOTE(review): despite the method name, the argument is forwarded as the
    /// *bypass* flag, so `true` presumably turns service workers off rather
    /// than on — confirm against callers.
    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
    }
638
    /// Set the `block_all` kill-switch flag. Only the flag is stored; no CDP
    /// command is queued here.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
642
    /// Record whether the user wants request interception, then reconcile the
    /// protocol state via `update_protocol_request_interception()`.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        self.update_protocol_request_interception();
    }
647
648    pub fn set_cache_enabled(&mut self, enabled: bool) {
649        let run = self.user_cache_disabled == enabled;
650        self.user_cache_disabled = !enabled;
651        if run {
652            self.update_protocol_cache_disabled();
653        }
654    }
655
    /// Enable fetch interception.
    ///
    /// Only marks `protocol_request_interception_enabled` as `true`; no CDP
    /// command is queued here.
    pub fn enable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
660
    /// Disable fetch interception.
    ///
    /// Only marks `protocol_request_interception_enabled` as `false`; no CDP
    /// command is queued here.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = false;
    }
665
    /// Set the cache site key (`None` clears it).
    #[cfg(feature = "_cache")]
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
671
    /// Set the cache policy (`None` clears it).
    #[cfg(feature = "_cache")]
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
677
    /// Queue a `SetCacheDisabledParams` command reflecting the current
    /// `user_cache_disabled` value.
    pub fn update_protocol_cache_disabled(&mut self) {
        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
    }
681
    /// Store `credentials` and make sure fetch interception is switched on.
    ///
    /// Statement order matters: `update_protocol_request_interception()`
    /// compares the desired state with `protocol_request_interception_enabled`
    /// and only queues the enable command when they differ, so the flag must
    /// be set *after* that call.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        self.update_protocol_request_interception();
        self.protocol_request_interception_enabled = true;
    }
687
688    fn update_protocol_request_interception(&mut self) {
689        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
690
691        if enabled == self.protocol_request_interception_enabled {
692            return;
693        }
694
695        if enabled {
696            self.push_cdp_request(ENABLE_FETCH.clone())
697        } else {
698            self.push_cdp_request(DisableParams::default())
699        }
700    }
701
702    /// Blocklist-only script blocking.
703    /// Returns true only when the URL matches an explicit blocklist condition.
704    #[inline]
705    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
706        // If analytics blocking is off, skip all analytics tries.
707        let block_analytics = self.block_analytics;
708
709        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
710        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
711        {
712            return true;
713        }
714
715        // 2) Custom website block list (explicit).
716        if crate::handler::blockers::block_websites::block_website(url) {
717            return true;
718        }
719
720        // 3) Path-based explicit tries / fallbacks.
721        //
722        // We run these on:
723        // - path with leading slash ("/js/app.js")
724        // - path without leading slash ("js/app.js")
725        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
726        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
727            // Remove query/fragment so matching stays stable.
728            let p_slash = Self::strip_query_fragment(path_with_slash);
729            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
730
731            // Basename for filename-only lists.
732            let base = match p_slash.rsplit('/').next() {
733                Some(b) => b,
734                None => p_slash,
735            };
736
737            // ---- Trie checks ----
738            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
739            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
740                return true;
741            }
742            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
743                return true;
744            }
745            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
746                return true;
747            }
748
749            // Base-path ignore tries (framework noise / known ignorable script paths).
750            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
751            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
752                return true;
753            }
754
755            // Style path ignores only when visuals are ignored.
756            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
757                return true;
758            }
759        }
760
761        false
762    }
763
764    /// Extract the absolute URL path portion WITH the leading slash.
765    ///
766    /// Example:
767    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
768    #[inline]
769    fn url_path_with_leading_slash(url: &str) -> Option<&str> {
770        // find scheme separator
771        let bytes = url.as_bytes();
772        let idx = memchr::memmem::find(bytes, b"//")?;
773        let after_slashes = idx + 2;
774
775        // find first slash after host
776        let slash_rel = memchr::memchr(b'/', &bytes[after_slashes..])?;
777        let slash_idx = after_slashes + slash_rel;
778
779        if slash_idx < url.len() {
780            Some(&url[slash_idx..])
781        } else {
782            None
783        }
784    }
785
786    /// Strip query string and fragment from a path-ish string.
787    ///
788    /// Example:
789    /// - "/a/b.js?x=1#y" -> "/a/b.js"
790    #[inline]
791    fn strip_query_fragment(s: &str) -> &str {
792        match memchr::memchr2(b'?', b'#', s.as_bytes()) {
793            Some(i) => &s[..i],
794            None => s,
795        }
796    }
797
798    /// Determine if the request should be skipped.
799    #[inline]
800    fn skip_xhr(
801        &self,
802        skip_networking: bool,
803        event: &EventRequestPaused,
804        network_event: bool,
805    ) -> bool {
806        // XHR check
807        if !skip_networking && network_event {
808            let request_url = event.request.url.as_str();
809
810            // check if part of ignore scripts.
811            let skip_analytics =
812                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
813
814            if skip_analytics {
815                true
816            } else if self.block_stylesheets || self.ignore_visuals {
817                let block_css = self.block_stylesheets;
818                let block_media = self.ignore_visuals;
819
820                let mut block_request = false;
821
822                if let Some(position) = memchr::memrchr(b'.', request_url.as_bytes()) {
823                    let hlen = request_url.len();
824                    let has_asset = hlen - position;
825
826                    if has_asset >= 3 {
827                        let next_position = position + 1;
828
829                        if block_media
830                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
831                                &request_url[next_position..].into(),
832                            )
833                        {
834                            block_request = true;
835                        } else if block_css {
836                            block_request = CaseInsensitiveString::from(
837                                &request_url.as_bytes()[next_position..],
838                            )
839                            .contains(&**CSS_EXTENSION)
840                        }
841                    }
842                }
843
844                if !block_request {
845                    block_request = ignore_script_xhr_media(request_url);
846                }
847
848                block_request
849            } else {
850                skip_networking
851            }
852        } else {
853            skip_networking
854        }
855    }
856
857    #[cfg(feature = "adblock")]
858    #[inline]
859    /// Detect if ad enabled.
860    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
861        if skip_networking {
862            true
863        } else {
864            block_ads(&event.request.url) || self.detect_ad(event)
865        }
866    }
867
868    /// When adblock feature is disabled, this is a no-op.
869    #[cfg(not(feature = "adblock"))]
870    #[inline]
871    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
872        use crate::handler::blockers::block_websites::block_ads;
873        if skip_networking {
874            true
875        } else {
876            block_ads(&event.request.url)
877        }
878    }
879
880    #[inline]
881    /// Fail request
882    fn fail_request_blocked(
883        &mut self,
884        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
885    ) {
886        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
887            request_id.clone(),
888            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
889        );
890        self.push_cdp_request(params);
891    }
892
893    #[inline]
894    /// Fulfill request
895    fn fulfill_request_empty_200(
896        &mut self,
897        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
898    ) {
899        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
900            request_id.clone(),
901            200,
902        );
903        self.push_cdp_request(params);
904    }
905
906    #[cfg(feature = "_cache")]
907    #[inline]
908    /// Fulfill a paused Fetch request from cached bytes + header map.
909    ///
910    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
911    fn fulfill_request_from_cache(
912        &mut self,
913        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
914        body: &[u8],
915        headers: &std::collections::HashMap<String, String>,
916        status: i64,
917    ) {
918        use crate::cdp::browser_protocol::fetch::HeaderEntry;
919        use crate::handler::network::fetch::FulfillRequestParams;
920        use base64::Engine;
921
922        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
923
924        for (k, v) in headers.iter() {
925            resp_headers.push(HeaderEntry {
926                name: k.clone(),
927                value: v.clone(),
928            });
929        }
930
931        let mut params = FulfillRequestParams::new(request_id.clone(), status);
932
933        // TODO: have this already encoded prior.
934        params.body = Some(
935            base64::engine::general_purpose::STANDARD
936                .encode(body)
937                .into(),
938        );
939
940        params.response_headers = Some(resp_headers);
941
942        self.push_cdp_request(params);
943    }
944
945    #[inline]
946    /// Continue the request url.
947    fn continue_request_with_url(
948        &mut self,
949        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
950        url: Option<&str>,
951        intercept_response: bool,
952    ) {
953        let mut params = ContinueRequestParams::new(request_id.clone());
954        if let Some(url) = url {
955            params.url = Some(url.to_string());
956            params.intercept_response = Some(intercept_response);
957        }
958        self.push_cdp_request(params);
959    }
960
961    /// On fetch request paused interception.
962    #[inline]
963    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
964        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
965            return;
966        }
967
968        if self.block_all {
969            tracing::debug!(
970                "Blocked (block_all): {:?} - {}",
971                event.resource_type,
972                event.request.url
973            );
974            return self.fail_request_blocked(&event.request_id);
975        }
976
977        if let Some(network_id) = event.network_id.as_ref() {
978            if let Some((request_will_be_sent, _)) =
979                self.requests_will_be_sent.remove(network_id.as_ref())
980            {
981                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
982            } else {
983                self.request_id_to_interception_id.insert(
984                    network_id.clone(),
985                    (event.request_id.clone().into(), Instant::now()),
986                );
987            }
988        }
989
990        // From here on, we handle the full decision tree.
991        let javascript_resource = event.resource_type == ResourceType::Script;
992        let document_resource = event.resource_type == ResourceType::Document;
993        let network_resource =
994            !document_resource && crate::utils::is_data_resource(&event.resource_type);
995
996        // Start with static / cheap skip checks.
997        let mut skip_networking =
998            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
999
1000        if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
1001            skip_networking = true;
1002        }
1003
1004        // Also short-circuit if we've reloaded this document too many times.
1005        if !skip_networking {
1006            skip_networking = self.document_reload_tracker >= 3;
1007        }
1008
1009        // Handle document redirect / masking and track xml documents.
1010        let (current_url_cow, had_replacer) =
1011            self.handle_document_replacement_and_tracking(event, document_resource);
1012
1013        let current_url: &str = current_url_cow.as_ref();
1014
1015        let blacklisted = self.is_blacklisted(current_url);
1016
1017        if !self.blacklist_strict && blacklisted {
1018            skip_networking = true;
1019        }
1020
1021        if !skip_networking {
1022            // Allow XSL for sitemap XML.
1023            if self.xml_document && current_url.ends_with(".xsl") {
1024                skip_networking = false;
1025            } else {
1026                skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
1027            }
1028        }
1029
1030        // Skip ad detection for the user-requested top-level Document and
1031        // every step of its redirect chain. The crawler explicitly targets
1032        // this URL — fulfilling-empty-200 a page just because its host
1033        // matches an ad classifier breaks the user's intent (you can
1034        // legitimately want to scrape an ad page). Reproduced on
1035        // https://logrocket.com/careers, where the firewall ad list
1036        // flagged the host and chromey emitted a 17-byte stub for the
1037        // document; downstream sub-resources (script/img/iframe/etc.)
1038        // remain subject to ad blocking through the rest of the tree.
1039        //
1040        // Signals in short-circuit order (cheap → expensive):
1041        //   1. `redirected_request_id.is_some()` — explicit redirect hop
1042        //   2. `had_replacer` — chromey's masked-URL repair path
1043        //   3. `document_target_url.is_empty()` — very first nav, tracker
1044        //      not yet populated
1045        //   4. URL equality against the target — last because string
1046        //      compare is the only non-O(1) op (`handle_document_
1047        //      replacement_and_tracking` above just set the target to
1048        //      the current url, so this is the always-true fallback)
1049        //
1050        // Sub-resources (Script/Image/Font/Stylesheet/XHR/iframe content)
1051        // remain subject to ad blocking through the rest of the tree.
1052        let is_main_document_request = document_resource
1053            && (event.redirected_request_id.is_some()
1054                || had_replacer
1055                || self.document_target_url.is_empty()
1056                || event.request.url == self.document_target_url);
1057        if !is_main_document_request {
1058            skip_networking = self.detect_ad_if_enabled(event, skip_networking);
1059        }
1060
1061        // Ignore embedded scripts, tracker stylesheets, and tracker images when only_html or ignore_visuals is set.
1062        if !skip_networking
1063            && self.block_javascript
1064            && (self.only_html || self.ignore_visuals)
1065            && (javascript_resource
1066                || document_resource
1067                || event.resource_type == ResourceType::Stylesheet
1068                || event.resource_type == ResourceType::Image)
1069        {
1070            skip_networking = ignore_script_embedded(current_url);
1071        }
1072
1073        // Script policy: allow-by-default.
1074        // Block only if explicit block list patterns match.
1075        if !skip_networking && javascript_resource {
1076            skip_networking = self.should_block_script_blocklist_only(current_url);
1077        }
1078
1079        // XHR / data resources.
1080        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
1081
1082        // Custom interception layer.
1083        if !skip_networking && (javascript_resource || network_resource || document_resource) {
1084            skip_networking = self.intercept_manager.intercept_detection(
1085                current_url,
1086                self.ignore_visuals,
1087                network_resource,
1088            );
1089        }
1090
1091        // Custom website block list.
1092        if !skip_networking && (javascript_resource || network_resource) {
1093            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
1094        }
1095
1096        // whitelist 3rd party
1097        // not required unless explicit blocking.
1098        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
1099        {
1100            skip_networking = false;
1101        }
1102
1103        // check if the url is in the whitelist.
1104        if skip_networking && self.is_whitelisted(current_url) {
1105            skip_networking = false;
1106        }
1107
1108        if self.blacklist_strict && blacklisted {
1109            skip_networking = true;
1110        }
1111
1112        if skip_networking {
1113            tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
1114            self.fulfill_request_empty_200(&event.request_id);
1115        } else {
1116            #[cfg(feature = "_cache")]
1117            {
1118                if let (Some(policy), Some(cache_site_key)) =
1119                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
1120                {
1121                    let current_url = format!("{}:{}", event.request.method, &current_url);
1122
1123                    if let Some((res, cache_policy)) =
1124                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
1125                    {
1126                        if policy.allows_cached(&cache_policy) {
1127                            tracing::debug!(
1128                                "Remote Cached: {:?} - {}",
1129                                &event.resource_type,
1130                                &current_url
1131                            );
1132                            let flat_headers = crate::http::headers_from_multi(&res.headers);
1133                            return self.fulfill_request_from_cache(
1134                                &event.request_id,
1135                                &res.body,
1136                                &flat_headers,
1137                                res.status as i64,
1138                            );
1139                        }
1140                    }
1141                }
1142            }
1143
1144            // check our frame cache for the run.
1145            tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
1146            self.continue_request_with_url(
1147                &event.request_id,
1148                if had_replacer {
1149                    Some(current_url)
1150                } else {
1151                    None
1152                },
1153                !had_replacer,
1154            );
1155        }
1156    }
1157
1158    /// Shared "visuals + basic blocking" logic.
1159    ///
1160    /// IMPORTANT: Scripts are NOT blocked here anymore.
1161    /// Scripts are allowed by default and only blocked via explicit blocklists
1162    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1163    #[inline]
1164    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1165        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1166            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1167    }
1168
1169    /// Does the network manager have a target domain?
1170    pub fn has_target_domain(&self) -> bool {
1171        !self.document_target_url.is_empty()
1172    }
1173
1174    /// Set the target page url for tracking.
1175    pub fn set_page_url(&mut self, page_target_url: String) {
1176        let host_base = host_and_rest(&page_target_url)
1177            .map(|(h, _)| base_domain_from_host(h))
1178            .unwrap_or("");
1179
1180        self.document_target_domain = host_base.to_string();
1181        self.document_target_url = page_target_url;
1182    }
1183
1184    /// Clear the initial target domain on every navigation.
1185    pub fn clear_target_domain(&mut self) {
1186        self.document_reload_tracker = 0;
1187        self.document_target_url = Default::default();
1188        self.document_target_domain = Default::default();
1189    }
1190
1191    /// Handles:
1192    /// - document reload tracking (`document_reload_tracker`)
1193    /// - redirect masking / replacement
1194    /// - xml document detection (`xml_document`)
1195    /// - `document_target_url` updates
1196    ///
1197    /// Returns (current_url, had_replacer).
1198    #[inline]
1199    fn handle_document_replacement_and_tracking<'a>(
1200        &mut self,
1201        event: &'a EventRequestPaused,
1202        document_resource: bool,
1203    ) -> (Cow<'a, str>, bool) {
1204        let mut replacer: Option<String> = None;
1205        let current_url = event.request.url.as_str();
1206
1207        if document_resource {
1208            if self.document_target_url == current_url {
1209                self.document_reload_tracker += 1;
1210            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1211            {
1212                let (http_document_replacement, mut https_document_replacement) =
1213                    if self.document_target_url.starts_with("http://") {
1214                        (
1215                            self.document_target_url.replacen("http://", "http//", 1),
1216                            self.document_target_url.replacen("http://", "https://", 1),
1217                        )
1218                    } else {
1219                        (
1220                            self.document_target_url.replacen("https://", "https//", 1),
1221                            self.document_target_url.replacen("https://", "http://", 1),
1222                        )
1223                    };
1224
1225                // Track trailing slash to restore later.
1226                let trailing = https_document_replacement.ends_with('/');
1227                if trailing {
1228                    https_document_replacement.pop();
1229                }
1230                if https_document_replacement.ends_with('/') {
1231                    https_document_replacement.pop();
1232                }
1233
1234                let redirect_mask = format!(
1235                    "{}{}",
1236                    https_document_replacement, http_document_replacement
1237                );
1238
1239                if current_url == redirect_mask {
1240                    replacer = Some(if trailing {
1241                        format!("{}/", https_document_replacement)
1242                    } else {
1243                        https_document_replacement
1244                    });
1245                }
1246            }
1247
1248            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1249                self.xml_document = true;
1250            }
1251
1252            // Track last seen document URL.
1253            self.document_target_url = event.request.url.clone();
1254            self.document_target_domain = host_and_rest(&self.document_target_url)
1255                .map(|(h, _)| base_domain_from_host(h).to_string())
1256                .unwrap_or_default();
1257        }
1258
1259        let current_url_cow = match replacer {
1260            Some(r) => Cow::Owned(r),
1261            None => Cow::Borrowed(event.request.url.as_str()),
1262        };
1263
1264        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1265        (current_url_cow, had_replacer)
1266    }
1267
1268    /// Perform a page intercept for chrome using the adblock engine.
1269    /// Uses the custom engine when user-supplied filter rules are configured,
1270    /// otherwise falls back to the global default engine with built-in patterns.
1271    #[cfg(feature = "adblock")]
1272    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1273        use adblock::{
1274            lists::{FilterSet, ParseOptions, RuleTypes},
1275            Engine,
1276        };
1277
1278        lazy_static::lazy_static! {
1279            static ref AD_ENGINE: Engine = {
1280                let mut filter_set = FilterSet::new(false);
1281                let mut rules = ParseOptions::default();
1282                rules.rule_types = RuleTypes::All;
1283
1284                filter_set.add_filters(
1285                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1286                    rules,
1287                );
1288
1289                // When adblock_easylist is enabled, EasyList + EasyPrivacy are
1290                // embedded at build time for zero-cost runtime loading.
1291                #[cfg(feature = "adblock_easylist")]
1292                {
1293                    static EASYLIST: &str = include_str!(concat!(env!("OUT_DIR"), "/easylist.txt"));
1294                    static EASYPRIVACY: &str = include_str!(concat!(env!("OUT_DIR"), "/easyprivacy.txt"));
1295
1296                    if !EASYLIST.is_empty() {
1297                        filter_set.add_filter_list(EASYLIST, rules);
1298                    }
1299                    if !EASYPRIVACY.is_empty() {
1300                        filter_set.add_filter_list(EASYPRIVACY, rules);
1301                    }
1302                }
1303
1304                Engine::from_filter_set(filter_set, true)
1305            };
1306        }
1307
1308        let blockable = event.resource_type == ResourceType::Script
1309            || event.resource_type == ResourceType::Image
1310            || event.resource_type == ResourceType::Media
1311            || event.resource_type == ResourceType::Stylesheet
1312            || event.resource_type == ResourceType::Document
1313            || event.resource_type == ResourceType::Fetch
1314            || event.resource_type == ResourceType::Xhr;
1315
1316        if !blockable {
1317            return false;
1318        }
1319
1320        let u = &event.request.url;
1321
1322        let source_domain = if self.document_target_domain.is_empty() {
1323            "example.com"
1324        } else {
1325            &self.document_target_domain
1326        };
1327
1328        // Fast hostname extraction without full URL parsing.
1329        // preparsed(url, request_hostname, source_hostname, type, third_party)
1330        let hostname = u
1331            .strip_prefix("https://")
1332            .or_else(|| u.strip_prefix("http://"))
1333            .and_then(|rest| rest.split('/').next())
1334            // Strip userinfo (user:pass@) if present.
1335            .map(
1336                |authority| match memchr::memrchr(b'@', authority.as_bytes()) {
1337                    Some(i) => &authority[i + 1..],
1338                    None => authority,
1339                },
1340            )
1341            // Strip port (:8080) if present.
1342            .and_then(|host_port| host_port.split(':').next())
1343            .unwrap_or(source_domain);
1344
1345        let resource_type_str = match event.resource_type {
1346            ResourceType::Script => "script",
1347            ResourceType::Image => "image",
1348            ResourceType::Media => "media",
1349            ResourceType::Stylesheet => "stylesheet",
1350            ResourceType::Document => "document",
1351            ResourceType::Fetch => "fetch",
1352            ResourceType::Xhr => "xhr",
1353            _ => "other",
1354        };
1355
1356        let request = adblock::request::Request::preparsed(
1357            u,
1358            hostname,
1359            source_domain,
1360            resource_type_str,
1361            !event.request.is_same_site.unwrap_or_default(),
1362        );
1363
1364        let engine: &Engine = match self.adblock_engine.as_ref() {
1365            Some(custom) => custom,
1366            None => &AD_ENGINE,
1367        };
1368
1369        engine.check_network_request(&request).matched
1370    }
1371
1372    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1373        let response = if self
1374            .attempted_authentications
1375            .contains(event.request_id.as_ref())
1376        {
1377            AuthChallengeResponseResponse::CancelAuth
1378        } else if self.credentials.is_some() {
1379            self.attempted_authentications
1380                .insert(event.request_id.clone().into());
1381            AuthChallengeResponseResponse::ProvideCredentials
1382        } else {
1383            AuthChallengeResponseResponse::Default
1384        };
1385
1386        let mut auth = AuthChallengeResponse::new(response);
1387        if let Some(creds) = self.credentials.clone() {
1388            auth.username = Some(creds.username);
1389            auth.password = Some(creds.password);
1390        }
1391        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1392    }
1393
1394    /// Set the page offline network emulation condition.
1395    pub fn set_offline_mode(&mut self, value: bool) {
1396        if self.offline == value {
1397            return;
1398        }
1399        self.offline = value;
1400        if let Ok(condition) = NetworkConditions::builder()
1401            .url_pattern("")
1402            .latency(0)
1403            .download_throughput(-1.)
1404            .upload_throughput(-1.)
1405            .build()
1406        {
1407            if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1408                .offline(self.offline)
1409                .matched_network_condition(condition)
1410                .build()
1411            {
1412                self.push_cdp_request(network);
1413            }
1414        }
1415    }
1416
1417    /// Request interception doesn't happen for data URLs with Network Service.
1418    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1419        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1420            if let Some((interception_id, _)) = self
1421                .request_id_to_interception_id
1422                .remove(event.request_id.as_ref())
1423            {
1424                self.on_request(event, Some(interception_id));
1425            } else {
1426                self.requests_will_be_sent
1427                    .insert(event.request_id.clone(), (event.clone(), Instant::now()));
1428            }
1429        } else {
1430            self.on_request(event, None);
1431        }
1432    }
1433
1434    /// The request was served from the cache.
1435    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1436        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1437            request.from_memory_cache = true;
1438        }
1439    }
1440
1441    /// On network response received.
1442    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1443        let mut request_failed = false;
1444
1445        // Track how many bytes we actually deducted from this target.
1446        let mut deducted: u64 = 0;
1447
1448        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1449            let before = *max_bytes;
1450
1451            // encoded_data_length -> saturating cast to u64
1452            let received_bytes: u64 = event.response.encoded_data_length as u64;
1453
1454            // Safe parse of Content-Length
1455            let content_length: Option<u64> = event
1456                .response
1457                .headers
1458                .inner()
1459                .get("content-length")
1460                .and_then(|v| v.as_str())
1461                .and_then(|s| s.trim().parse::<u64>().ok());
1462
1463            // Deduct what we actually received
1464            *max_bytes = max_bytes.saturating_sub(received_bytes);
1465
1466            // If the declared size can't fit, zero out now
1467            if let Some(cl) = content_length {
1468                if cl > *max_bytes {
1469                    *max_bytes = 0;
1470                }
1471            }
1472
1473            request_failed = *max_bytes == 0;
1474
1475            // Compute exact delta deducted on this event
1476            deducted = before.saturating_sub(*max_bytes);
1477        }
1478
1479        // Bubble up the deduction (even if request continues)
1480        if deducted > 0 {
1481            self.queued_events
1482                .push_back(NetworkEvent::BytesConsumed(deducted));
1483        }
1484
1485        // block all network request moving forward.
1486        if request_failed && self.max_bytes_allowed.is_some() {
1487            self.set_block_all(true);
1488        }
1489
1490        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1491            request.set_response(event.response.clone());
1492            self.queued_events.push_back(if request_failed {
1493                NetworkEvent::RequestFailed(request)
1494            } else {
1495                NetworkEvent::RequestFinished(request)
1496            });
1497        }
1498    }
1499
1500    /// On network loading finished.
1501    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1502        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1503            if let Some(interception_id) = request.interception_id.as_ref() {
1504                self.attempted_authentications
1505                    .remove(interception_id.as_ref());
1506            }
1507            self.queued_events
1508                .push_back(NetworkEvent::RequestFinished(request));
1509        }
1510    }
1511
1512    /// On network loading failed.
1513    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1514        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1515            request.failure_text = Some(event.error_text.clone());
1516            if let Some(interception_id) = request.interception_id.as_ref() {
1517                self.attempted_authentications
1518                    .remove(interception_id.as_ref());
1519            }
1520            self.queued_events
1521                .push_back(NetworkEvent::RequestFailed(request));
1522        }
1523    }
1524
1525    /// On request will be sent.
1526    fn on_request(
1527        &mut self,
1528        event: &EventRequestWillBeSent,
1529        interception_id: Option<InterceptionId>,
1530    ) {
1531        let mut redirect_chain = Vec::new();
1532        let mut redirect_location = None;
1533
1534        if let Some(redirect_resp) = &event.redirect_response {
1535            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1536                if is_redirect_status(redirect_resp.status) {
1537                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1538                        if redirect_resp.url != location {
1539                            let fixed_location = location.replace(&redirect_resp.url, "");
1540
1541                            if !fixed_location.is_empty() {
1542                                if let Some(resp) = request.response.as_mut() {
1543                                    resp.headers.0["Location"] =
1544                                        serde_json::Value::String(fixed_location.clone());
1545                                }
1546                            }
1547
1548                            redirect_location = Some(fixed_location);
1549                        }
1550                    }
1551                }
1552
1553                {
1554                    let mut redirect_resp = redirect_resp.clone();
1555
1556                    if let Some(redirect_location) = redirect_location {
1557                        if !redirect_location.is_empty() {
1558                            redirect_resp.headers.0["Location"] =
1559                                serde_json::Value::String(redirect_location);
1560                        }
1561                    }
1562
1563                    self.handle_request_redirect(&mut request, redirect_resp);
1564                }
1565
1566                redirect_chain = std::mem::take(&mut request.redirect_chain);
1567                redirect_chain.push(request);
1568            }
1569        }
1570
1571        // Redirect cap: applies only to Document-type hops and only when
1572        // `max_redirects` is set. Sub-resource chains are untouched.
1573        if let Some(cap) = self.max_redirects {
1574            let is_document = matches!(event.r#type, Some(ResourceType::Document));
1575            if is_document && redirect_chain.len() > cap {
1576                let mut failed = HttpRequest::new(
1577                    event.request_id.clone(),
1578                    event.frame_id.clone(),
1579                    interception_id,
1580                    self.user_request_interception_enabled,
1581                    redirect_chain,
1582                );
1583                failed.url = Some(event.request.url.clone());
1584                failed.method = Some(event.request.method.clone());
1585                failed.failure_text = Some("net::ERR_TOO_MANY_REDIRECTS".into());
1586                self.push_cdp_request(
1587                    chromiumoxide_cdp::cdp::browser_protocol::page::StopLoadingParams::default(),
1588                );
1589                self.queued_events
1590                    .push_back(NetworkEvent::RequestFailed(failed));
1591                return;
1592            }
1593        }
1594
1595        let request = HttpRequest::new(
1596            event.request_id.clone(),
1597            event.frame_id.clone(),
1598            interception_id,
1599            self.user_request_interception_enabled,
1600            redirect_chain,
1601        );
1602
1603        let rid = event.request_id.clone();
1604        self.queued_events
1605            .push_back(NetworkEvent::Request(rid.clone()));
1606        self.requests.insert(rid, request);
1607    }
1608
1609    /// Handle request redirect.
1610    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1611        request.set_response(response);
1612        if let Some(interception_id) = request.interception_id.as_ref() {
1613            self.attempted_authentications
1614                .remove(interception_id.as_ref());
1615        }
1616    }
1617}
1618
/// Events queued by the network manager and handed out one at a time through
/// its `poll` method.
#[derive(Debug)]
pub enum NetworkEvent {
    /// Send a CDP request: the method identifier paired with a JSON value
    /// (presumably the serialized command parameters; confirm against
    /// `push_cdp_request`).
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request with this id has started being tracked.
    Request(RequestId),
    /// Response for the request with this id.
    /// NOTE(review): the emitting site is outside this section; confirm when
    /// exactly it fires.
    Response(RequestId),
    /// Request failed. Carries the request, including its `failure_text`
    /// when one was recorded.
    RequestFailed(HttpRequest),
    /// Request finished. Carries the completed request.
    RequestFinished(HttpRequest),
    /// Bytes consumed: the amount deducted from the byte budget while
    /// handling a single response event.
    BytesConsumed(u64),
}
1634
1635#[cfg(test)]
1636mod tests {
1637    use super::ALLOWED_MATCHER_3RD_PARTY;
1638    use crate::handler::network::NetworkManager;
1639    use std::time::Duration;
1640
1641    #[test]
1642    fn test_allowed_matcher_3rd_party() {
1643        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
1644        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
1645        assert!(
1646            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
1647            "expected Cloudflare challenge script to be allowed"
1648        );
1649
1650        // Should NOT be allowed (not in allow-list)
1651        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
1652        assert!(
1653            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
1654            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
1655        );
1656
1657        // A couple sanity checks for existing allow patterns
1658        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
1659        assert!(ALLOWED_MATCHER_3RD_PARTY
1660            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
1661        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
1662    }
1663
1664    #[test]
1665    fn test_script_allowed_by_default_when_not_blocklisted() {
1666        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1667        nm.set_page_url(
1668            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
1669        );
1670
1671        // A random script that should not match your block tries.
1672        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
1673        assert!(
1674            !nm.should_block_script_blocklist_only(ok),
1675            "expected non-blocklisted script to be allowed"
1676        );
1677    }
1678
1679    #[test]
1680    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
1681        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1682        nm.set_page_url(
1683            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
1684        );
1685
1686        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
1687        let bad = "https://cdn.example.net/js/analytics.js";
1688        assert!(
1689            nm.should_block_script_blocklist_only(bad),
1690            "expected analytics.js to be blocklisted"
1691        );
1692    }
1693
1694    #[test]
1695    fn test_allowed_matcher_3rd_party_sanity() {
1696        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
1697        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
1698        assert!(
1699            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
1700            "expected Cloudflare challenge script to be allowed"
1701        );
1702
1703        // Should NOT be allowed (not in allow-list)
1704        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
1705        assert!(
1706            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
1707            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
1708        );
1709
1710        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
1711        assert!(ALLOWED_MATCHER_3RD_PARTY
1712            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
1713        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
1714    }
1715    #[test]
1716    fn test_dynamic_blacklist_blocks_url() {
1717        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1718        nm.set_page_url("https://example.com/".to_string());
1719
1720        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
1721        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
1722        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));
1723
1724        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
1725    }
1726
1727    #[test]
1728    fn test_blacklist_strict_wins_over_whitelist() {
1729        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1730        nm.set_page_url("https://example.com/".to_string());
1731
1732        // Same URL in both lists.
1733        nm.set_blacklist_patterns(["beacon.min.js"]);
1734        nm.set_whitelist_patterns(["beacon.min.js"]);
1735
1736        nm.set_blacklist_strict(true);
1737
1738        let u = "https://static.cloudflareinsights.com/beacon.min.js";
1739        assert!(nm.is_whitelisted(u));
1740        assert!(nm.is_blacklisted(u));
1741
1742        // In strict mode, it should still be considered blocked at decision time.
1743        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
1744        assert!(nm.blacklist_strict);
1745    }
1746
1747    #[cfg(feature = "adblock")]
1748    fn make_request_paused(
1749        url: &str,
1750        resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType,
1751        is_same_site: bool,
1752    ) -> chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused {
1753        use chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused;
1754        use chromiumoxide_cdp::cdp::browser_protocol::network::{
1755            Headers, Request, RequestReferrerPolicy, ResourcePriority,
1756        };
1757
1758        EventRequestPaused {
1759            request_id: chromiumoxide_cdp::cdp::browser_protocol::network::RequestId::from(
1760                "test-req".to_string(),
1761            )
1762            .into(),
1763            request: Request {
1764                url: url.to_string(),
1765                method: "GET".to_string(),
1766                headers: Headers::new(serde_json::Value::Object(Default::default())),
1767                initial_priority: ResourcePriority::Medium,
1768                referrer_policy: RequestReferrerPolicy::NoReferrer,
1769                url_fragment: None,
1770                has_post_data: None,
1771                post_data_entries: None,
1772                mixed_content_type: None,
1773                is_link_preload: None,
1774                trust_token_params: None,
1775                is_same_site: Some(is_same_site),
1776                is_ad_related: None,
1777            },
1778            frame_id: chromiumoxide_cdp::cdp::browser_protocol::page::FrameId::from(
1779                "frame1".to_string(),
1780            ),
1781            resource_type,
1782            response_error_reason: None,
1783            response_status_code: None,
1784            response_status_text: None,
1785            response_headers: None,
1786            network_id: None,
1787            redirected_request_id: None,
1788        }
1789    }
1790
1791    #[cfg(feature = "adblock")]
1792    #[test]
1793    fn test_detect_ad_blocks_known_tracker_scripts() {
1794        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1795
1796        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1797        nm.set_page_url("https://www.wine-searcher.com/".to_string());
1798
1799        let event = make_request_paused(
1800            "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
1801            ResourceType::Script,
1802            false,
1803        );
1804
1805        assert!(
1806            nm.detect_ad(&event),
1807            "googletagmanager.com script should be detected as ad"
1808        );
1809    }
1810
1811    #[cfg(feature = "adblock")]
1812    #[test]
1813    fn test_detect_ad_allows_legitimate_scripts() {
1814        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1815
1816        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1817        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
1818
1819        let event = make_request_paused(
1820            "https://www.mylegitsite-test.com/static/js/app-bundle.js",
1821            ResourceType::Script,
1822            true,
1823        );
1824
1825        assert!(
1826            !nm.detect_ad(&event),
1827            "legitimate first-party app bundle should not be blocked"
1828        );
1829    }
1830
1831    #[cfg(feature = "adblock")]
1832    #[test]
1833    fn test_detect_ad_uses_source_domain() {
1834        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1835
1836        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1837        nm.set_page_url("https://www.wine-searcher.com/some-page".to_string());
1838
1839        assert!(
1840            !nm.document_target_domain.is_empty(),
1841            "document_target_domain should be set after set_page_url"
1842        );
1843
1844        let event = make_request_paused(
1845            "https://www.google-analytics.com/analytics.js",
1846            ResourceType::Script,
1847            false,
1848        );
1849
1850        assert!(
1851            nm.detect_ad(&event),
1852            "google-analytics.com should be blocked as tracker"
1853        );
1854    }
1855
1856    #[cfg(feature = "adblock")]
1857    #[test]
1858    fn test_custom_adblock_engine_takes_precedence() {
1859        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1860
1861        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1862        nm.set_page_url("https://example.com/".to_string());
1863
1864        // Build a custom engine with a specific rule.
1865        let mut filter_set = adblock::lists::FilterSet::new(false);
1866        let mut opts = adblock::lists::ParseOptions::default();
1867        opts.rule_types = adblock::lists::RuleTypes::All;
1868        filter_set.add_filters(["||custom-tracker.example.net^"], opts);
1869        let engine = adblock::Engine::from_filter_set(filter_set, true);
1870        nm.set_adblock_engine(std::sync::Arc::new(engine));
1871
1872        let event = make_request_paused(
1873            "https://custom-tracker.example.net/pixel.js",
1874            ResourceType::Script,
1875            false,
1876        );
1877
1878        assert!(
1879            nm.detect_ad(&event),
1880            "custom engine rule should block custom-tracker.example.net"
1881        );
1882    }
1883
1884    /// Helper: run a URL through the full `on_fetch_request_paused` pipeline
1885    /// and return whether it was blocked (true) or allowed (false).
1886    #[cfg(feature = "adblock")]
1887    fn run_full_interception(
1888        nm: &mut NetworkManager,
1889        url: &str,
1890        resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType,
1891        is_same_site: bool,
1892    ) -> bool {
1893        use super::NetworkEvent;
1894
1895        // Drain any prior events.
1896        while nm.poll().is_some() {}
1897
1898        let event = make_request_paused(url, resource_type, is_same_site);
1899        nm.on_fetch_request_paused(&event);
1900
1901        // Check what was emitted: Fetch.fulfillRequest = blocked, Fetch.continueRequest = allowed.
1902        let mut blocked = false;
1903        while let Some(ev) = nm.poll() {
1904            if let NetworkEvent::SendCdpRequest((method, _)) = &ev {
1905                let m: &str = method.as_ref();
1906                if m == "Fetch.fulfillRequest" || m == "Fetch.failRequest" {
1907                    blocked = true;
1908                }
1909            }
1910        }
1911        blocked
1912    }
1913
1914    // ── End-to-end interception tests ───────────────────────────────────
1915
1916    #[cfg(feature = "adblock")]
1917    #[test]
1918    fn test_e2e_tracker_script_blocked() {
1919        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1920
1921        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1922        nm.set_page_url("https://www.wine-searcher.com/".to_string());
1923
1924        assert!(
1925            run_full_interception(
1926                &mut nm,
1927                "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
1928                ResourceType::Script,
1929                false,
1930            ),
1931            "GTM script should be blocked through full pipeline"
1932        );
1933    }
1934
1935    #[cfg(feature = "adblock")]
1936    #[test]
1937    fn test_e2e_legitimate_script_allowed() {
1938        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1939
1940        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1941        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
1942
1943        assert!(
1944            !run_full_interception(
1945                &mut nm,
1946                "https://www.mylegitsite-test.com/static/js/app-bundle.js",
1947                ResourceType::Script,
1948                true,
1949            ),
1950            "legitimate first-party script should be allowed through full pipeline"
1951        );
1952    }
1953
1954    #[cfg(feature = "adblock")]
1955    #[test]
1956    fn test_e2e_analytics_xhr_blocked() {
1957        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1958
1959        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1960        nm.set_page_url("https://example.org/".to_string());
1961
1962        assert!(
1963            run_full_interception(
1964                &mut nm,
1965                "https://www.google-analytics.com/g/collect?v=2&tid=UA-123",
1966                ResourceType::Xhr,
1967                false,
1968            ),
1969            "Google Analytics XHR should be blocked through full pipeline"
1970        );
1971    }
1972
1973    #[cfg(feature = "adblock")]
1974    #[test]
1975    fn test_e2e_whitelisted_overrides_adblock() {
1976        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1977
1978        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1979        nm.set_page_url("https://example.org/".to_string());
1980        nm.set_whitelist_patterns(["googletagmanager.com"]);
1981
1982        // GTM would normally be blocked by adblock, but whitelist overrides.
1983        assert!(
1984            !run_full_interception(
1985                &mut nm,
1986                "https://www.googletagmanager.com/gtm.js?id=GTM-TEST",
1987                ResourceType::Script,
1988                false,
1989            ),
1990            "whitelisted tracker should be allowed even when adblock would block it"
1991        );
1992    }
1993
1994    #[cfg(feature = "adblock")]
1995    #[test]
1996    fn test_e2e_blacklist_strict_overrides_whitelist() {
1997        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1998
1999        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2000        nm.set_page_url("https://example.org/".to_string());
2001        nm.set_blacklist_patterns(["cdn.example.net/evil.js"]);
2002        nm.set_whitelist_patterns(["cdn.example.net/evil.js"]);
2003        nm.set_blacklist_strict(true);
2004
2005        assert!(
2006            run_full_interception(
2007                &mut nm,
2008                "https://cdn.example.net/evil.js",
2009                ResourceType::Script,
2010                false,
2011            ),
2012            "strict blacklist should win over whitelist"
2013        );
2014    }
2015
2016    #[cfg(feature = "adblock")]
2017    #[test]
2018    fn test_e2e_first_party_document_not_blocked() {
2019        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2020
2021        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2022        nm.set_page_url("https://www.nytimes.com/".to_string());
2023
2024        assert!(
2025            !run_full_interception(
2026                &mut nm,
2027                "https://www.nytimes.com/2024/article.html",
2028                ResourceType::Document,
2029                true,
2030            ),
2031            "first-party document navigation should never be blocked"
2032        );
2033    }
2034
2035    #[cfg(feature = "adblock")]
2036    #[test]
2037    fn test_e2e_custom_engine_blocks_through_pipeline() {
2038        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2039
2040        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2041        nm.set_page_url("https://mysite.com/".to_string());
2042
2043        let mut filter_set = adblock::lists::FilterSet::new(false);
2044        let mut opts = adblock::lists::ParseOptions::default();
2045        opts.rule_types = adblock::lists::RuleTypes::All;
2046        filter_set.add_filters(["||evil-cdn.example.net^$script"], opts);
2047        let engine = adblock::Engine::from_filter_set(filter_set, true);
2048        nm.set_adblock_engine(std::sync::Arc::new(engine));
2049
2050        assert!(
2051            run_full_interception(
2052                &mut nm,
2053                "https://evil-cdn.example.net/tracker.js",
2054                ResourceType::Script,
2055                false,
2056            ),
2057            "custom engine rule should block through full pipeline"
2058        );
2059
2060        // Legitimate script on the same site should still pass.
2061        assert!(
2062            !run_full_interception(
2063                &mut nm,
2064                "https://mysite.com/app.js",
2065                ResourceType::Script,
2066                true,
2067            ),
2068            "first-party script should still be allowed with custom engine"
2069        );
2070    }
2071
2072    #[cfg(feature = "adblock")]
2073    #[test]
2074    fn test_e2e_ad_image_blocked() {
2075        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2076
2077        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2078        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
2079
2080        // Ad tracking pixel should be blocked via adblock pattern or trie.
2081        assert!(
2082            run_full_interception(
2083                &mut nm,
2084                "https://googleads.g.doubleclick.net/pagead/viewthroughconversion/123/?random=456",
2085                ResourceType::Image,
2086                false,
2087            ),
2088            "doubleclick ad image/tracking pixel should be blocked"
2089        );
2090
2091        // Legitimate first-party image should pass.
2092        assert!(
2093            !run_full_interception(
2094                &mut nm,
2095                "https://www.mylegitsite-test.com/images/logo.png",
2096                ResourceType::Image,
2097                true,
2098            ),
2099            "legitimate first-party image should not be blocked"
2100        );
2101    }
2102
2103    #[cfg(feature = "adblock")]
2104    #[test]
2105    fn test_e2e_hostname_with_userinfo() {
2106        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2107
2108        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2109        nm.set_page_url("https://example.org/".to_string());
2110
2111        // URL with userinfo should still correctly identify googletagmanager.com.
2112        assert!(
2113            run_full_interception(
2114                &mut nm,
2115                "https://user:pass@www.googletagmanager.com/gtm.js?id=GTM-XXXX",
2116                ResourceType::Script,
2117                false,
2118            ),
2119            "tracker URL with userinfo should still be blocked"
2120        );
2121    }
2122
2123    #[test]
2124    fn test_blacklist_non_strict_allows_whitelist_override() {
2125        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2126        nm.set_page_url("https://example.com/".to_string());
2127
2128        nm.set_blacklist_patterns(["beacon.min.js"]);
2129        nm.set_whitelist_patterns(["beacon.min.js"]);
2130
2131        nm.set_blacklist_strict(false);
2132
2133        let u = "https://static.cloudflareinsights.com/beacon.min.js";
2134        assert!(nm.is_blacklisted(u));
2135        assert!(nm.is_whitelisted(u));
2136        assert!(!nm.blacklist_strict);
2137    }
2138
2139    // ── max_redirects enforcement ───────────────────────────────────────
2140    //
2141    // The redirect cap short-circuits in NetworkManager::on_request when a
2142    // Document-type chain exceeds the configured limit. We drive it via the
2143    // public on_request_will_be_sent entry point by deserializing synthetic
2144    // events — builder APIs exist but require every non-optional field, and
2145    // JSON is less fragile to cdp schema additions.
2146
2147    fn make_request_will_be_sent(
2148        request_id: &str,
2149        url: &str,
2150        resource_type: &str,
2151        redirect_from_url: Option<&str>,
2152    ) -> chromiumoxide_cdp::cdp::browser_protocol::network::EventRequestWillBeSent {
2153        let mut v = serde_json::json!({
2154            "requestId": request_id,
2155            "loaderId": "test-loader",
2156            "documentURL": url,
2157            "request": {
2158                "url": url,
2159                "method": "GET",
2160                "headers": {},
2161                "initialPriority": "Medium",
2162                "referrerPolicy": "no-referrer"
2163            },
2164            "timestamp": 0.0,
2165            "wallTime": 0.0,
2166            "initiator": { "type": "other" },
2167            "redirectHasExtraInfo": false,
2168            "type": resource_type,
2169            "frameId": "frame1"
2170        });
2171        if let Some(from) = redirect_from_url {
2172            v["redirectResponse"] = serde_json::json!({
2173                "url": from,
2174                "status": 302,
2175                "statusText": "Found",
2176                "headers": { "Location": url },
2177                "mimeType": "text/html",
2178                "charset": "",
2179                "connectionReused": false,
2180                "connectionId": 0.0,
2181                "encodedDataLength": 0.0,
2182                "securityState": "unknown"
2183            });
2184        }
2185        serde_json::from_value(v).expect("EventRequestWillBeSent should deserialize")
2186    }
2187
2188    fn drain_too_many_redirects(nm: &mut NetworkManager) -> Option<super::HttpRequest> {
2189        while let Some(ev) = nm.poll() {
2190            if let super::NetworkEvent::RequestFailed(req) = ev {
2191                if req.failure_text.as_deref() == Some("net::ERR_TOO_MANY_REDIRECTS") {
2192                    return Some(req);
2193                }
2194            }
2195        }
2196        None
2197    }
2198
2199    fn drain_stop_loading(nm: &mut NetworkManager) -> bool {
2200        while let Some(ev) = nm.poll() {
2201            if let super::NetworkEvent::SendCdpRequest((method, _)) = ev {
2202                let m: &str = method.as_ref();
2203                if m == "Page.stopLoading" {
2204                    return true;
2205                }
2206            }
2207        }
2208        false
2209    }
2210
2211    #[test]
2212    fn test_max_redirects_none_allows_unlimited_chain() {
2213        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2214        // max_redirects left at its default (None).
2215
2216        // 10 sequential Document hops sharing the same request_id.
2217        nm.on_request_will_be_sent(&make_request_will_be_sent(
2218            "r1",
2219            "https://example.com/0",
2220            "Document",
2221            None,
2222        ));
2223        for i in 1..10 {
2224            nm.on_request_will_be_sent(&make_request_will_be_sent(
2225                "r1",
2226                &format!("https://example.com/{i}"),
2227                "Document",
2228                Some(&format!("https://example.com/{}", i - 1)),
2229            ));
2230        }
2231
2232        assert!(
2233            drain_too_many_redirects(&mut nm).is_none(),
2234            "no cap set: chain of 10 hops must not emit ERR_TOO_MANY_REDIRECTS"
2235        );
2236    }
2237
2238    #[test]
2239    fn test_max_redirects_caps_document_chain() {
2240        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2241        nm.max_redirects = Some(3);
2242
2243        // Initial request + 4 redirect hops. The 4th redirect (chain length 4 > 3)
2244        // must trip the cap.
2245        nm.on_request_will_be_sent(&make_request_will_be_sent(
2246            "r1",
2247            "https://example.com/0",
2248            "Document",
2249            None,
2250        ));
2251        for i in 1..=4 {
2252            nm.on_request_will_be_sent(&make_request_will_be_sent(
2253                "r1",
2254                &format!("https://example.com/{i}"),
2255                "Document",
2256                Some(&format!("https://example.com/{}", i - 1)),
2257            ));
2258        }
2259
2260        let failed = drain_too_many_redirects(&mut nm)
2261            .expect("cap of 3 on a 4-hop chain must emit ERR_TOO_MANY_REDIRECTS");
2262        assert_eq!(
2263            failed.redirect_chain.len(),
2264            4,
2265            "failed request should preserve the full accumulated chain"
2266        );
2267        assert_eq!(
2268            failed.url.as_deref(),
2269            Some("https://example.com/4"),
2270            "failed request url should be the hop that tripped the cap"
2271        );
2272
2273        // Second navigation after the cap is tripped must also schedule
2274        // Page.stopLoading to actually abort the tab.
2275        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2276        nm.max_redirects = Some(3);
2277        nm.on_request_will_be_sent(&make_request_will_be_sent(
2278            "r2",
2279            "https://example.com/0",
2280            "Document",
2281            None,
2282        ));
2283        for i in 1..=4 {
2284            nm.on_request_will_be_sent(&make_request_will_be_sent(
2285                "r2",
2286                &format!("https://example.com/{i}"),
2287                "Document",
2288                Some(&format!("https://example.com/{}", i - 1)),
2289            ));
2290        }
2291        assert!(
2292            drain_stop_loading(&mut nm),
2293            "cap hit must dispatch Page.stopLoading to abort navigation"
2294        );
2295    }
2296
2297    #[test]
2298    fn test_max_redirects_ignores_subresources() {
2299        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2300        nm.max_redirects = Some(2);
2301
2302        // A 5-hop script redirect chain — sub-resources are exempt by design.
2303        nm.on_request_will_be_sent(&make_request_will_be_sent(
2304            "s1",
2305            "https://cdn.example.com/0.js",
2306            "Script",
2307            None,
2308        ));
2309        for i in 1..=5 {
2310            nm.on_request_will_be_sent(&make_request_will_be_sent(
2311                "s1",
2312                &format!("https://cdn.example.com/{i}.js"),
2313                "Script",
2314                Some(&format!("https://cdn.example.com/{}.js", i - 1)),
2315            ));
2316        }
2317
2318        assert!(
2319            drain_too_many_redirects(&mut nm).is_none(),
2320            "sub-resource redirect chains must never be capped"
2321        );
2322    }
2323}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs