chromiumoxide/handler/
network.rs

1#[cfg(any(feature = "adblock", feature = "firewall"))]
2use super::blockers::block_websites::block_ads;
3use super::blockers::{
4    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
5    xhr::IGNORE_XHR_ASSETS,
6};
7use crate::auth::Credentials;
8#[cfg(feature = "_cache")]
9use crate::cache::BasicCachePolicy;
10use crate::cmd::CommandChain;
11use crate::handler::http::HttpRequest;
12use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
13use aho_corasick::AhoCorasick;
14use case_insensitive_string::CaseInsensitiveString;
15use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
16use chromiumoxide_cdp::cdp::browser_protocol::network::{
17    EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
18    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
19    InitiatorType, InterceptionId, NetworkConditions, RequestId, ResourceType, Response,
20    SetCacheDisabledParams, SetExtraHttpHeadersParams,
21};
22use chromiumoxide_cdp::cdp::browser_protocol::{
23    fetch::{
24        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
25        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
26    },
27    network::SetBypassServiceWorkerParams,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::{
30    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
31};
32use chromiumoxide_types::{Command, Method, MethodId};
33use hashbrown::{HashMap, HashSet};
34use lazy_static::lazy_static;
35use reqwest::header::PROXY_AUTHORIZATION;
36use spider_network_blocker::intercept_manager::NetworkInterceptManager;
37pub use spider_network_blocker::scripts::{
38    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
39};
40use std::borrow::Cow;
41use std::collections::VecDeque;
42use std::time::{Duration, Instant};
43
44lazy_static! {
45    /// General patterns for popular libraries and resources
46    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
47        "jquery",           // Covers jquery.min.js, jquery.js, etc.
48        "angular",
49        "react",            // Covers all React-related patterns
50        "vue",              // Covers all Vue-related patterns
51        "bootstrap",
52        "d3",
53        "lodash",
54        "ajax",
55        "application",
56        "app",              // Covers general app scripts like app.js
57        "main",
58        "index",
59        "bundle",
60        "vendor",
61        "runtime",
62        "polyfill",
63        "scripts",
64        "es2015.",
65        "es2020.",
66        "webpack",
67        "captcha",
68        "client",
69        "/cdn-cgi/challenge-platform/",
70        "/wp-content/js/",  // Covers Wordpress content
71        // Verified 3rd parties for request
72        "https://m.stripe.network/",
73        "https://challenges.cloudflare.com/",
74        "https://www.google.com/recaptcha/",
75        "https://google.com/recaptcha/api.js",
76        "https://www.gstatic.com/recaptcha/",
77        "https://captcha.px-cloud.net/",
78        "https://geo.captcha-delivery.com/",
79        "https://api.leminnow.com/captcha/",
80        "https://cdn.auth0.com/js/lock/",
81        "https://captcha.gtimg.com",
82        "https://client-api.arkoselabs.com/",
83        "https://www.capy.me/puzzle/",
84        "https://newassets.hcaptcha.com/",
85        "https://cdn.auth0.com/client",
86        "https://js.stripe.com/",
87        "https://cdn.prod.website-files.com/", // webflow cdn scripts
88        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
89        "https://code.jquery.com/jquery-"
90    ];
91
92    /// Determine if a script should be rendered in the browser by name.
93    ///
94    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
95    /// but we keep it for compatibility and other call sites.
96    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
97
98    /// General patterns for popular libraries and resources
99    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
100        // Verified 3rd parties for request
101        "https://m.stripe.network/",
102        "https://challenges.cloudflare.com/",
103        "https://js.stripe.com/",
104        "https://cdn.prod.website-files.com/", // webflow cdn scripts
105        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
106        "https://code.jquery.com/jquery-",
107        "https://ct.captcha-delivery.com/",
108        "https://geo.captcha-delivery.com/",
109        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
110        "https://cdn.auth0.com/client",
111        "https://captcha.px-cloud.net/",
112        "https://www.capy.me/puzzle/",
113        "https://www.gstatic.com/recaptcha/",
114        "https://google.com/recaptcha/",
115        "https://www.google.com/recaptcha/",
116        "https://www.recaptcha.net/recaptcha/",
117        "https://js.hcaptcha.com/1/api.js",
118        "https://hcaptcha.com/1/api.js",
119        "https://js.datadome.co/tags.js",
120        "https://api-js.datadome.co/",
121        "https://client.perimeterx.net/",
122        "https://captcha.px-cdn.net/",
123        "https://newassets.hcaptcha.com/",
124        "https://captcha.px-cloud.net/",
125        "https://s.perimeterx.net/",
126        "https://api.leminnow.com/captcha/",
127        "https://client-api.arkoselabs.com/",
128        "https://static.geetest.com/v4/gt4.js",
129        "https://static.geetest.com/",
130        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
131        "https://cdn.perfdrive.com/aperture/",
132        "https://assets.queue-it.net/",
133        "discourse-cdn.com/",
134        "hcaptcha.com",
135        "/cdn-cgi/challenge-platform/",
136        "/_Incapsula_Resource"
137    ];
138
139    /// Determine if a script should be rendered in the browser by name.
140    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
141
142    /// path of a js framework
143    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
144        phf::phf_set! {
145            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
146            "_astro/", "_app/immutable"
147        }
148    };
149
150    /// Ignore the content types.
151    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
152        "application/pdf",
153        "application/zip",
154        "application/x-rar-compressed",
155        "application/x-tar",
156        "image/png",
157        "image/jpeg",
158        "image/gif",
159        "image/bmp",
160        "image/webp",
161        "image/svg+xml",
162        "video/mp4",
163        "video/x-msvideo",
164        "video/x-matroska",
165        "video/webm",
166        "audio/mpeg",
167        "audio/ogg",
168        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
169        "application/vnd.ms-excel",
170        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
171        "application/vnd.ms-powerpoint",
172        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
173        "application/x-7z-compressed",
174        "application/x-rpm",
175        "application/x-shockwave-flash",
176        "application/rtf",
177    };
178
179    /// Ignore the resources for visual content types.
180    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
181        "Image",
182        "Media",
183        "Font"
184    };
185
186    /// Ignore the resources for visual content types.
187    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
188        "CspViolationReport",
189        "Ping",
190    };
191
192    /// Case insenstive css matching
193    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
194
195    /// The command chain.
196    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
197        let enable = EnableParams::default();
198
199        if let Ok(c) = serde_json::to_value(&enable) {
200            vec![(enable.identifier(), c)]
201        } else {
202            vec![]
203        }
204    };
205
206    /// The command chain with https ignore.
207    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
208        let enable = EnableParams::default();
209        let mut v = vec![];
210        if let Ok(c) = serde_json::to_value(&enable) {
211            v.push((enable.identifier(), c));
212        }
213        let ignore = SetIgnoreCertificateErrorsParams::new(true);
214        if let Ok(ignored) = serde_json::to_value(&ignore) {
215            v.push((ignore.identifier(), ignored));
216        }
217
218        v
219    };
220
221    /// Enable the fetch intercept command
222    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
223        fetch::EnableParams::builder()
224        .handle_auth_requests(true)
225        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
226        .build()
227    };
228}
229
/// Returns `true` when `status` is one of the HTTP redirect codes
/// 301, 302, 303, 307 or 308; every other value yields `false`.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_STATUSES: [i64; 5] = [301, 302, 303, 307, 308];

    REDIRECT_STATUSES.contains(&status)
}
234
/// Maximum age, in seconds, of a buffered `requests_will_be_sent` /
/// `request_id_to_interception_id` entry before `evict_stale_entries` drops it.
///
/// 30 seconds is generous — the CDP round-trip that reconciles the two racing
/// events normally completes in milliseconds.
const STALE_BUFFER_SECS: u64 = 30;

/// Maximum age, in seconds, of an in-flight request entry (`requests` map) that
/// has not been resolved by a `loadingFinished` / `loadingFailed` /
/// `loadingCanceled` event before it is considered orphaned and evicted.
///
/// Longer than the race-condition buffer timeout because real requests can
/// legitimately take tens of seconds (streaming, slow origins, etc.).
const STALE_REQUEST_SECS: u64 = 120;
247
/// Newtype around a shared `adblock::Engine` that provides a `Debug` implementation.
#[cfg(feature = "adblock")]
pub struct AdblockEngine(std::sync::Arc<adblock::Engine>);

#[cfg(feature = "adblock")]
impl std::fmt::Debug for AdblockEngine {
    /// Writes only the type name; the wrapped engine is not rendered.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("AdblockEngine")
    }
}

#[cfg(feature = "adblock")]
impl std::ops::Deref for AdblockEngine {
    type Target = adblock::Engine;

    /// Borrows the wrapped engine.
    fn deref(&self) -> &Self::Target {
        self.0.as_ref()
    }
}
266
#[derive(Debug)]
/// The base network manager.
///
/// Holds per-page request bookkeeping, the queue of outgoing `NetworkEvent`s, and the
/// blocking / interception configuration.
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, `init_commands()` uses the chain that also sends
    /// `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    /// Entries older than `STALE_REQUEST_SECS` are dropped by `evict_stale_entries()`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    /// Entries older than `STALE_BUFFER_SECS` are evicted to prevent unbounded growth.
    requests_will_be_sent: HashMap<RequestId, (EventRequestWillBeSent, Instant)>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    /// Entries older than `STALE_BUFFER_SECS` are evicted to prevent unbounded growth.
    request_id_to_interception_id: HashMap<RequestId, (InterceptionId, Instant)>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; `update_protocol_request_interception()` combines
    /// this flag with `credentials` to decide whether to queue `Fetch.enable` / `Fetch.disable`.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is considered enabled in CDP.
    ///
    /// `update_protocol_request_interception()` compares the desired state against this flag
    /// to decide whether a `Fetch.enable` / `Fetch.disable` command is needed, but it does not
    /// write the flag. The flag is set by `authenticate()`, `enable_request_intercept()` and
    /// `disable_request_intercept()` (and, being `pub(crate)`, possibly elsewhere in the crate).
    pub(crate) protocol_request_interception_enabled: bool,
    /// The network is offline.
    offline: bool,
    /// The page request timeout; also used as the timeout of the init `CommandChain`.
    pub request_timeout: Duration,
    // made_request: bool,
    /// Ignore visuals (no pings, prefetching, etc.).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics from rendering.
    pub block_analytics: bool,
    /// Block pre-fetch requests.
    pub block_prefetch: bool,
    /// Only allow HTML to load.
    pub only_html: bool,
    /// Is the target an XML document?
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the number of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    /// Cap on main-frame Document redirect hops before the navigation is aborted.
    ///
    /// `None` disables enforcement (default, preserves prior behavior). When `Some(n)`,
    /// the (n+1)th Document redirect short-circuits: a synthetic `RequestFailed` event
    /// is emitted with `failure_text = "net::ERR_TOO_MANY_REDIRECTS"` and
    /// `Page.stopLoading` is dispatched to abort in-flight navigation. The accumulated
    /// `redirect_chain` is preserved on the failed request so consumers can inspect it.
    pub max_redirects: Option<usize>,
    #[cfg(feature = "_cache")]
    /// The cache site_key to use.
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
    whitelist_matcher: Option<AhoCorasick>,
    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
    blacklist_patterns: Vec<String>,
    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
    blacklist_matcher: Option<AhoCorasick>,
    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
    blacklist_strict: bool,
    /// Custom adblock engine built from user-supplied filter rules.
    /// When `Some`, takes precedence over the global default engine.
    #[cfg(feature = "adblock")]
    adblock_engine: Option<AdblockEngine>,
}
404
405impl NetworkManager {
406    /// A new network manager.
407    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
408        Self {
409            queued_events: Default::default(),
410            ignore_httpserrors,
411            requests: Default::default(),
412            requests_will_be_sent: Default::default(),
413            extra_headers: Default::default(),
414            request_id_to_interception_id: Default::default(),
415            user_cache_disabled: false,
416            attempted_authentications: Default::default(),
417            credentials: None,
418            block_all: false,
419            user_request_interception_enabled: false,
420            protocol_request_interception_enabled: false,
421            offline: false,
422            request_timeout,
423            ignore_visuals: false,
424            block_javascript: false,
425            block_stylesheets: false,
426            block_prefetch: true,
427            block_analytics: true,
428            only_html: false,
429            xml_document: false,
430            intercept_manager: NetworkInterceptManager::Unknown,
431            document_reload_tracker: 0,
432            document_target_url: String::new(),
433            document_target_domain: String::new(),
434            whitelist_patterns: Vec::new(),
435            whitelist_matcher: None,
436            blacklist_patterns: Vec::new(),
437            blacklist_matcher: None,
438            blacklist_strict: true,
439            max_bytes_allowed: None,
440            max_redirects: None,
441            #[cfg(feature = "_cache")]
442            cache_site_key: None,
443            #[cfg(feature = "_cache")]
444            cache_policy: None,
445            #[cfg(feature = "adblock")]
446            adblock_engine: None,
447        }
448    }
449
450    /// Set a custom adblock engine built from user-supplied filter rules.
451    #[cfg(feature = "adblock")]
452    pub fn set_adblock_engine(&mut self, engine: std::sync::Arc<adblock::Engine>) {
453        self.adblock_engine = Some(AdblockEngine(engine));
454    }
455
456    /// Replace the whitelist patterns (compiled once).
457    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
458    where
459        I: IntoIterator<Item = S>,
460        S: Into<String>,
461    {
462        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
463        self.rebuild_whitelist_matcher();
464    }
465
466    /// Replace the blacklist patterns (compiled once).
467    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
468    where
469        I: IntoIterator<Item = S>,
470        S: Into<String>,
471    {
472        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
473        self.rebuild_blacklist_matcher();
474    }
475
476    /// Add one pattern (cheap) and rebuild (call this sparingly).
477    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
478        self.blacklist_patterns.push(pattern.into());
479        self.rebuild_blacklist_matcher();
480    }
481
482    /// Add many patterns and rebuild once.
483    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
484    where
485        I: IntoIterator<Item = S>,
486        S: Into<String>,
487    {
488        self.blacklist_patterns
489            .extend(patterns.into_iter().map(Into::into));
490        self.rebuild_blacklist_matcher();
491    }
492
493    /// Clear blacklist entirely.
494    pub fn clear_blacklist(&mut self) {
495        self.blacklist_patterns.clear();
496        self.blacklist_matcher = None;
497    }
498
    /// Control precedence between the blacklist and the allow rules.
    ///
    /// When `strict` is `true`, the blacklist always wins. The flag is only stored here.
    pub fn set_blacklist_strict(&mut self, strict: bool) {
        self.blacklist_strict = strict;
    }
503
504    #[inline]
505    fn rebuild_blacklist_matcher(&mut self) {
506        if self.blacklist_patterns.is_empty() {
507            self.blacklist_matcher = None;
508            return;
509        }
510
511        self.blacklist_matcher =
512            AhoCorasick::new(self.blacklist_patterns.iter().map(|s| s.as_str())).ok();
513    }
514
515    #[inline]
516    fn is_blacklisted(&self, url: &str) -> bool {
517        self.blacklist_matcher
518            .as_ref()
519            .map(|m| m.is_match(url))
520            .unwrap_or(false)
521    }
522
523    /// Add one pattern (cheap) and rebuild (call this sparingly).
524    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
525        self.whitelist_patterns.push(pattern.into());
526        self.rebuild_whitelist_matcher();
527    }
528
529    /// Add many patterns and rebuild once.
530    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
531    where
532        I: IntoIterator<Item = S>,
533        S: Into<String>,
534    {
535        self.whitelist_patterns
536            .extend(patterns.into_iter().map(Into::into));
537        self.rebuild_whitelist_matcher();
538    }
539
540    #[inline]
541    fn rebuild_whitelist_matcher(&mut self) {
542        if self.whitelist_patterns.is_empty() {
543            self.whitelist_matcher = None;
544            return;
545        }
546
547        // If building fails (shouldn’t for simple patterns), just disable matcher.
548        self.whitelist_matcher =
549            AhoCorasick::new(self.whitelist_patterns.iter().map(|s| s.as_str())).ok();
550    }
551
552    #[inline]
553    fn is_whitelisted(&self, url: &str) -> bool {
554        self.whitelist_matcher
555            .as_ref()
556            .map(|m| m.is_match(url))
557            .unwrap_or(false)
558    }
559
560    /// Commands to init the chain with.
561    pub fn init_commands(&self) -> CommandChain {
562        let cmds = if self.ignore_httpserrors {
563            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
564        } else {
565            INIT_CHAIN.clone()
566        };
567        CommandChain::new(cmds, self.request_timeout)
568    }
569
570    /// Push the CDP request.
571    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
572        let method = cmd.identifier();
573        if let Ok(params) = serde_json::to_value(cmd) {
574            self.queued_events
575                .push_back(NetworkEvent::SendCdpRequest((method, params)));
576        }
577    }
578
    /// Pop the next queued event, oldest first.
    ///
    /// Returns `None` when the queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
583
584    /// Evict stale entries from the race-condition buffers and from
585    /// `attempted_authentications`. Call this periodically (e.g. from the
586    /// handler's eviction tick) so that lost CDP events cannot cause unbounded
587    /// map growth.
588    pub fn evict_stale_entries(&mut self, now: Instant) {
589        let cutoff = now - Duration::from_secs(STALE_BUFFER_SECS);
590
591        self.requests_will_be_sent.retain(|_, (_, ts)| *ts > cutoff);
592        self.request_id_to_interception_id
593            .retain(|_, (_, ts)| *ts > cutoff);
594
595        // Evict orphaned in-flight requests whose completion events
596        // (`loadingFinished` / `loadingFailed` / `loadingCanceled`) were
597        // never received.  Uses a longer timeout than the race-condition
598        // buffers since real requests can legitimately be long-lived.
599        let request_cutoff = now - Duration::from_secs(STALE_REQUEST_SECS);
600        self.requests
601            .retain(|_, req| req.created_at > request_cutoff);
602
603        // `attempted_authentications` entries reference interception IDs that
604        // are cleaned up on loading-finished / loading-failed. If those events
605        // are lost, the set grows forever. Cross-reference with `requests`:
606        // any interception ID that no longer appears in a live request is stale.
607        if !self.attempted_authentications.is_empty() {
608            let live: HashSet<&str> = self
609                .requests
610                .values()
611                .filter_map(|r| r.interception_id.as_ref().map(|id| id.as_ref()))
612                .collect();
613            self.attempted_authentications
614                .retain(|id| live.contains(id.as_ref()));
615        }
616    }
617
    /// Borrow the extra HTTP headers currently stored on the manager
    /// (as left by the last `set_extra_headers` call).
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
622
623    /// Set extra HTTP headers.
624    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
625        self.extra_headers = headers;
626        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
627        self.extra_headers.remove("Proxy-Authorization");
628        if !self.extra_headers.is_empty() {
629            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
630                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
631            }
632        }
633    }
634
    /// Queue a `SetBypassServiceWorkerParams` command carrying `bypass`.
    ///
    /// NOTE(review): despite the method name, the argument is forwarded unchanged as the
    /// *bypass* flag, so `true` requests that service workers be bypassed.
    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
    }
638
    /// Set the `block_all` kill-switch flag. Only the flag is stored; no CDP command is queued.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
642
    /// Record whether the user wants request interception, then reconcile the protocol
    /// state via `update_protocol_request_interception()`, which may queue
    /// `Fetch.enable` / `Fetch.disable`.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        self.update_protocol_request_interception();
    }
647
648    pub fn set_cache_enabled(&mut self, enabled: bool) {
649        let run = self.user_cache_disabled == enabled;
650        self.user_cache_disabled = !enabled;
651        if run {
652            self.update_protocol_cache_disabled();
653        }
654    }
655
    /// Mark Fetch interception as enabled.
    ///
    /// Only sets `protocol_request_interception_enabled`; no CDP command is queued here.
    pub fn enable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
660
    /// Mark Fetch interception as disabled.
    ///
    /// Only clears `protocol_request_interception_enabled`; no CDP command is queued here.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = false;
    }
665
    /// Set the cache site key; `None` clears it.
    #[cfg(feature = "_cache")]
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
671
    /// Set the cache policy; `None` clears it.
    #[cfg(feature = "_cache")]
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
677
678    pub fn update_protocol_cache_disabled(&mut self) {
679        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
680    }
681
    /// Store `credentials` for answering auth challenges and turn Fetch interception on.
    ///
    /// The statement order matters: the credentials are stored first so that
    /// `update_protocol_request_interception()` sees interception as wanted and queues
    /// `Fetch.enable` if the protocol flag is still `false`; the flag is set afterwards
    /// because that helper does not write it.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        self.update_protocol_request_interception();
        self.protocol_request_interception_enabled = true;
    }
687
688    fn update_protocol_request_interception(&mut self) {
689        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
690
691        if enabled == self.protocol_request_interception_enabled {
692            return;
693        }
694
695        if enabled {
696            self.push_cdp_request(ENABLE_FETCH.clone())
697        } else {
698            self.push_cdp_request(DisableParams::default())
699        }
700    }
701
702    /// Blocklist-only script blocking.
703    /// Returns true only when the URL matches an explicit blocklist condition.
704    #[inline]
705    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
706        // If analytics blocking is off, skip all analytics tries.
707        let block_analytics = self.block_analytics;
708
709        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
710        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
711        {
712            return true;
713        }
714
715        // 2) Custom website block list (explicit).
716        if crate::handler::blockers::block_websites::block_website(url) {
717            return true;
718        }
719
720        // 3) Path-based explicit tries / fallbacks.
721        //
722        // We run these on:
723        // - path with leading slash ("/js/app.js")
724        // - path without leading slash ("js/app.js")
725        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
726        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
727            // Remove query/fragment so matching stays stable.
728            let p_slash = Self::strip_query_fragment(path_with_slash);
729            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
730
731            // Basename for filename-only lists.
732            let base = match p_slash.rsplit('/').next() {
733                Some(b) => b,
734                None => p_slash,
735            };
736
737            // ---- Trie checks ----
738            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
739            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
740                return true;
741            }
742            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
743                return true;
744            }
745            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
746                return true;
747            }
748
749            // Base-path ignore tries (framework noise / known ignorable script paths).
750            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
751            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
752                return true;
753            }
754
755            // Style path ignores only when visuals are ignored.
756            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
757                return true;
758            }
759        }
760
761        false
762    }
763
764    /// Extract the absolute URL path portion WITH the leading slash.
765    ///
766    /// Example:
767    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
768    #[inline]
769    fn url_path_with_leading_slash(url: &str) -> Option<&str> {
770        // find scheme separator
771        let bytes = url.as_bytes();
772        let idx = memchr::memmem::find(bytes, b"//")?;
773        let after_slashes = idx + 2;
774
775        // find first slash after host
776        let slash_rel = memchr::memchr(b'/', &bytes[after_slashes..])?;
777        let slash_idx = after_slashes + slash_rel;
778
779        if slash_idx < url.len() {
780            Some(&url[slash_idx..])
781        } else {
782            None
783        }
784    }
785
786    /// Strip query string and fragment from a path-ish string.
787    ///
788    /// Example:
789    /// - "/a/b.js?x=1#y" -> "/a/b.js"
790    #[inline]
791    fn strip_query_fragment(s: &str) -> &str {
792        match memchr::memchr2(b'?', b'#', s.as_bytes()) {
793            Some(i) => &s[..i],
794            None => s,
795        }
796    }
797
798    /// Determine if the request should be skipped.
799    #[inline]
800    fn skip_xhr(
801        &self,
802        skip_networking: bool,
803        event: &EventRequestPaused,
804        network_event: bool,
805    ) -> bool {
806        // XHR check
807        if !skip_networking && network_event {
808            let request_url = event.request.url.as_str();
809
810            // check if part of ignore scripts.
811            let skip_analytics =
812                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
813
814            if skip_analytics {
815                true
816            } else if self.block_stylesheets || self.ignore_visuals {
817                let block_css = self.block_stylesheets;
818                let block_media = self.ignore_visuals;
819
820                let mut block_request = false;
821
822                if let Some(position) = memchr::memrchr(b'.', request_url.as_bytes()) {
823                    let hlen = request_url.len();
824                    let has_asset = hlen - position;
825
826                    if has_asset >= 3 {
827                        let next_position = position + 1;
828
829                        if block_media
830                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
831                                &request_url[next_position..].into(),
832                            )
833                        {
834                            block_request = true;
835                        } else if block_css {
836                            block_request = CaseInsensitiveString::from(
837                                &request_url.as_bytes()[next_position..],
838                            )
839                            .contains(&**CSS_EXTENSION)
840                        }
841                    }
842                }
843
844                if !block_request {
845                    block_request = ignore_script_xhr_media(request_url);
846                }
847
848                block_request
849            } else {
850                skip_networking
851            }
852        } else {
853            skip_networking
854        }
855    }
856
857    #[cfg(feature = "adblock")]
858    #[inline]
859    /// Detect if ad enabled.
860    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
861        if skip_networking {
862            true
863        } else {
864            block_ads(&event.request.url) || self.detect_ad(event)
865        }
866    }
867
868    /// When adblock feature is disabled, this is a no-op.
869    #[cfg(not(feature = "adblock"))]
870    #[inline]
871    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
872        use crate::handler::blockers::block_websites::block_ads;
873        if skip_networking {
874            true
875        } else {
876            block_ads(&event.request.url)
877        }
878    }
879
880    #[inline]
881    /// Fail request
882    fn fail_request_blocked(
883        &mut self,
884        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
885    ) {
886        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
887            request_id.clone(),
888            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
889        );
890        self.push_cdp_request(params);
891    }
892
893    #[inline]
894    /// Fulfill request
895    fn fulfill_request_empty_200(
896        &mut self,
897        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
898    ) {
899        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
900            request_id.clone(),
901            200,
902        );
903        self.push_cdp_request(params);
904    }
905
906    #[cfg(feature = "_cache")]
907    #[inline]
908    /// Fulfill a paused Fetch request from cached bytes + header map.
909    ///
910    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
911    fn fulfill_request_from_cache(
912        &mut self,
913        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
914        body: &[u8],
915        headers: &std::collections::HashMap<String, String>,
916        status: i64,
917    ) {
918        use crate::cdp::browser_protocol::fetch::HeaderEntry;
919        use crate::handler::network::fetch::FulfillRequestParams;
920        use base64::Engine;
921
922        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
923
924        for (k, v) in headers.iter() {
925            resp_headers.push(HeaderEntry {
926                name: k.clone(),
927                value: v.clone(),
928            });
929        }
930
931        let mut params = FulfillRequestParams::new(request_id.clone(), status);
932
933        // TODO: have this already encoded prior.
934        params.body = Some(
935            base64::engine::general_purpose::STANDARD
936                .encode(body)
937                .into(),
938        );
939
940        params.response_headers = Some(resp_headers);
941
942        self.push_cdp_request(params);
943    }
944
945    #[inline]
946    /// Continue the request url.
947    fn continue_request_with_url(
948        &mut self,
949        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
950        url: Option<&str>,
951        intercept_response: bool,
952    ) {
953        let mut params = ContinueRequestParams::new(request_id.clone());
954        if let Some(url) = url {
955            params.url = Some(url.to_string());
956            params.intercept_response = Some(intercept_response);
957        }
958        self.push_cdp_request(params);
959    }
960
961    /// On fetch request paused interception.
962    #[inline]
963    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
964        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
965            return;
966        }
967
968        if self.block_all {
969            tracing::debug!(
970                "Blocked (block_all): {:?} - {}",
971                event.resource_type,
972                event.request.url
973            );
974            return self.fail_request_blocked(&event.request_id);
975        }
976
977        // Capture the initiator type (set by Chrome on
978        // `Network.requestWillBeSent`) before consuming the cached event.
979        // Used below to override `block_stylesheets` for first-party CSS.
980        // For parser-dispatched <link rel="stylesheet"> requests Chrome
981        // routinely fires `Fetch.requestPaused` *before* the companion
982        // `Network.requestWillBeSent` arrives, so this is `None` for the
983        // first-party case — the override below treats unknown as
984        // "not-Script" and lets the request through. Tracker stylesheets
985        // injected by JS execute after the parser yields, by which point
986        // requestWillBeSent has populated the cache, so they carry
987        // initiator `Script` and stay blocked.
988        let initiator_type: Option<InitiatorType> = event
989            .network_id
990            .as_ref()
991            .and_then(|nid| self.requests_will_be_sent.get(nid.as_ref()))
992            .map(|(rwbs, _)| rwbs.initiator.r#type.clone());
993
994        if let Some(network_id) = event.network_id.as_ref() {
995            if let Some((request_will_be_sent, _)) =
996                self.requests_will_be_sent.remove(network_id.as_ref())
997            {
998                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
999            } else {
1000                self.request_id_to_interception_id.insert(
1001                    network_id.clone(),
1002                    (event.request_id.clone().into(), Instant::now()),
1003                );
1004            }
1005        }
1006
1007        // From here on, we handle the full decision tree.
1008        let javascript_resource = event.resource_type == ResourceType::Script;
1009        let document_resource = event.resource_type == ResourceType::Document;
1010        let network_resource =
1011            !document_resource && crate::utils::is_data_resource(&event.resource_type);
1012
1013        // Start with static / cheap skip checks.
1014        let mut skip_networking =
1015            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
1016
1017        if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
1018            skip_networking = true;
1019        }
1020
1021        // Also short-circuit if we've reloaded this document too many times.
1022        if !skip_networking {
1023            skip_networking = self.document_reload_tracker >= 3;
1024        }
1025
1026        // Handle document redirect / masking and track xml documents.
1027        let (current_url_cow, had_replacer) =
1028            self.handle_document_replacement_and_tracking(event, document_resource);
1029
1030        let current_url: &str = current_url_cow.as_ref();
1031
1032        let blacklisted = self.is_blacklisted(current_url);
1033
1034        if !self.blacklist_strict && blacklisted {
1035            skip_networking = true;
1036        }
1037
1038        if !skip_networking {
1039            // Allow XSL for sitemap XML.
1040            if self.xml_document && current_url.ends_with(".xsl") {
1041                skip_networking = false;
1042            } else {
1043                skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
1044            }
1045        }
1046
1047        // Skip ad detection for the user-requested top-level Document and
1048        // every step of its redirect chain. The crawler explicitly targets
1049        // this URL — fulfilling-empty-200 a page just because its host
1050        // matches an ad classifier breaks the user's intent (you can
1051        // legitimately want to scrape an ad page). Reproduced on
1052        // https://logrocket.com/careers, where the firewall ad list
1053        // flagged the host and chromey emitted a 17-byte stub for the
1054        // document; downstream sub-resources (script/img/iframe/etc.)
1055        // remain subject to ad blocking through the rest of the tree.
1056        //
1057        // Signals in short-circuit order (cheap → expensive):
1058        //   1. `redirected_request_id.is_some()` — explicit redirect hop
1059        //   2. `had_replacer` — chromey's masked-URL repair path
1060        //   3. `document_target_url.is_empty()` — very first nav, tracker
1061        //      not yet populated
1062        //   4. URL equality against the target — last because string
1063        //      compare is the only non-O(1) op (`handle_document_
1064        //      replacement_and_tracking` above just set the target to
1065        //      the current url, so this is the always-true fallback)
1066        //
1067        // Sub-resources (Script/Image/Font/Stylesheet/XHR/iframe content)
1068        // remain subject to ad blocking through the rest of the tree.
1069        let is_main_document_request = document_resource
1070            && (event.redirected_request_id.is_some()
1071                || had_replacer
1072                || self.document_target_url.is_empty()
1073                || event.request.url == self.document_target_url);
1074        if !is_main_document_request {
1075            skip_networking = self.detect_ad_if_enabled(event, skip_networking);
1076        }
1077
1078        // Ignore embedded scripts, tracker stylesheets, and tracker images when only_html or ignore_visuals is set.
1079        if !skip_networking
1080            && self.block_javascript
1081            && (self.only_html || self.ignore_visuals)
1082            && (javascript_resource
1083                || document_resource
1084                || event.resource_type == ResourceType::Stylesheet
1085                || event.resource_type == ResourceType::Image)
1086        {
1087            skip_networking = ignore_script_embedded(current_url);
1088        }
1089
1090        // Script policy: allow-by-default.
1091        // Block only if explicit block list patterns match.
1092        if !skip_networking && javascript_resource {
1093            skip_networking = self.should_block_script_blocklist_only(current_url);
1094        }
1095
1096        // XHR / data resources.
1097        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
1098
1099        // Custom interception layer.
1100        if !skip_networking && (javascript_resource || network_resource || document_resource) {
1101            skip_networking = self.intercept_manager.intercept_detection(
1102                current_url,
1103                self.ignore_visuals,
1104                network_resource,
1105            );
1106        }
1107
1108        // Custom website block list.
1109        if !skip_networking && (javascript_resource || network_resource) {
1110            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
1111        }
1112
1113        // whitelist 3rd party
1114        // not required unless explicit blocking.
1115        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
1116        {
1117            skip_networking = false;
1118        }
1119
1120        // check if the url is in the whitelist.
1121        if skip_networking && self.is_whitelisted(current_url) {
1122            skip_networking = false;
1123        }
1124
1125        // First-party stylesheet allow.
1126        //
1127        // `block_stylesheets` was originally a coarse "drop all CSS"
1128        // bandwidth optimization, but modern SPAs (React/Next.js with
1129        // dynamic `import()`, AppFabric, requirejs-style loaders, etc.)
1130        // gate hydration on the `load` event of stylesheets they themselves
1131        // load — blocking those leaves outer_html_bytes capturing only the
1132        // pre-hydration shell. We can't always cleanly tell first-party CSS
1133        // from third-party tracker CSS by `initiator.url` alone (page-owned
1134        // CDNs differ from the document eTLD+1, e.g. intuit.com vs
1135        // intuitcdn.net), so we use the CDP event-ordering signal instead:
1136        //
1137        //   - Parser-dispatched <link rel="stylesheet"> AND CSS injected by
1138        //     code that runs synchronously during the document parse (every
1139        //     SPA bootstrap loader) all reach `Fetch.requestPaused` before
1140        //     the companion `Network.requestWillBeSent` lands in our
1141        //     `requests_will_be_sent` cache, so `initiator_type` is `None`.
1142        //   - Tracker CSS injected by JS that runs *after* the parser
1143        //     yields (analytics tags, Hotjar, etc.) reach requestPaused
1144        //     after requestWillBeSent has populated the cache, so
1145        //     `initiator_type` is `Some(Script)`.
1146        //
1147        // So: when block_stylesheets would block a stylesheet, allow it
1148        // through *unless* we positively observed `initiator = Script`.
1149        // Behavior change is confined to stylesheets only — no other path
1150        // is touched.
1151        if skip_networking
1152            && self.block_stylesheets
1153            && event.resource_type == ResourceType::Stylesheet
1154            && !matches!(initiator_type, Some(InitiatorType::Script))
1155        {
1156            skip_networking = false;
1157        }
1158
1159        if self.blacklist_strict && blacklisted {
1160            skip_networking = true;
1161        }
1162
1163        if skip_networking {
1164            tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
1165            self.fulfill_request_empty_200(&event.request_id);
1166        } else {
1167            #[cfg(feature = "_cache")]
1168            {
1169                if let (Some(policy), Some(cache_site_key)) =
1170                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
1171                {
1172                    let current_url = format!("{}:{}", event.request.method, &current_url);
1173
1174                    if let Some((res, cache_policy)) =
1175                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
1176                    {
1177                        if policy.allows_cached(&cache_policy) {
1178                            tracing::debug!(
1179                                "Remote Cached: {:?} - {}",
1180                                &event.resource_type,
1181                                &current_url
1182                            );
1183                            let flat_headers = crate::http::headers_from_multi(&res.headers);
1184                            return self.fulfill_request_from_cache(
1185                                &event.request_id,
1186                                &res.body,
1187                                &flat_headers,
1188                                res.status as i64,
1189                            );
1190                        }
1191                    }
1192                }
1193            }
1194
1195            // check our frame cache for the run.
1196            tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
1197            self.continue_request_with_url(
1198                &event.request_id,
1199                if had_replacer {
1200                    Some(current_url)
1201                } else {
1202                    None
1203                },
1204                !had_replacer,
1205            );
1206        }
1207    }
1208
1209    /// Shared "visuals + basic blocking" logic.
1210    ///
1211    /// IMPORTANT: Scripts are NOT blocked here anymore.
1212    /// Scripts are allowed by default and only blocked via explicit blocklists
1213    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1214    #[inline]
1215    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1216        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1217            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1218    }
1219
1220    /// Does the network manager have a target domain?
1221    pub fn has_target_domain(&self) -> bool {
1222        !self.document_target_url.is_empty()
1223    }
1224
1225    /// Set the target page url for tracking.
1226    pub fn set_page_url(&mut self, page_target_url: String) {
1227        let host_base = host_and_rest(&page_target_url)
1228            .map(|(h, _)| base_domain_from_host(h))
1229            .unwrap_or("");
1230
1231        self.document_target_domain = host_base.to_string();
1232        self.document_target_url = page_target_url;
1233    }
1234
1235    /// Clear the initial target domain on every navigation.
1236    pub fn clear_target_domain(&mut self) {
1237        self.document_reload_tracker = 0;
1238        self.document_target_url = Default::default();
1239        self.document_target_domain = Default::default();
1240    }
1241
1242    /// Handles:
1243    /// - document reload tracking (`document_reload_tracker`)
1244    /// - redirect masking / replacement
1245    /// - xml document detection (`xml_document`)
1246    /// - `document_target_url` updates
1247    ///
1248    /// Returns (current_url, had_replacer).
1249    #[inline]
1250    fn handle_document_replacement_and_tracking<'a>(
1251        &mut self,
1252        event: &'a EventRequestPaused,
1253        document_resource: bool,
1254    ) -> (Cow<'a, str>, bool) {
1255        let mut replacer: Option<String> = None;
1256        let current_url = event.request.url.as_str();
1257
1258        if document_resource {
1259            if self.document_target_url == current_url {
1260                self.document_reload_tracker += 1;
1261            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1262            {
1263                let (http_document_replacement, mut https_document_replacement) =
1264                    if self.document_target_url.starts_with("http://") {
1265                        (
1266                            self.document_target_url.replacen("http://", "http//", 1),
1267                            self.document_target_url.replacen("http://", "https://", 1),
1268                        )
1269                    } else {
1270                        (
1271                            self.document_target_url.replacen("https://", "https//", 1),
1272                            self.document_target_url.replacen("https://", "http://", 1),
1273                        )
1274                    };
1275
1276                // Track trailing slash to restore later.
1277                let trailing = https_document_replacement.ends_with('/');
1278                if trailing {
1279                    https_document_replacement.pop();
1280                }
1281                if https_document_replacement.ends_with('/') {
1282                    https_document_replacement.pop();
1283                }
1284
1285                let redirect_mask = format!(
1286                    "{}{}",
1287                    https_document_replacement, http_document_replacement
1288                );
1289
1290                if current_url == redirect_mask {
1291                    replacer = Some(if trailing {
1292                        format!("{}/", https_document_replacement)
1293                    } else {
1294                        https_document_replacement
1295                    });
1296                }
1297            }
1298
1299            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1300                self.xml_document = true;
1301            }
1302
1303            // Track last seen document URL.
1304            self.document_target_url = event.request.url.clone();
1305            self.document_target_domain = host_and_rest(&self.document_target_url)
1306                .map(|(h, _)| base_domain_from_host(h).to_string())
1307                .unwrap_or_default();
1308        }
1309
1310        let current_url_cow = match replacer {
1311            Some(r) => Cow::Owned(r),
1312            None => Cow::Borrowed(event.request.url.as_str()),
1313        };
1314
1315        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1316        (current_url_cow, had_replacer)
1317    }
1318
1319    /// Perform a page intercept for chrome using the adblock engine.
1320    /// Uses the custom engine when user-supplied filter rules are configured,
1321    /// otherwise falls back to the global default engine with built-in patterns.
1322    #[cfg(feature = "adblock")]
1323    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1324        use adblock::{
1325            lists::{FilterSet, ParseOptions, RuleTypes},
1326            Engine,
1327        };
1328
1329        lazy_static::lazy_static! {
1330            static ref AD_ENGINE: Engine = {
1331                let mut filter_set = FilterSet::new(false);
1332                let mut rules = ParseOptions::default();
1333                rules.rule_types = RuleTypes::All;
1334
1335                filter_set.add_filters(
1336                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1337                    rules,
1338                );
1339
1340                // When adblock_easylist is enabled, EasyList + EasyPrivacy are
1341                // embedded at build time for zero-cost runtime loading.
1342                #[cfg(feature = "adblock_easylist")]
1343                {
1344                    static EASYLIST: &str = include_str!(concat!(env!("OUT_DIR"), "/easylist.txt"));
1345                    static EASYPRIVACY: &str = include_str!(concat!(env!("OUT_DIR"), "/easyprivacy.txt"));
1346
1347                    if !EASYLIST.is_empty() {
1348                        filter_set.add_filter_list(EASYLIST, rules);
1349                    }
1350                    if !EASYPRIVACY.is_empty() {
1351                        filter_set.add_filter_list(EASYPRIVACY, rules);
1352                    }
1353                }
1354
1355                Engine::from_filter_set(filter_set, true)
1356            };
1357        }
1358
1359        let blockable = event.resource_type == ResourceType::Script
1360            || event.resource_type == ResourceType::Image
1361            || event.resource_type == ResourceType::Media
1362            || event.resource_type == ResourceType::Stylesheet
1363            || event.resource_type == ResourceType::Document
1364            || event.resource_type == ResourceType::Fetch
1365            || event.resource_type == ResourceType::Xhr;
1366
1367        if !blockable {
1368            return false;
1369        }
1370
1371        let u = &event.request.url;
1372
1373        let source_domain = if self.document_target_domain.is_empty() {
1374            "example.com"
1375        } else {
1376            &self.document_target_domain
1377        };
1378
1379        // Fast hostname extraction without full URL parsing.
1380        // preparsed(url, request_hostname, source_hostname, type, third_party)
1381        let hostname = u
1382            .strip_prefix("https://")
1383            .or_else(|| u.strip_prefix("http://"))
1384            .and_then(|rest| rest.split('/').next())
1385            // Strip userinfo (user:pass@) if present.
1386            .map(
1387                |authority| match memchr::memrchr(b'@', authority.as_bytes()) {
1388                    Some(i) => &authority[i + 1..],
1389                    None => authority,
1390                },
1391            )
1392            // Strip port (:8080) if present.
1393            .and_then(|host_port| host_port.split(':').next())
1394            .unwrap_or(source_domain);
1395
1396        let resource_type_str = match event.resource_type {
1397            ResourceType::Script => "script",
1398            ResourceType::Image => "image",
1399            ResourceType::Media => "media",
1400            ResourceType::Stylesheet => "stylesheet",
1401            ResourceType::Document => "document",
1402            ResourceType::Fetch => "fetch",
1403            ResourceType::Xhr => "xhr",
1404            _ => "other",
1405        };
1406
1407        let request = adblock::request::Request::preparsed(
1408            u,
1409            hostname,
1410            source_domain,
1411            resource_type_str,
1412            !event.request.is_same_site.unwrap_or_default(),
1413        );
1414
1415        let engine: &Engine = match self.adblock_engine.as_ref() {
1416            Some(custom) => custom,
1417            None => &AD_ENGINE,
1418        };
1419
1420        engine.check_network_request(&request).matched
1421    }
1422
1423    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1424        let response = if self
1425            .attempted_authentications
1426            .contains(event.request_id.as_ref())
1427        {
1428            AuthChallengeResponseResponse::CancelAuth
1429        } else if self.credentials.is_some() {
1430            self.attempted_authentications
1431                .insert(event.request_id.clone().into());
1432            AuthChallengeResponseResponse::ProvideCredentials
1433        } else {
1434            AuthChallengeResponseResponse::Default
1435        };
1436
1437        let mut auth = AuthChallengeResponse::new(response);
1438        if let Some(creds) = self.credentials.clone() {
1439            auth.username = Some(creds.username);
1440            auth.password = Some(creds.password);
1441        }
1442        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1443    }
1444
1445    /// Set the page offline network emulation condition.
1446    pub fn set_offline_mode(&mut self, value: bool) {
1447        if self.offline == value {
1448            return;
1449        }
1450        self.offline = value;
1451        if let Ok(condition) = NetworkConditions::builder()
1452            .url_pattern("")
1453            .latency(0)
1454            .download_throughput(-1.)
1455            .upload_throughput(-1.)
1456            .build()
1457        {
1458            if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1459                .offline(self.offline)
1460                .matched_network_condition(condition)
1461                .build()
1462            {
1463                self.push_cdp_request(network);
1464            }
1465        }
1466    }
1467
1468    /// Request interception doesn't happen for data URLs with Network Service.
1469    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1470        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1471            if let Some((interception_id, _)) = self
1472                .request_id_to_interception_id
1473                .remove(event.request_id.as_ref())
1474            {
1475                self.on_request(event, Some(interception_id));
1476            } else {
1477                self.requests_will_be_sent
1478                    .insert(event.request_id.clone(), (event.clone(), Instant::now()));
1479            }
1480        } else {
1481            self.on_request(event, None);
1482        }
1483    }
1484
1485    /// The request was served from the cache.
1486    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1487        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1488            request.from_memory_cache = true;
1489        }
1490    }
1491
1492    /// On network response received.
1493    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1494        let mut request_failed = false;
1495
1496        // Track how many bytes we actually deducted from this target.
1497        let mut deducted: u64 = 0;
1498
1499        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1500            let before = *max_bytes;
1501
1502            // encoded_data_length -> saturating cast to u64
1503            let received_bytes: u64 = event.response.encoded_data_length as u64;
1504
1505            // Safe parse of Content-Length
1506            let content_length: Option<u64> = event
1507                .response
1508                .headers
1509                .inner()
1510                .get("content-length")
1511                .and_then(|v| v.as_str())
1512                .and_then(|s| s.trim().parse::<u64>().ok());
1513
1514            // Deduct what we actually received
1515            *max_bytes = max_bytes.saturating_sub(received_bytes);
1516
1517            // If the declared size can't fit, zero out now
1518            if let Some(cl) = content_length {
1519                if cl > *max_bytes {
1520                    *max_bytes = 0;
1521                }
1522            }
1523
1524            request_failed = *max_bytes == 0;
1525
1526            // Compute exact delta deducted on this event
1527            deducted = before.saturating_sub(*max_bytes);
1528        }
1529
1530        // Bubble up the deduction (even if request continues)
1531        if deducted > 0 {
1532            self.queued_events
1533                .push_back(NetworkEvent::BytesConsumed(deducted));
1534        }
1535
1536        // block all network request moving forward.
1537        if request_failed && self.max_bytes_allowed.is_some() {
1538            self.set_block_all(true);
1539        }
1540
1541        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1542            request.set_response(event.response.clone());
1543            self.queued_events.push_back(if request_failed {
1544                NetworkEvent::RequestFailed(request)
1545            } else {
1546                NetworkEvent::RequestFinished(request)
1547            });
1548        }
1549    }
1550
1551    /// On network loading finished.
1552    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1553        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1554            if let Some(interception_id) = request.interception_id.as_ref() {
1555                self.attempted_authentications
1556                    .remove(interception_id.as_ref());
1557            }
1558            self.queued_events
1559                .push_back(NetworkEvent::RequestFinished(request));
1560        }
1561    }
1562
1563    /// On network loading failed.
1564    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1565        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1566            request.failure_text = Some(event.error_text.clone());
1567            if let Some(interception_id) = request.interception_id.as_ref() {
1568                self.attempted_authentications
1569                    .remove(interception_id.as_ref());
1570            }
1571            self.queued_events
1572                .push_back(NetworkEvent::RequestFailed(request));
1573        }
1574    }
1575
1576    /// On request will be sent.
1577    fn on_request(
1578        &mut self,
1579        event: &EventRequestWillBeSent,
1580        interception_id: Option<InterceptionId>,
1581    ) {
1582        let mut redirect_chain = Vec::new();
1583        let mut redirect_location = None;
1584
1585        if let Some(redirect_resp) = &event.redirect_response {
1586            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1587                if is_redirect_status(redirect_resp.status) {
1588                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1589                        if redirect_resp.url != location {
1590                            let fixed_location = location.replace(&redirect_resp.url, "");
1591
1592                            if !fixed_location.is_empty() {
1593                                if let Some(resp) = request.response.as_mut() {
1594                                    resp.headers.0["Location"] =
1595                                        serde_json::Value::String(fixed_location.clone());
1596                                }
1597                            }
1598
1599                            redirect_location = Some(fixed_location);
1600                        }
1601                    }
1602                }
1603
1604                {
1605                    let mut redirect_resp = redirect_resp.clone();
1606
1607                    if let Some(redirect_location) = redirect_location {
1608                        if !redirect_location.is_empty() {
1609                            redirect_resp.headers.0["Location"] =
1610                                serde_json::Value::String(redirect_location);
1611                        }
1612                    }
1613
1614                    self.handle_request_redirect(&mut request, redirect_resp);
1615                }
1616
1617                redirect_chain = std::mem::take(&mut request.redirect_chain);
1618                redirect_chain.push(request);
1619            }
1620        }
1621
1622        // Redirect cap: applies only to Document-type hops and only when
1623        // `max_redirects` is set. Sub-resource chains are untouched.
1624        if let Some(cap) = self.max_redirects {
1625            let is_document = matches!(event.r#type, Some(ResourceType::Document));
1626            if is_document && redirect_chain.len() > cap {
1627                let mut failed = HttpRequest::new(
1628                    event.request_id.clone(),
1629                    event.frame_id.clone(),
1630                    interception_id,
1631                    self.user_request_interception_enabled,
1632                    redirect_chain,
1633                );
1634                failed.url = Some(event.request.url.clone());
1635                failed.method = Some(event.request.method.clone());
1636                failed.failure_text = Some("net::ERR_TOO_MANY_REDIRECTS".into());
1637                self.push_cdp_request(
1638                    chromiumoxide_cdp::cdp::browser_protocol::page::StopLoadingParams::default(),
1639                );
1640                self.queued_events
1641                    .push_back(NetworkEvent::RequestFailed(failed));
1642                return;
1643            }
1644        }
1645
1646        let request = HttpRequest::new(
1647            event.request_id.clone(),
1648            event.frame_id.clone(),
1649            interception_id,
1650            self.user_request_interception_enabled,
1651            redirect_chain,
1652        );
1653
1654        let rid = event.request_id.clone();
1655        self.queued_events
1656            .push_back(NetworkEvent::Request(rid.clone()));
1657        self.requests.insert(rid, request);
1658    }
1659
1660    /// Handle request redirect.
1661    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1662        request.set_response(response);
1663        if let Some(interception_id) = request.interception_id.as_ref() {
1664            self.attempted_authentications
1665                .remove(interception_id.as_ref());
1666        }
1667    }
1668}
1669
1670#[derive(Debug)]
1671pub enum NetworkEvent {
1672    /// Send a CDP request.
1673    SendCdpRequest((MethodId, serde_json::Value)),
1674    /// Request.
1675    Request(RequestId),
1676    /// Response
1677    Response(RequestId),
1678    /// Request failed.
1679    RequestFailed(HttpRequest),
1680    /// Request finished.
1681    RequestFinished(HttpRequest),
1682    /// Bytes consumed.
1683    BytesConsumed(u64),
1684}
1685
1686#[cfg(test)]
1687mod tests {
1688    use super::ALLOWED_MATCHER_3RD_PARTY;
1689    use crate::handler::network::NetworkManager;
1690    use std::time::Duration;
1691
1692    #[test]
1693    fn test_allowed_matcher_3rd_party() {
1694        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
1695        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
1696        assert!(
1697            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
1698            "expected Cloudflare challenge script to be allowed"
1699        );
1700
1701        // Should NOT be allowed (not in allow-list)
1702        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
1703        assert!(
1704            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
1705            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
1706        );
1707
1708        // A couple sanity checks for existing allow patterns
1709        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
1710        assert!(ALLOWED_MATCHER_3RD_PARTY
1711            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
1712        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
1713    }
1714
1715    #[test]
1716    fn test_script_allowed_by_default_when_not_blocklisted() {
1717        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1718        nm.set_page_url(
1719            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
1720        );
1721
1722        // A random script that should not match your block tries.
1723        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
1724        assert!(
1725            !nm.should_block_script_blocklist_only(ok),
1726            "expected non-blocklisted script to be allowed"
1727        );
1728    }
1729
1730    #[test]
1731    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
1732        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1733        nm.set_page_url(
1734            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
1735        );
1736
1737        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
1738        let bad = "https://cdn.example.net/js/analytics.js";
1739        assert!(
1740            nm.should_block_script_blocklist_only(bad),
1741            "expected analytics.js to be blocklisted"
1742        );
1743    }
1744
1745    #[test]
1746    fn test_allowed_matcher_3rd_party_sanity() {
1747        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
1748        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
1749        assert!(
1750            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
1751            "expected Cloudflare challenge script to be allowed"
1752        );
1753
1754        // Should NOT be allowed (not in allow-list)
1755        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
1756        assert!(
1757            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
1758            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
1759        );
1760
1761        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
1762        assert!(ALLOWED_MATCHER_3RD_PARTY
1763            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
1764        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
1765    }
1766    #[test]
1767    fn test_dynamic_blacklist_blocks_url() {
1768        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1769        nm.set_page_url("https://example.com/".to_string());
1770
1771        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
1772        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
1773        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));
1774
1775        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
1776    }
1777
1778    #[test]
1779    fn test_blacklist_strict_wins_over_whitelist() {
1780        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1781        nm.set_page_url("https://example.com/".to_string());
1782
1783        // Same URL in both lists.
1784        nm.set_blacklist_patterns(["beacon.min.js"]);
1785        nm.set_whitelist_patterns(["beacon.min.js"]);
1786
1787        nm.set_blacklist_strict(true);
1788
1789        let u = "https://static.cloudflareinsights.com/beacon.min.js";
1790        assert!(nm.is_whitelisted(u));
1791        assert!(nm.is_blacklisted(u));
1792
1793        // In strict mode, it should still be considered blocked at decision time.
1794        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
1795        assert!(nm.blacklist_strict);
1796    }
1797
1798    #[cfg(feature = "adblock")]
1799    fn make_request_paused(
1800        url: &str,
1801        resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType,
1802        is_same_site: bool,
1803    ) -> chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused {
1804        use chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused;
1805        use chromiumoxide_cdp::cdp::browser_protocol::network::{
1806            Headers, Request, RequestReferrerPolicy, ResourcePriority,
1807        };
1808
1809        EventRequestPaused {
1810            request_id: chromiumoxide_cdp::cdp::browser_protocol::network::RequestId::from(
1811                "test-req".to_string(),
1812            )
1813            .into(),
1814            request: Request {
1815                url: url.to_string(),
1816                method: "GET".to_string(),
1817                headers: Headers::new(serde_json::Value::Object(Default::default())),
1818                initial_priority: ResourcePriority::Medium,
1819                referrer_policy: RequestReferrerPolicy::NoReferrer,
1820                url_fragment: None,
1821                has_post_data: None,
1822                post_data_entries: None,
1823                mixed_content_type: None,
1824                is_link_preload: None,
1825                trust_token_params: None,
1826                is_same_site: Some(is_same_site),
1827                is_ad_related: None,
1828            },
1829            frame_id: chromiumoxide_cdp::cdp::browser_protocol::page::FrameId::from(
1830                "frame1".to_string(),
1831            ),
1832            resource_type,
1833            response_error_reason: None,
1834            response_status_code: None,
1835            response_status_text: None,
1836            response_headers: None,
1837            network_id: None,
1838            redirected_request_id: None,
1839        }
1840    }
1841
1842    #[cfg(feature = "adblock")]
1843    #[test]
1844    fn test_detect_ad_blocks_known_tracker_scripts() {
1845        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1846
1847        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1848        nm.set_page_url("https://www.wine-searcher.com/".to_string());
1849
1850        let event = make_request_paused(
1851            "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
1852            ResourceType::Script,
1853            false,
1854        );
1855
1856        assert!(
1857            nm.detect_ad(&event),
1858            "googletagmanager.com script should be detected as ad"
1859        );
1860    }
1861
1862    #[cfg(feature = "adblock")]
1863    #[test]
1864    fn test_detect_ad_allows_legitimate_scripts() {
1865        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1866
1867        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1868        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
1869
1870        let event = make_request_paused(
1871            "https://www.mylegitsite-test.com/static/js/app-bundle.js",
1872            ResourceType::Script,
1873            true,
1874        );
1875
1876        assert!(
1877            !nm.detect_ad(&event),
1878            "legitimate first-party app bundle should not be blocked"
1879        );
1880    }
1881
1882    #[cfg(feature = "adblock")]
1883    #[test]
1884    fn test_detect_ad_uses_source_domain() {
1885        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1886
1887        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1888        nm.set_page_url("https://www.wine-searcher.com/some-page".to_string());
1889
1890        assert!(
1891            !nm.document_target_domain.is_empty(),
1892            "document_target_domain should be set after set_page_url"
1893        );
1894
1895        let event = make_request_paused(
1896            "https://www.google-analytics.com/analytics.js",
1897            ResourceType::Script,
1898            false,
1899        );
1900
1901        assert!(
1902            nm.detect_ad(&event),
1903            "google-analytics.com should be blocked as tracker"
1904        );
1905    }
1906
1907    #[cfg(feature = "adblock")]
1908    #[test]
1909    fn test_custom_adblock_engine_takes_precedence() {
1910        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1911
1912        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1913        nm.set_page_url("https://example.com/".to_string());
1914
1915        // Build a custom engine with a specific rule.
1916        let mut filter_set = adblock::lists::FilterSet::new(false);
1917        let mut opts = adblock::lists::ParseOptions::default();
1918        opts.rule_types = adblock::lists::RuleTypes::All;
1919        filter_set.add_filters(["||custom-tracker.example.net^"], opts);
1920        let engine = adblock::Engine::from_filter_set(filter_set, true);
1921        nm.set_adblock_engine(std::sync::Arc::new(engine));
1922
1923        let event = make_request_paused(
1924            "https://custom-tracker.example.net/pixel.js",
1925            ResourceType::Script,
1926            false,
1927        );
1928
1929        assert!(
1930            nm.detect_ad(&event),
1931            "custom engine rule should block custom-tracker.example.net"
1932        );
1933    }
1934
1935    /// Helper: run a URL through the full `on_fetch_request_paused` pipeline
1936    /// and return whether it was blocked (true) or allowed (false).
1937    #[cfg(feature = "adblock")]
1938    fn run_full_interception(
1939        nm: &mut NetworkManager,
1940        url: &str,
1941        resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType,
1942        is_same_site: bool,
1943    ) -> bool {
1944        use super::NetworkEvent;
1945
1946        // Drain any prior events.
1947        while nm.poll().is_some() {}
1948
1949        let event = make_request_paused(url, resource_type, is_same_site);
1950        nm.on_fetch_request_paused(&event);
1951
1952        // Check what was emitted: Fetch.fulfillRequest = blocked, Fetch.continueRequest = allowed.
1953        let mut blocked = false;
1954        while let Some(ev) = nm.poll() {
1955            if let NetworkEvent::SendCdpRequest((method, _)) = &ev {
1956                let m: &str = method.as_ref();
1957                if m == "Fetch.fulfillRequest" || m == "Fetch.failRequest" {
1958                    blocked = true;
1959                }
1960            }
1961        }
1962        blocked
1963    }
1964
1965    // ── End-to-end interception tests ───────────────────────────────────
1966
1967    #[cfg(feature = "adblock")]
1968    #[test]
1969    fn test_e2e_tracker_script_blocked() {
1970        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1971
1972        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1973        nm.set_page_url("https://www.wine-searcher.com/".to_string());
1974
1975        assert!(
1976            run_full_interception(
1977                &mut nm,
1978                "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
1979                ResourceType::Script,
1980                false,
1981            ),
1982            "GTM script should be blocked through full pipeline"
1983        );
1984    }
1985
1986    #[cfg(feature = "adblock")]
1987    #[test]
1988    fn test_e2e_legitimate_script_allowed() {
1989        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1990
1991        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1992        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
1993
1994        assert!(
1995            !run_full_interception(
1996                &mut nm,
1997                "https://www.mylegitsite-test.com/static/js/app-bundle.js",
1998                ResourceType::Script,
1999                true,
2000            ),
2001            "legitimate first-party script should be allowed through full pipeline"
2002        );
2003    }
2004
2005    #[cfg(feature = "adblock")]
2006    #[test]
2007    fn test_e2e_analytics_xhr_blocked() {
2008        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2009
2010        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2011        nm.set_page_url("https://example.org/".to_string());
2012
2013        assert!(
2014            run_full_interception(
2015                &mut nm,
2016                "https://www.google-analytics.com/g/collect?v=2&tid=UA-123",
2017                ResourceType::Xhr,
2018                false,
2019            ),
2020            "Google Analytics XHR should be blocked through full pipeline"
2021        );
2022    }
2023
2024    #[cfg(feature = "adblock")]
2025    #[test]
2026    fn test_e2e_whitelisted_overrides_adblock() {
2027        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2028
2029        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2030        nm.set_page_url("https://example.org/".to_string());
2031        nm.set_whitelist_patterns(["googletagmanager.com"]);
2032
2033        // GTM would normally be blocked by adblock, but whitelist overrides.
2034        assert!(
2035            !run_full_interception(
2036                &mut nm,
2037                "https://www.googletagmanager.com/gtm.js?id=GTM-TEST",
2038                ResourceType::Script,
2039                false,
2040            ),
2041            "whitelisted tracker should be allowed even when adblock would block it"
2042        );
2043    }
2044
2045    #[cfg(feature = "adblock")]
2046    #[test]
2047    fn test_e2e_blacklist_strict_overrides_whitelist() {
2048        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2049
2050        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2051        nm.set_page_url("https://example.org/".to_string());
2052        nm.set_blacklist_patterns(["cdn.example.net/evil.js"]);
2053        nm.set_whitelist_patterns(["cdn.example.net/evil.js"]);
2054        nm.set_blacklist_strict(true);
2055
2056        assert!(
2057            run_full_interception(
2058                &mut nm,
2059                "https://cdn.example.net/evil.js",
2060                ResourceType::Script,
2061                false,
2062            ),
2063            "strict blacklist should win over whitelist"
2064        );
2065    }
2066
2067    #[cfg(feature = "adblock")]
2068    #[test]
2069    fn test_e2e_first_party_document_not_blocked() {
2070        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2071
2072        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2073        nm.set_page_url("https://www.nytimes.com/".to_string());
2074
2075        assert!(
2076            !run_full_interception(
2077                &mut nm,
2078                "https://www.nytimes.com/2024/article.html",
2079                ResourceType::Document,
2080                true,
2081            ),
2082            "first-party document navigation should never be blocked"
2083        );
2084    }
2085
2086    #[cfg(feature = "adblock")]
2087    #[test]
2088    fn test_e2e_custom_engine_blocks_through_pipeline() {
2089        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2090
2091        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2092        nm.set_page_url("https://mysite.com/".to_string());
2093
2094        let mut filter_set = adblock::lists::FilterSet::new(false);
2095        let mut opts = adblock::lists::ParseOptions::default();
2096        opts.rule_types = adblock::lists::RuleTypes::All;
2097        filter_set.add_filters(["||evil-cdn.example.net^$script"], opts);
2098        let engine = adblock::Engine::from_filter_set(filter_set, true);
2099        nm.set_adblock_engine(std::sync::Arc::new(engine));
2100
2101        assert!(
2102            run_full_interception(
2103                &mut nm,
2104                "https://evil-cdn.example.net/tracker.js",
2105                ResourceType::Script,
2106                false,
2107            ),
2108            "custom engine rule should block through full pipeline"
2109        );
2110
2111        // Legitimate script on the same site should still pass.
2112        assert!(
2113            !run_full_interception(
2114                &mut nm,
2115                "https://mysite.com/app.js",
2116                ResourceType::Script,
2117                true,
2118            ),
2119            "first-party script should still be allowed with custom engine"
2120        );
2121    }
2122
2123    #[cfg(feature = "adblock")]
2124    #[test]
2125    fn test_e2e_ad_image_blocked() {
2126        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2127
2128        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2129        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
2130
2131        // Ad tracking pixel should be blocked via adblock pattern or trie.
2132        assert!(
2133            run_full_interception(
2134                &mut nm,
2135                "https://googleads.g.doubleclick.net/pagead/viewthroughconversion/123/?random=456",
2136                ResourceType::Image,
2137                false,
2138            ),
2139            "doubleclick ad image/tracking pixel should be blocked"
2140        );
2141
2142        // Legitimate first-party image should pass.
2143        assert!(
2144            !run_full_interception(
2145                &mut nm,
2146                "https://www.mylegitsite-test.com/images/logo.png",
2147                ResourceType::Image,
2148                true,
2149            ),
2150            "legitimate first-party image should not be blocked"
2151        );
2152    }
2153
2154    #[cfg(feature = "adblock")]
2155    #[test]
2156    fn test_e2e_hostname_with_userinfo() {
2157        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2158
2159        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2160        nm.set_page_url("https://example.org/".to_string());
2161
2162        // URL with userinfo should still correctly identify googletagmanager.com.
2163        assert!(
2164            run_full_interception(
2165                &mut nm,
2166                "https://user:pass@www.googletagmanager.com/gtm.js?id=GTM-XXXX",
2167                ResourceType::Script,
2168                false,
2169            ),
2170            "tracker URL with userinfo should still be blocked"
2171        );
2172    }
2173
2174    #[test]
2175    fn test_blacklist_non_strict_allows_whitelist_override() {
2176        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2177        nm.set_page_url("https://example.com/".to_string());
2178
2179        nm.set_blacklist_patterns(["beacon.min.js"]);
2180        nm.set_whitelist_patterns(["beacon.min.js"]);
2181
2182        nm.set_blacklist_strict(false);
2183
2184        let u = "https://static.cloudflareinsights.com/beacon.min.js";
2185        assert!(nm.is_blacklisted(u));
2186        assert!(nm.is_whitelisted(u));
2187        assert!(!nm.blacklist_strict);
2188    }
2189
2190    // ── max_redirects enforcement ───────────────────────────────────────
2191    //
2192    // The redirect cap short-circuits in NetworkManager::on_request when a
2193    // Document-type chain exceeds the configured limit. We drive it via the
2194    // public on_request_will_be_sent entry point by deserializing synthetic
2195    // events — builder APIs exist but require every non-optional field, and
2196    // JSON is less fragile to cdp schema additions.
2197
2198    fn make_request_will_be_sent(
2199        request_id: &str,
2200        url: &str,
2201        resource_type: &str,
2202        redirect_from_url: Option<&str>,
2203    ) -> chromiumoxide_cdp::cdp::browser_protocol::network::EventRequestWillBeSent {
2204        let mut v = serde_json::json!({
2205            "requestId": request_id,
2206            "loaderId": "test-loader",
2207            "documentURL": url,
2208            "request": {
2209                "url": url,
2210                "method": "GET",
2211                "headers": {},
2212                "initialPriority": "Medium",
2213                "referrerPolicy": "no-referrer"
2214            },
2215            "timestamp": 0.0,
2216            "wallTime": 0.0,
2217            "initiator": { "type": "other" },
2218            "redirectHasExtraInfo": false,
2219            "type": resource_type,
2220            "frameId": "frame1"
2221        });
2222        if let Some(from) = redirect_from_url {
2223            v["redirectResponse"] = serde_json::json!({
2224                "url": from,
2225                "status": 302,
2226                "statusText": "Found",
2227                "headers": { "Location": url },
2228                "mimeType": "text/html",
2229                "charset": "",
2230                "connectionReused": false,
2231                "connectionId": 0.0,
2232                "encodedDataLength": 0.0,
2233                "securityState": "unknown"
2234            });
2235        }
2236        serde_json::from_value(v).expect("EventRequestWillBeSent should deserialize")
2237    }
2238
2239    fn drain_too_many_redirects(nm: &mut NetworkManager) -> Option<super::HttpRequest> {
2240        while let Some(ev) = nm.poll() {
2241            if let super::NetworkEvent::RequestFailed(req) = ev {
2242                if req.failure_text.as_deref() == Some("net::ERR_TOO_MANY_REDIRECTS") {
2243                    return Some(req);
2244                }
2245            }
2246        }
2247        None
2248    }
2249
2250    fn drain_stop_loading(nm: &mut NetworkManager) -> bool {
2251        while let Some(ev) = nm.poll() {
2252            if let super::NetworkEvent::SendCdpRequest((method, _)) = ev {
2253                let m: &str = method.as_ref();
2254                if m == "Page.stopLoading" {
2255                    return true;
2256                }
2257            }
2258        }
2259        false
2260    }
2261
2262    #[test]
2263    fn test_max_redirects_none_allows_unlimited_chain() {
2264        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2265        // max_redirects left at its default (None).
2266
2267        // 10 sequential Document hops sharing the same request_id.
2268        nm.on_request_will_be_sent(&make_request_will_be_sent(
2269            "r1",
2270            "https://example.com/0",
2271            "Document",
2272            None,
2273        ));
2274        for i in 1..10 {
2275            nm.on_request_will_be_sent(&make_request_will_be_sent(
2276                "r1",
2277                &format!("https://example.com/{i}"),
2278                "Document",
2279                Some(&format!("https://example.com/{}", i - 1)),
2280            ));
2281        }
2282
2283        assert!(
2284            drain_too_many_redirects(&mut nm).is_none(),
2285            "no cap set: chain of 10 hops must not emit ERR_TOO_MANY_REDIRECTS"
2286        );
2287    }
2288
2289    #[test]
2290    fn test_max_redirects_caps_document_chain() {
2291        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2292        nm.max_redirects = Some(3);
2293
2294        // Initial request + 4 redirect hops. The 4th redirect (chain length 4 > 3)
2295        // must trip the cap.
2296        nm.on_request_will_be_sent(&make_request_will_be_sent(
2297            "r1",
2298            "https://example.com/0",
2299            "Document",
2300            None,
2301        ));
2302        for i in 1..=4 {
2303            nm.on_request_will_be_sent(&make_request_will_be_sent(
2304                "r1",
2305                &format!("https://example.com/{i}"),
2306                "Document",
2307                Some(&format!("https://example.com/{}", i - 1)),
2308            ));
2309        }
2310
2311        let failed = drain_too_many_redirects(&mut nm)
2312            .expect("cap of 3 on a 4-hop chain must emit ERR_TOO_MANY_REDIRECTS");
2313        assert_eq!(
2314            failed.redirect_chain.len(),
2315            4,
2316            "failed request should preserve the full accumulated chain"
2317        );
2318        assert_eq!(
2319            failed.url.as_deref(),
2320            Some("https://example.com/4"),
2321            "failed request url should be the hop that tripped the cap"
2322        );
2323
2324        // Second navigation after the cap is tripped must also schedule
2325        // Page.stopLoading to actually abort the tab.
2326        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2327        nm.max_redirects = Some(3);
2328        nm.on_request_will_be_sent(&make_request_will_be_sent(
2329            "r2",
2330            "https://example.com/0",
2331            "Document",
2332            None,
2333        ));
2334        for i in 1..=4 {
2335            nm.on_request_will_be_sent(&make_request_will_be_sent(
2336                "r2",
2337                &format!("https://example.com/{i}"),
2338                "Document",
2339                Some(&format!("https://example.com/{}", i - 1)),
2340            ));
2341        }
2342        assert!(
2343            drain_stop_loading(&mut nm),
2344            "cap hit must dispatch Page.stopLoading to abort navigation"
2345        );
2346    }
2347
2348    #[test]
2349    fn test_max_redirects_ignores_subresources() {
2350        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2351        nm.max_redirects = Some(2);
2352
2353        // A 5-hop script redirect chain — sub-resources are exempt by design.
2354        nm.on_request_will_be_sent(&make_request_will_be_sent(
2355            "s1",
2356            "https://cdn.example.com/0.js",
2357            "Script",
2358            None,
2359        ));
2360        for i in 1..=5 {
2361            nm.on_request_will_be_sent(&make_request_will_be_sent(
2362                "s1",
2363                &format!("https://cdn.example.com/{i}.js"),
2364                "Script",
2365                Some(&format!("https://cdn.example.com/{}.js", i - 1)),
2366            ));
2367        }
2368
2369        assert!(
2370            drain_too_many_redirects(&mut nm).is_none(),
2371            "sub-resource redirect chains must never be capped"
2372        );
2373    }
2374}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs