chromiumoxide/handler/
network.rs

1#[cfg(any(feature = "adblock", feature = "firewall"))]
2use super::blockers::block_websites::block_ads;
3use super::blockers::{
4    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
5    xhr::IGNORE_XHR_ASSETS,
6};
7use crate::auth::Credentials;
8#[cfg(feature = "_cache")]
9use crate::cache::BasicCachePolicy;
10use crate::cmd::CommandChain;
11use crate::handler::http::HttpRequest;
12use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
13use aho_corasick::AhoCorasick;
14use case_insensitive_string::CaseInsensitiveString;
15use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
16use chromiumoxide_cdp::cdp::browser_protocol::network::{
17    EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
18    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
19    InitiatorType, InterceptionId, NetworkConditions, RequestId, ResourceType, Response,
20    SetCacheDisabledParams, SetExtraHttpHeadersParams,
21};
22use chromiumoxide_cdp::cdp::browser_protocol::{
23    fetch::{
24        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
25        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
26    },
27    network::SetBypassServiceWorkerParams,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::{
30    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
31};
32use chromiumoxide_types::{Command, Method, MethodId};
33use hashbrown::{HashMap, HashSet};
34use lazy_static::lazy_static;
35use reqwest::header::PROXY_AUTHORIZATION;
36use spider_network_blocker::intercept_manager::NetworkInterceptManager;
37pub use spider_network_blocker::scripts::{
38    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
39};
40use std::borrow::Cow;
41use std::collections::VecDeque;
42use std::time::{Duration, Instant};
43
lazy_static! {
    /// Substrings of script names/URLs for popular libraries and resources.
    ///
    /// Compiled into [`ALLOWED_MATCHER`]; a URL matches when it contains any entry.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",  // Covers Wordpress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/",
        "https://google.com/recaptcha/api.js",
        "https://www.gstatic.com/recaptcha/",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://api.leminnow.com/captcha/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://client-api.arkoselabs.com/",
        "https://www.capy.me/puzzle/",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];

    /// Determine if a script should be rendered in the browser by name.
    ///
    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
    /// but we keep it for compatibility and other call sites.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// URL substrings of verified third-party scripts (captcha / challenge providers,
    /// payment scripts, CDNs).
    ///
    /// Compiled into [`ALLOWED_MATCHER_3RD_PARTY`]; a URL matches when it contains any entry.
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.capy.me/puzzle/",
        "https://www.gstatic.com/recaptcha/",
        "https://google.com/recaptcha/",
        "https://www.google.com/recaptcha/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://newassets.hcaptcha.com/",
        // NOTE(review): exact duplicate of the `captcha.px-cloud.net` entry earlier in this list.
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://api.leminnow.com/captcha/",
        "https://client-api.arkoselabs.com/",
        // NOTE(review): already covered by the broader `static.geetest.com/` entry below.
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "discourse-cdn.com/",
        "hcaptcha.com",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];

    /// Determine if a third-party script should be rendered in the browser by URL.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path fragments that identify a JS framework's asset directory.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
            "_astro/", "_app/immutable"
        }
    };

    /// Content types (MIME strings) to ignore: archives, documents, images, video and audio.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names to ignore for visual content: images, media and fonts.
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Networking-only resource type names to ignore: CSP violation reports and pings.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Ping",
    };

    /// Case-insensitive `css` extension used for stylesheet matching.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// The default init command chain: the default network `EnableParams` only.
    ///
    /// Empty if the params fail to serialize.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// The init command chain that additionally ignores certificate errors.
    ///
    /// Contains the default network `EnableParams` followed by
    /// `SetIgnoreCertificateErrorsParams::new(true)`; a command that fails to
    /// serialize is left out.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// Params that enable Fetch interception: auth requests are handled and
    /// every URL (`*`) is paused at the `Request` stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
        .build()
    };
}
229
/// Report whether `status` is one of the HTTP redirect codes this module
/// follows: 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
234
/// How long a buffered `requests_will_be_sent` / `request_id_to_interception_id`
/// entry may linger before `evict_stale_entries` drops it. 30 seconds is
/// generous — the CDP round-trip that reconciles the two racing events normally
/// completes in milliseconds.
const STALE_BUFFER_SECS: u64 = 30;

/// How long an in-flight request entry (`requests` map) can live without
/// being resolved by a `loadingFinished` / `loadingFailed` / `loadingCanceled`
/// event before `evict_stale_entries` considers it orphaned and drops it.
/// Longer than the race-condition buffer timeout because real requests can
/// legitimately take tens of seconds (streaming, slow origins, etc.).
const STALE_REQUEST_SECS: u64 = 120;
247
/// Newtype over a shared `adblock::Engine` so the engine can sit inside
/// structs that derive `Debug`.
#[cfg(feature = "adblock")]
pub struct AdblockEngine(std::sync::Arc<adblock::Engine>);

#[cfg(feature = "adblock")]
impl std::fmt::Debug for AdblockEngine {
    /// Prints only the type name; the engine's contents are not rendered.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        formatter.write_str("AdblockEngine")
    }
}

#[cfg(feature = "adblock")]
impl std::ops::Deref for AdblockEngine {
    type Target = adblock::Engine;

    /// Borrow the wrapped engine.
    fn deref(&self) -> &Self::Target {
        self.0.as_ref()
    }
}
266
#[derive(Debug)]
/// The base network manager.
///
/// Holds the per-page request bookkeeping, the interception / blocking
/// configuration, and the queue of `NetworkEvent`s drained through `poll()`.
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    /// Entries older than `STALE_BUFFER_SECS` are evicted to prevent unbounded growth.
    requests_will_be_sent: HashMap<RequestId, (EventRequestWillBeSent, Instant)>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    /// Entries older than `STALE_BUFFER_SECS` are evicted to prevent unbounded growth.
    request_id_to_interception_id: HashMap<RequestId, (InterceptionId, Instant)>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; interception is actually enabled/disabled by
    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is currently enabled in CDP.
    ///
    /// This is the "actual state" flag that reflects whether we have sent `Fetch.enable` or
    /// `Fetch.disable` to the browser. It is updated by `update_protocol_request_interception()`
    /// when `user_request_interception_enabled` or `credentials` change.
    pub(crate) protocol_request_interception_enabled: bool,
    /// The network is offline.
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    /// Ignore visuals (no pings, prefetching, and etc).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// When `block_stylesheets` would skip a stylesheet, allow it through if
    /// the request URL is first-party (registrable domain matches the page's
    /// primary frame). Default `true` so SPAs that load their own CSS via
    /// dynamic imports still hydrate when callers pass `block_stylesheets`
    /// for bandwidth. Set `false` to strictly block ALL stylesheets.
    pub allow_first_party_stylesheets: bool,
    /// When a downstream blocker (intercept_manager / adblock / blocklists)
    /// would skip a script, allow it through if the request URL is
    /// first-party. Default `true` so SPA bootloaders are not collateral
    /// damage from third-party tracker rules.
    pub allow_first_party_javascript: bool,
    /// When `ignore_visuals` would skip an image/media/font, allow it through
    /// if the request URL is first-party. Default `true` so first-party
    /// image-driven SPA renderers (gallery code-splits, font-blocking
    /// hydration) still complete when callers pass `ignore_visuals`.
    pub allow_first_party_visuals: bool,
    /// Block analytics requests. Defaults to `true` in `new()`.
    pub block_analytics: bool,
    /// Block pre-fetch requests. Defaults to `true` in `new()`.
    pub block_prefetch: bool,
    /// Only load HTML.
    /// NOTE(review): presumably every non-document resource is blocked when set —
    /// confirm against the interception logic.
    pub only_html: bool,
    /// Is xml document?
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the amount of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    /// Cap on main-frame Document redirect hops before the navigation is aborted.
    ///
    /// `None` disables enforcement (default, preserves prior behavior). When `Some(n)`,
    /// the (n+1)th Document redirect short-circuits: a synthetic `RequestFailed` event
    /// is emitted with `failure_text = "net::ERR_TOO_MANY_REDIRECTS"` and
    /// `Page.stopLoading` is dispatched to abort in-flight navigation. The accumulated
    /// `redirect_chain` is preserved on the failed request so consumers can inspect it.
    pub max_redirects: Option<usize>,
    #[cfg(feature = "_cache")]
    /// The cache site_key to use.
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
    whitelist_matcher: Option<AhoCorasick>,
    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
    blacklist_patterns: Vec<String>,
    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
    blacklist_matcher: Option<AhoCorasick>,
    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
    blacklist_strict: bool,
    /// When true, push the interception policy (flags + per-job
    /// blacklist/whitelist + page url) to a capable remote engine once per
    /// navigation via `Interception.setPolicy`, so it can resolve block/allow
    /// decisions locally instead of round-tripping each `Fetch.requestPaused`.
    /// Default `false` — a real Chrome target 200-OK-ignores the unknown
    /// method, and an engine that does not implement it simply keeps the
    /// round-trip path, so this is safe to leave on. Carries no engine-
    /// specific data: only this manager's existing config fields are
    /// serialized; the per-job lists travel as the opaque strings the caller
    /// already supplied.
    remote_local_policy: bool,
    /// Custom adblock engine built from user-supplied filter rules.
    /// When `Some`, takes precedence over the global default engine.
    #[cfg(feature = "adblock")]
    adblock_engine: Option<AdblockEngine>,
}
431
432impl NetworkManager {
433    /// A new network manager.
434    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
435        Self {
436            queued_events: Default::default(),
437            ignore_httpserrors,
438            requests: Default::default(),
439            requests_will_be_sent: Default::default(),
440            extra_headers: Default::default(),
441            request_id_to_interception_id: Default::default(),
442            user_cache_disabled: false,
443            attempted_authentications: Default::default(),
444            credentials: None,
445            block_all: false,
446            user_request_interception_enabled: false,
447            protocol_request_interception_enabled: false,
448            offline: false,
449            request_timeout,
450            ignore_visuals: false,
451            block_javascript: false,
452            block_stylesheets: false,
453            allow_first_party_stylesheets: true,
454            allow_first_party_javascript: true,
455            allow_first_party_visuals: true,
456            block_prefetch: true,
457            block_analytics: true,
458            only_html: false,
459            xml_document: false,
460            intercept_manager: NetworkInterceptManager::Unknown,
461            document_reload_tracker: 0,
462            document_target_url: String::new(),
463            document_target_domain: String::new(),
464            whitelist_patterns: Vec::new(),
465            whitelist_matcher: None,
466            blacklist_patterns: Vec::new(),
467            blacklist_matcher: None,
468            blacklist_strict: true,
469            remote_local_policy: false,
470            max_bytes_allowed: None,
471            max_redirects: None,
472            #[cfg(feature = "_cache")]
473            cache_site_key: None,
474            #[cfg(feature = "_cache")]
475            cache_policy: None,
476            #[cfg(feature = "adblock")]
477            adblock_engine: None,
478        }
479    }
480
/// Install a custom adblock engine built from user-supplied filter rules,
/// replacing any engine set earlier.
#[cfg(feature = "adblock")]
pub fn set_adblock_engine(&mut self, engine: std::sync::Arc<adblock::Engine>) {
    let wrapped = AdblockEngine(engine);
    self.adblock_engine = Some(wrapped);
}
486
487    /// Replace the whitelist patterns (compiled once).
488    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
489    where
490        I: IntoIterator<Item = S>,
491        S: Into<String>,
492    {
493        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
494        self.rebuild_whitelist_matcher();
495    }
496
497    /// Replace the blacklist patterns (compiled once).
498    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
499    where
500        I: IntoIterator<Item = S>,
501        S: Into<String>,
502    {
503        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
504        self.rebuild_blacklist_matcher();
505    }
506
507    /// Add one pattern (cheap) and rebuild (call this sparingly).
508    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
509        self.blacklist_patterns.push(pattern.into());
510        self.rebuild_blacklist_matcher();
511    }
512
513    /// Add many patterns and rebuild once.
514    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
515    where
516        I: IntoIterator<Item = S>,
517        S: Into<String>,
518    {
519        self.blacklist_patterns
520            .extend(patterns.into_iter().map(Into::into));
521        self.rebuild_blacklist_matcher();
522    }
523
524    /// Clear blacklist entirely.
525    pub fn clear_blacklist(&mut self) {
526        self.blacklist_patterns.clear();
527        self.blacklist_matcher = None;
528    }
529
/// Control precedence between the blacklist and the allow rules.
///
/// When `strict` is `true` the blacklist always wins and cannot be
/// overridden by the whitelist or the third-party allow list.
pub fn set_blacklist_strict(&mut self, strict: bool) {
    self.blacklist_strict = strict;
}
534
535    #[inline]
536    fn rebuild_blacklist_matcher(&mut self) {
537        if self.blacklist_patterns.is_empty() {
538            self.blacklist_matcher = None;
539            return;
540        }
541
542        self.blacklist_matcher =
543            AhoCorasick::new(self.blacklist_patterns.iter().map(|s| s.as_str())).ok();
544    }
545
546    #[inline]
547    fn is_blacklisted(&self, url: &str) -> bool {
548        self.blacklist_matcher
549            .as_ref()
550            .map(|m| m.is_match(url))
551            .unwrap_or(false)
552    }
553
554    /// Add one pattern (cheap) and rebuild (call this sparingly).
555    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
556        self.whitelist_patterns.push(pattern.into());
557        self.rebuild_whitelist_matcher();
558    }
559
560    /// Add many patterns and rebuild once.
561    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
562    where
563        I: IntoIterator<Item = S>,
564        S: Into<String>,
565    {
566        self.whitelist_patterns
567            .extend(patterns.into_iter().map(Into::into));
568        self.rebuild_whitelist_matcher();
569    }
570
571    #[inline]
572    fn rebuild_whitelist_matcher(&mut self) {
573        if self.whitelist_patterns.is_empty() {
574            self.whitelist_matcher = None;
575            return;
576        }
577
578        // If building fails (shouldn’t for simple patterns), just disable matcher.
579        self.whitelist_matcher =
580            AhoCorasick::new(self.whitelist_patterns.iter().map(|s| s.as_str())).ok();
581    }
582
583    #[inline]
584    fn is_whitelisted(&self, url: &str) -> bool {
585        self.whitelist_matcher
586            .as_ref()
587            .map(|m| m.is_match(url))
588            .unwrap_or(false)
589    }
590
591    /// Commands to init the chain with.
592    pub fn init_commands(&self) -> CommandChain {
593        let cmds = if self.ignore_httpserrors {
594            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
595        } else {
596            INIT_CHAIN.clone()
597        };
598        CommandChain::new(cmds, self.request_timeout)
599    }
600
601    /// Push the CDP request.
602    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
603        let method = cmd.identifier();
604        if let Ok(params) = serde_json::to_value(cmd) {
605            self.queued_events
606                .push_back(NetworkEvent::SendCdpRequest((method, params)));
607        }
608    }
609
/// Take the next event to handle.
///
/// Removes and returns the oldest queued `NetworkEvent`, or `None` when the
/// queue is empty.
pub fn poll(&mut self) -> Option<NetworkEvent> {
    self.queued_events.pop_front()
}
614
615    /// Evict stale entries from the race-condition buffers and from
616    /// `attempted_authentications`. Call this periodically (e.g. from the
617    /// handler's eviction tick) so that lost CDP events cannot cause unbounded
618    /// map growth.
619    pub fn evict_stale_entries(&mut self, now: Instant) {
620        let cutoff = now - Duration::from_secs(STALE_BUFFER_SECS);
621
622        self.requests_will_be_sent.retain(|_, (_, ts)| *ts > cutoff);
623        self.request_id_to_interception_id
624            .retain(|_, (_, ts)| *ts > cutoff);
625
626        // Evict orphaned in-flight requests whose completion events
627        // (`loadingFinished` / `loadingFailed` / `loadingCanceled`) were
628        // never received.  Uses a longer timeout than the race-condition
629        // buffers since real requests can legitimately be long-lived.
630        let request_cutoff = now - Duration::from_secs(STALE_REQUEST_SECS);
631        self.requests
632            .retain(|_, req| req.created_at > request_cutoff);
633
634        // `attempted_authentications` entries reference interception IDs that
635        // are cleaned up on loading-finished / loading-failed. If those events
636        // are lost, the set grows forever. Cross-reference with `requests`:
637        // any interception ID that no longer appears in a live request is stale.
638        if !self.attempted_authentications.is_empty() {
639            let live: HashSet<&str> = self
640                .requests
641                .values()
642                .filter_map(|r| r.interception_id.as_ref().map(|id| id.as_ref()))
643                .collect();
644            self.attempted_authentications
645                .retain(|id| live.contains(id.as_ref()));
646        }
647    }
648
/// Borrow the extra HTTP headers currently stored on the manager, as last
/// set through `set_extra_headers`.
pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
    &self.extra_headers
}
653
654    /// Set extra HTTP headers.
655    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
656        self.extra_headers = headers;
657        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
658        self.extra_headers.remove("Proxy-Authorization");
659        if !self.extra_headers.is_empty() {
660            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
661                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
662            }
663        }
664    }
665
/// Queue a `SetBypassServiceWorkerParams` command.
///
/// NOTE: despite the method name, the argument is forwarded unchanged as the
/// *bypass* flag, so `true` asks the browser to bypass service workers.
pub fn set_service_worker_enabled(&mut self, bypass: bool) {
    self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
}
669
/// Turn the hard kill-switch that blocks all network traffic on or off.
///
/// Only updates the flag; no CDP command is queued here.
pub fn set_block_all(&mut self, block_all: bool) {
    self.block_all = block_all;
}
673
/// Enable/disable pushing the interception policy to a capable remote
/// engine (see [`NetworkManager::remote_local_policy`]).
///
/// Only updates the flag; the policy itself is sent by `emit_request_policy`.
pub fn set_remote_local_policy(&mut self, enabled: bool) {
    self.remote_local_policy = enabled;
}
679
680    /// Serialize the current interception configuration into the
681    /// `Interception.setPolicy` params. Pure serialization of fields this
682    /// manager already holds — no engine-specific logic, no list contents
683    /// beyond the caller-supplied per-job substrings.
684    fn request_policy_params(&self) -> serde_json::Value {
685        serde_json::json!({
686            "version": 1,
687            "enabled": true,
688            "flags": {
689                "blockAll": self.block_all,
690                "blockVisuals": self.ignore_visuals,
691                "blockStylesheets": self.block_stylesheets,
692                "blockJavascript": self.block_javascript,
693                "blockAnalytics": self.block_analytics,
694                "blockAds": self.block_ads_enabled(),
695                "blockPrefetch": self.block_prefetch,
696                "onlyHtml": self.only_html,
697                "blacklistStrict": self.blacklist_strict,
698                "allowFirstPartyStylesheets": self.allow_first_party_stylesheets,
699                "allowFirstPartyJavascript": self.allow_first_party_javascript,
700                "allowFirstPartyVisuals": self.allow_first_party_visuals,
701            },
702            "blacklist": self.blacklist_patterns,
703            "whitelist": self.whitelist_patterns,
704            // Discriminant only — a non-default manager signals the engine to
705            // keep the round-trip (the manager's detection stays here).
706            "interceptManager": format!("{:?}", self.intercept_manager),
707            "pageUrl": self.document_target_url,
708        })
709    }
710
711    /// Ad blocking is governed by the `firewall` build (the `block_websites`
712    /// /`detect_ad` path). When that feature is off this manager performs no
713    /// ad blocking, so the pushed policy must report `false` to stay in parity.
714    #[inline]
715    fn block_ads_enabled(&self) -> bool {
716        cfg!(feature = "firewall")
717    }
718
719    /// Push the one-shot `Interception.setPolicy` to the remote engine when
720    /// enabled and Fetch interception is active. No-op otherwise (default), so
721    /// callers that never opt in pay nothing and behavior is unchanged.
722    pub fn emit_request_policy(&mut self) {
723        if !self.remote_local_policy || !self.protocol_request_interception_enabled {
724            return;
725        }
726        let params = self.request_policy_params();
727        self.queued_events.push_back(NetworkEvent::SendCdpRequest((
728            Cow::Borrowed("Interception.setPolicy"),
729            params,
730        )));
731    }
732
733    pub fn set_request_interception(&mut self, enabled: bool) {
734        self.user_request_interception_enabled = enabled;
735        self.update_protocol_request_interception();
736    }
737
738    pub fn set_cache_enabled(&mut self, enabled: bool) {
739        let run = self.user_cache_disabled == enabled;
740        self.user_cache_disabled = !enabled;
741        if run {
742            self.update_protocol_cache_disabled();
743        }
744    }
745
746    /// Enable fetch interception.
747    pub fn enable_request_intercept(&mut self) {
748        self.protocol_request_interception_enabled = true;
749    }
750
751    /// Disable fetch interception.
752    pub fn disable_request_intercept(&mut self) {
753        self.protocol_request_interception_enabled = false;
754    }
755
756    /// Set the cache site key.
757    #[cfg(feature = "_cache")]
758    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
759        self.cache_site_key = cache_site_key;
760    }
761
762    /// Set the cache policy.
763    #[cfg(feature = "_cache")]
764    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
765        self.cache_policy = cache_policy;
766    }
767
768    pub fn update_protocol_cache_disabled(&mut self) {
769        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
770    }
771
772    pub fn authenticate(&mut self, credentials: Credentials) {
773        self.credentials = Some(credentials);
774        self.update_protocol_request_interception();
775        self.protocol_request_interception_enabled = true;
776    }
777
778    fn update_protocol_request_interception(&mut self) {
779        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
780
781        if enabled == self.protocol_request_interception_enabled {
782            return;
783        }
784
785        if enabled {
786            self.push_cdp_request(ENABLE_FETCH.clone())
787        } else {
788            self.push_cdp_request(DisableParams::default())
789        }
790    }
791
792    /// Blocklist-only script blocking.
793    /// Returns true only when the URL matches an explicit blocklist condition.
794    #[inline]
795    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
796        // If analytics blocking is off, skip all analytics tries.
797        let block_analytics = self.block_analytics;
798
799        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
800        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
801        {
802            return true;
803        }
804
805        // 2) Custom website block list (explicit).
806        if crate::handler::blockers::block_websites::block_website(url) {
807            return true;
808        }
809
810        // 3) Path-based explicit tries / fallbacks.
811        //
812        // We run these on:
813        // - path with leading slash ("/js/app.js")
814        // - path without leading slash ("js/app.js")
815        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
816        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
817            // Remove query/fragment so matching stays stable.
818            let p_slash = Self::strip_query_fragment(path_with_slash);
819            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
820
821            // Basename for filename-only lists.
822            let base = match p_slash.rsplit('/').next() {
823                Some(b) => b,
824                None => p_slash,
825            };
826
827            // ---- Trie checks ----
828            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
829            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
830                return true;
831            }
832            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
833                return true;
834            }
835            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
836                return true;
837            }
838
839            // Base-path ignore tries (framework noise / known ignorable script paths).
840            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
841            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
842                return true;
843            }
844
845            // Style path ignores only when visuals are ignored.
846            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
847                return true;
848            }
849        }
850
851        false
852    }
853
854    /// Extract the absolute URL path portion WITH the leading slash.
855    ///
856    /// Example:
857    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
858    #[inline]
859    fn url_path_with_leading_slash(url: &str) -> Option<&str> {
860        // find scheme separator
861        let bytes = url.as_bytes();
862        let idx = memchr::memmem::find(bytes, b"//")?;
863        let after_slashes = idx + 2;
864
865        // find first slash after host
866        let slash_rel = memchr::memchr(b'/', &bytes[after_slashes..])?;
867        let slash_idx = after_slashes + slash_rel;
868
869        if slash_idx < url.len() {
870            Some(&url[slash_idx..])
871        } else {
872            None
873        }
874    }
875
876    /// Strip query string and fragment from a path-ish string.
877    ///
878    /// Example:
879    /// - "/a/b.js?x=1#y" -> "/a/b.js"
880    #[inline]
881    fn strip_query_fragment(s: &str) -> &str {
882        match memchr::memchr2(b'?', b'#', s.as_bytes()) {
883            Some(i) => &s[..i],
884            None => s,
885        }
886    }
887
888    /// Determine if the request should be skipped.
889    #[inline]
890    fn skip_xhr(
891        &self,
892        skip_networking: bool,
893        event: &EventRequestPaused,
894        network_event: bool,
895    ) -> bool {
896        // XHR check
897        if !skip_networking && network_event {
898            let request_url = event.request.url.as_str();
899
900            // check if part of ignore scripts.
901            let skip_analytics =
902                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
903
904            if skip_analytics {
905                true
906            } else if self.block_stylesheets || self.ignore_visuals {
907                let block_css = self.block_stylesheets;
908                let block_media = self.ignore_visuals;
909
910                let mut block_request = false;
911
912                if let Some(position) = memchr::memrchr(b'.', request_url.as_bytes()) {
913                    let hlen = request_url.len();
914                    let has_asset = hlen - position;
915
916                    if has_asset >= 3 {
917                        let next_position = position + 1;
918
919                        if block_media
920                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
921                                &request_url[next_position..].into(),
922                            )
923                        {
924                            block_request = true;
925                        } else if block_css {
926                            block_request = CaseInsensitiveString::from(
927                                &request_url.as_bytes()[next_position..],
928                            )
929                            .contains(&**CSS_EXTENSION)
930                        }
931                    }
932                }
933
934                if !block_request {
935                    block_request = ignore_script_xhr_media(request_url);
936                }
937
938                block_request
939            } else {
940                skip_networking
941            }
942        } else {
943            skip_networking
944        }
945    }
946
947    #[cfg(feature = "adblock")]
948    #[inline]
949    /// Detect if ad enabled.
950    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
951        if skip_networking {
952            true
953        } else {
954            block_ads(&event.request.url) || self.detect_ad(event)
955        }
956    }
957
958    /// When adblock feature is disabled, this is a no-op.
959    #[cfg(not(feature = "adblock"))]
960    #[inline]
961    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
962        use crate::handler::blockers::block_websites::block_ads;
963        if skip_networking {
964            true
965        } else {
966            block_ads(&event.request.url)
967        }
968    }
969
970    #[inline]
971    /// Fail request
972    fn fail_request_blocked(
973        &mut self,
974        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
975    ) {
976        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
977            request_id.clone(),
978            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
979        );
980        self.push_cdp_request(params);
981    }
982
983    #[inline]
984    /// Fulfill request
985    fn fulfill_request_empty_200(
986        &mut self,
987        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
988    ) {
989        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
990            request_id.clone(),
991            200,
992        );
993        self.push_cdp_request(params);
994    }
995
996    #[cfg(feature = "_cache")]
997    #[inline]
998    /// Fulfill a paused Fetch request from cached bytes + header map.
999    ///
1000    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
1001    fn fulfill_request_from_cache(
1002        &mut self,
1003        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
1004        body: &[u8],
1005        headers: &std::collections::HashMap<String, String>,
1006        status: i64,
1007    ) {
1008        use crate::cdp::browser_protocol::fetch::HeaderEntry;
1009        use crate::handler::network::fetch::FulfillRequestParams;
1010        use base64::Engine;
1011
1012        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
1013
1014        for (k, v) in headers.iter() {
1015            resp_headers.push(HeaderEntry {
1016                name: k.clone(),
1017                value: v.clone(),
1018            });
1019        }
1020
1021        let mut params = FulfillRequestParams::new(request_id.clone(), status);
1022
1023        // TODO: have this already encoded prior.
1024        params.body = Some(
1025            base64::engine::general_purpose::STANDARD
1026                .encode(body)
1027                .into(),
1028        );
1029
1030        params.response_headers = Some(resp_headers);
1031
1032        self.push_cdp_request(params);
1033    }
1034
1035    #[inline]
1036    /// Continue the request url.
1037    fn continue_request_with_url(
1038        &mut self,
1039        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
1040        url: Option<&str>,
1041        intercept_response: bool,
1042    ) {
1043        let mut params = ContinueRequestParams::new(request_id.clone());
1044        if let Some(url) = url {
1045            params.url = Some(url.to_string());
1046            params.intercept_response = Some(intercept_response);
1047        }
1048        self.push_cdp_request(params);
1049    }
1050
    /// On fetch request paused interception.
    ///
    /// Runs the blocking decision tree for one paused request and queues
    /// exactly one response command, except in the first case below.
    ///
    /// Outcomes visible in this function:
    /// - returns without queuing anything when both
    ///   `user_request_interception_enabled` and
    ///   `protocol_request_interception_enabled` are set;
    /// - fails the request with `BlockedByClient` when `block_all` is set;
    /// - fulfills the request with an empty 200 when the decision tree ends
    ///   with `skip_networking == true`;
    /// - with the `_cache` feature, fulfills from the session cache when an
    ///   entry exists and the configured policy allows it;
    /// - otherwise continues the request, rewriting the URL when a redirect
    ///   mask was repaired.
    ///
    /// The checks are order-dependent: later allow rules (third-party
    /// allow-list, whitelist, first-party allow, legacy stylesheet allow) can
    /// clear a skip set by earlier rules, and a strict blacklist match is
    /// applied last.
    #[inline]
    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
        // No response command is queued in this case.
        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
            return;
        }

        if self.block_all {
            tracing::debug!(
                "Blocked (block_all): {:?} - {}",
                event.resource_type,
                event.request.url
            );
            return self.fail_request_blocked(&event.request_id);
        }

        // Capture the CDP initiator type (set by Chrome on
        // `Network.requestWillBeSent`) before consuming the cached event.
        // Used by the legacy stylesheet heuristic below as an additive
        // fallback alongside the new `allow_first_party_*` flags — keeping
        // both keeps 2.48.2's third-party-with-unknown-initiator stylesheet
        // pass-through bug-compatible (strict superset of allowed traffic).
        // Cheap clone of an `Option<InitiatorType>` enum (no allocation).
        let initiator_type: Option<InitiatorType> = event
            .network_id
            .as_ref()
            .and_then(|nid| self.requests_will_be_sent.get(nid.as_ref()))
            .map(|(rwbs, _)| rwbs.initiator.r#type.clone());

        // Pair this paused request with its `requestWillBeSent` event if that
        // one was already seen; otherwise remember the interception id (with
        // a timestamp) under the network id.
        if let Some(network_id) = event.network_id.as_ref() {
            if let Some((request_will_be_sent, _)) =
                self.requests_will_be_sent.remove(network_id.as_ref())
            {
                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
            } else {
                self.request_id_to_interception_id.insert(
                    network_id.clone(),
                    (event.request_id.clone().into(), Instant::now()),
                );
            }
        }

        // From here on, we handle the full decision tree.
        let javascript_resource = event.resource_type == ResourceType::Script;
        let document_resource = event.resource_type == ResourceType::Document;
        let network_resource =
            !document_resource && crate::utils::is_data_resource(&event.resource_type);

        // Start with static / cheap skip checks.
        // NOTE(review): `block_all` already returned above, so the
        // `self.block_all ||` term is always false at this point.
        let mut skip_networking =
            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());

        // NOTE(review): this skips Prefetch requests when `block_prefetch` is
        // *false*; the flag name suggests the opposite polarity — confirm the
        // intended semantics before changing.
        if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
            skip_networking = true;
        }

        // Also short-circuit if we've reloaded this document too many times.
        if !skip_networking {
            skip_networking = self.document_reload_tracker >= 3;
        }

        // Handle document redirect / masking and track xml documents.
        let (current_url_cow, had_replacer) =
            self.handle_document_replacement_and_tracking(event, document_resource);

        let current_url: &str = current_url_cow.as_ref();

        let blacklisted = self.is_blacklisted(current_url);

        // Non-strict blacklist: applied now, so later allow rules may still
        // clear it. The strict variant is applied at the very end instead.
        if !self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if !skip_networking {
            // Allow XSL for sitemap XML.
            if self.xml_document && current_url.ends_with(".xsl") {
                skip_networking = false;
            } else {
                skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
            }
        }

        // Skip ad detection for the user-requested top-level Document and
        // every step of its redirect chain. The crawler explicitly targets
        // this URL — fulfilling-empty-200 a page just because its host
        // matches an ad classifier breaks the user's intent (you can
        // legitimately want to scrape an ad page). Reproduced on
        // https://logrocket.com/careers, where the firewall ad list
        // flagged the host and chromey emitted a 17-byte stub for the
        // document; downstream sub-resources (script/img/iframe/etc.)
        // remain subject to ad blocking through the rest of the tree.
        //
        // Signals in short-circuit order (cheap → expensive):
        //   1. `redirected_request_id.is_some()` — explicit redirect hop
        //   2. `had_replacer` — chromey's masked-URL repair path
        //   3. `document_target_url.is_empty()` — very first nav, tracker
        //      not yet populated
        //   4. URL equality against the target — last because string
        //      compare is the only non-O(1) op (`handle_document_
        //      replacement_and_tracking` above just set the target to
        //      the current url, so this is the always-true fallback)
        //
        // Sub-resources (Script/Image/Font/Stylesheet/XHR/iframe content)
        // remain subject to ad blocking through the rest of the tree.
        let is_main_document_request = document_resource
            && (event.redirected_request_id.is_some()
                || had_replacer
                || self.document_target_url.is_empty()
                || event.request.url == self.document_target_url);
        if !is_main_document_request {
            skip_networking = self.detect_ad_if_enabled(event, skip_networking);
        }

        // Ignore embedded scripts, tracker stylesheets, and tracker images when only_html or ignore_visuals is set.
        if !skip_networking
            && self.block_javascript
            && (self.only_html || self.ignore_visuals)
            && (javascript_resource
                || document_resource
                || event.resource_type == ResourceType::Stylesheet
                || event.resource_type == ResourceType::Image)
        {
            skip_networking = ignore_script_embedded(current_url);
        }

        // Script policy: allow-by-default.
        // Block only if explicit block list patterns match.
        if !skip_networking && javascript_resource {
            skip_networking = self.should_block_script_blocklist_only(current_url);
        }

        // XHR / data resources.
        skip_networking = self.skip_xhr(skip_networking, event, network_resource);

        // Custom interception layer.
        if !skip_networking && (javascript_resource || network_resource || document_resource) {
            skip_networking = self.intercept_manager.intercept_detection(
                current_url,
                self.ignore_visuals,
                network_resource,
            );
        }

        // Custom website block list.
        if !skip_networking && (javascript_resource || network_resource) {
            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
        }

        // whitelist 3rd party
        // not required unless explicit blocking.
        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
        {
            skip_networking = false;
        }

        // check if the url is in the whitelist.
        if skip_networking && self.is_whitelisted(current_url) {
            skip_networking = false;
        }

        // First-party allow (default ON for stylesheets/javascript/visuals).
        //
        // `block_stylesheets` and `ignore_visuals` were originally coarse
        // "drop all" bandwidth optimizations, but modern SPAs (React/Next.js
        // with dynamic `import()`, AppFabric, requirejs-style loaders, etc.)
        // gate hydration on the `load` event of resources they themselves
        // load — blocking those leaves outer_html_bytes capturing only the
        // pre-hydration shell. To stay flexible without regressing the
        // bandwidth case, we use registrable-domain (eTLD+1) matching:
        // when a request is first-party to the page's primary frame, the
        // corresponding `allow_first_party_*` flag (default `true`) lets it
        // through; third-party requests still hit the original block path.
        //
        // Set the matching `allow_first_party_*` flag to `false` to restore
        // the strict "block ALL of this resource type" semantics.
        if skip_networking && !self.document_target_domain.is_empty() {
            let allow = match event.resource_type {
                ResourceType::Stylesheet => self.allow_first_party_stylesheets,
                ResourceType::Script => self.allow_first_party_javascript,
                _ if IGNORE_VISUAL_RESOURCE_MAP.contains(event.resource_type.as_ref()) => {
                    self.allow_first_party_visuals
                }
                _ => false,
            };
            if allow && self.is_first_party_url(current_url) {
                skip_networking = false;
            }
        }

        // Legacy stylesheet allow (kept as additive fallback for strict
        // bug-compat with chromey 2.48.2). For parser-dispatched
        // <link rel="stylesheet"> Chrome routinely fires
        // `Fetch.requestPaused` *before* the companion
        // `Network.requestWillBeSent`, so `initiator_type` is `None`; this
        // rescues those even on cross-origin CDNs where the eTLD+1 differs
        // from the page (e.g. intuit.com → intuitcdn.net). Tracker CSS
        // injected by JS that runs after parser yield carries
        // `Some(InitiatorType::Script)` and stays blocked here.
        //
        // Gated on `allow_first_party_stylesheets=true` so callers who opt
        // out of first-party allow get a strict "block ALL stylesheets"
        // semantics with no surprises from the heuristic side-channel.
        if skip_networking
            && self.allow_first_party_stylesheets
            && self.block_stylesheets
            && event.resource_type == ResourceType::Stylesheet
            && !matches!(initiator_type, Some(InitiatorType::Script))
        {
            skip_networking = false;
        }

        // Strict blacklist wins over every allow rule above.
        if self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if skip_networking {
            tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
            self.fulfill_request_empty_200(&event.request_id);
        } else {
            #[cfg(feature = "_cache")]
            {
                if let (Some(policy), Some(cache_site_key)) =
                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
                {
                    // Cache entries are keyed by "<METHOD>:<url>"; this
                    // shadows `current_url` only inside this block.
                    let current_url = format!("{}:{}", event.request.method, &current_url);

                    if let Some((res, cache_policy)) =
                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
                    {
                        if policy.allows_cached(&cache_policy) {
                            tracing::debug!(
                                "Remote Cached: {:?} - {}",
                                &event.resource_type,
                                &current_url
                            );
                            let flat_headers = crate::http::headers_from_multi(&res.headers);
                            return self.fulfill_request_from_cache(
                                &event.request_id,
                                &res.body,
                                &flat_headers,
                                res.status as i64,
                            );
                        }
                    }
                }
            }

            // check our frame cache for the run.
            tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
            // The URL override is only sent when the redirect mask was
            // repaired; `continue_request_with_url` ignores the last argument
            // when no URL is passed.
            self.continue_request_with_url(
                &event.request_id,
                if had_replacer {
                    Some(current_url)
                } else {
                    None
                },
                !had_replacer,
            );
        }
    }
1311
1312    /// Shared "visuals + basic blocking" logic.
1313    ///
1314    /// IMPORTANT: Scripts are NOT blocked here anymore.
1315    /// Scripts are allowed by default and only blocked via explicit blocklists
1316    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1317    #[inline]
1318    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1319        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1320            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1321    }
1322
1323    /// Does the network manager have a target domain?
1324    pub fn has_target_domain(&self) -> bool {
1325        !self.document_target_url.is_empty()
1326    }
1327
1328    /// True when `url`'s registrable domain matches the page's primary
1329    /// frame. Empty `document_target_domain` (no nav yet, or a redirect
1330    /// reset) returns `false` so we don't accidentally treat every URL
1331    /// as first-party.
1332    #[inline]
1333    fn is_first_party_url(&self, url: &str) -> bool {
1334        if self.document_target_domain.is_empty() {
1335            return false;
1336        }
1337        match host_and_rest(url) {
1338            Some((host, _)) => base_domain_from_host(host) == self.document_target_domain,
1339            None => false,
1340        }
1341    }
1342
1343    /// Set the target page url for tracking.
1344    pub fn set_page_url(&mut self, page_target_url: String) {
1345        let host_base = host_and_rest(&page_target_url)
1346            .map(|(h, _)| base_domain_from_host(h))
1347            .unwrap_or("");
1348
1349        self.document_target_domain = host_base.to_string();
1350        self.document_target_url = page_target_url;
1351
1352        // Re-push the policy on navigation so the remote engine's first-party
1353        // origin tracks the new page. No-op unless `remote_local_policy` is on
1354        // and interception is active.
1355        self.emit_request_policy();
1356    }
1357
1358    /// Clear the initial target domain on every navigation.
1359    pub fn clear_target_domain(&mut self) {
1360        self.document_reload_tracker = 0;
1361        self.document_target_url = Default::default();
1362        self.document_target_domain = Default::default();
1363    }
1364
1365    /// Handles:
1366    /// - document reload tracking (`document_reload_tracker`)
1367    /// - redirect masking / replacement
1368    /// - xml document detection (`xml_document`)
1369    /// - `document_target_url` updates
1370    ///
1371    /// Returns (current_url, had_replacer).
1372    #[inline]
1373    fn handle_document_replacement_and_tracking<'a>(
1374        &mut self,
1375        event: &'a EventRequestPaused,
1376        document_resource: bool,
1377    ) -> (Cow<'a, str>, bool) {
1378        let mut replacer: Option<String> = None;
1379        let current_url = event.request.url.as_str();
1380
1381        if document_resource {
1382            if self.document_target_url == current_url {
1383                self.document_reload_tracker += 1;
1384            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1385            {
1386                let (http_document_replacement, mut https_document_replacement) =
1387                    if self.document_target_url.starts_with("http://") {
1388                        (
1389                            self.document_target_url.replacen("http://", "http//", 1),
1390                            self.document_target_url.replacen("http://", "https://", 1),
1391                        )
1392                    } else {
1393                        (
1394                            self.document_target_url.replacen("https://", "https//", 1),
1395                            self.document_target_url.replacen("https://", "http://", 1),
1396                        )
1397                    };
1398
1399                // Track trailing slash to restore later.
1400                let trailing = https_document_replacement.ends_with('/');
1401                if trailing {
1402                    https_document_replacement.pop();
1403                }
1404                if https_document_replacement.ends_with('/') {
1405                    https_document_replacement.pop();
1406                }
1407
1408                let redirect_mask = format!(
1409                    "{}{}",
1410                    https_document_replacement, http_document_replacement
1411                );
1412
1413                if current_url == redirect_mask {
1414                    replacer = Some(if trailing {
1415                        format!("{}/", https_document_replacement)
1416                    } else {
1417                        https_document_replacement
1418                    });
1419                }
1420            }
1421
1422            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1423                self.xml_document = true;
1424            }
1425
1426            // Track last seen document URL.
1427            self.document_target_url = event.request.url.clone();
1428            self.document_target_domain = host_and_rest(&self.document_target_url)
1429                .map(|(h, _)| base_domain_from_host(h).to_string())
1430                .unwrap_or_default();
1431        }
1432
1433        let current_url_cow = match replacer {
1434            Some(r) => Cow::Owned(r),
1435            None => Cow::Borrowed(event.request.url.as_str()),
1436        };
1437
1438        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1439        (current_url_cow, had_replacer)
1440    }
1441
1442    /// Perform a page intercept for chrome using the adblock engine.
1443    /// Uses the custom engine when user-supplied filter rules are configured,
1444    /// otherwise falls back to the global default engine with built-in patterns.
1445    #[cfg(feature = "adblock")]
1446    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1447        use adblock::{
1448            lists::{FilterSet, ParseOptions, RuleTypes},
1449            Engine,
1450        };
1451
1452        lazy_static::lazy_static! {
1453            static ref AD_ENGINE: Engine = {
1454                let mut filter_set = FilterSet::new(false);
1455                let mut rules = ParseOptions::default();
1456                rules.rule_types = RuleTypes::All;
1457
1458                filter_set.add_filters(
1459                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1460                    rules,
1461                );
1462
1463                // When adblock_easylist is enabled, EasyList + EasyPrivacy are
1464                // embedded at build time for zero-cost runtime loading.
1465                #[cfg(feature = "adblock_easylist")]
1466                {
1467                    static EASYLIST: &str = include_str!(concat!(env!("OUT_DIR"), "/easylist.txt"));
1468                    static EASYPRIVACY: &str = include_str!(concat!(env!("OUT_DIR"), "/easyprivacy.txt"));
1469
1470                    if !EASYLIST.is_empty() {
1471                        filter_set.add_filter_list(EASYLIST, rules);
1472                    }
1473                    if !EASYPRIVACY.is_empty() {
1474                        filter_set.add_filter_list(EASYPRIVACY, rules);
1475                    }
1476                }
1477
1478                Engine::from_filter_set(filter_set, true)
1479            };
1480        }
1481
1482        let blockable = event.resource_type == ResourceType::Script
1483            || event.resource_type == ResourceType::Image
1484            || event.resource_type == ResourceType::Media
1485            || event.resource_type == ResourceType::Stylesheet
1486            || event.resource_type == ResourceType::Document
1487            || event.resource_type == ResourceType::Fetch
1488            || event.resource_type == ResourceType::Xhr;
1489
1490        if !blockable {
1491            return false;
1492        }
1493
1494        let u = &event.request.url;
1495
1496        let source_domain = if self.document_target_domain.is_empty() {
1497            "example.com"
1498        } else {
1499            &self.document_target_domain
1500        };
1501
1502        // Fast hostname extraction without full URL parsing.
1503        // preparsed(url, request_hostname, source_hostname, type, third_party)
1504        let hostname = u
1505            .strip_prefix("https://")
1506            .or_else(|| u.strip_prefix("http://"))
1507            .and_then(|rest| rest.split('/').next())
1508            // Strip userinfo (user:pass@) if present.
1509            .map(
1510                |authority| match memchr::memrchr(b'@', authority.as_bytes()) {
1511                    Some(i) => &authority[i + 1..],
1512                    None => authority,
1513                },
1514            )
1515            // Strip port (:8080) if present.
1516            .and_then(|host_port| host_port.split(':').next())
1517            .unwrap_or(source_domain);
1518
1519        let resource_type_str = match event.resource_type {
1520            ResourceType::Script => "script",
1521            ResourceType::Image => "image",
1522            ResourceType::Media => "media",
1523            ResourceType::Stylesheet => "stylesheet",
1524            ResourceType::Document => "document",
1525            ResourceType::Fetch => "fetch",
1526            ResourceType::Xhr => "xhr",
1527            _ => "other",
1528        };
1529
1530        let request = adblock::request::Request::preparsed(
1531            u,
1532            hostname,
1533            source_domain,
1534            resource_type_str,
1535            !event.request.is_same_site.unwrap_or_default(),
1536        );
1537
1538        let engine: &Engine = match self.adblock_engine.as_ref() {
1539            Some(custom) => custom,
1540            None => &AD_ENGINE,
1541        };
1542
1543        engine.check_network_request(&request).matched
1544    }
1545
1546    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1547        let response = if self
1548            .attempted_authentications
1549            .contains(event.request_id.as_ref())
1550        {
1551            AuthChallengeResponseResponse::CancelAuth
1552        } else if self.credentials.is_some() {
1553            self.attempted_authentications
1554                .insert(event.request_id.clone().into());
1555            AuthChallengeResponseResponse::ProvideCredentials
1556        } else {
1557            AuthChallengeResponseResponse::Default
1558        };
1559
1560        let mut auth = AuthChallengeResponse::new(response);
1561        if let Some(creds) = self.credentials.clone() {
1562            auth.username = Some(creds.username);
1563            auth.password = Some(creds.password);
1564        }
1565        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1566    }
1567
1568    /// Set the page offline network emulation condition.
1569    pub fn set_offline_mode(&mut self, value: bool) {
1570        if self.offline == value {
1571            return;
1572        }
1573        self.offline = value;
1574        if let Ok(condition) = NetworkConditions::builder()
1575            .url_pattern("")
1576            .latency(0)
1577            .download_throughput(-1.)
1578            .upload_throughput(-1.)
1579            .build()
1580        {
1581            if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1582                .offline(self.offline)
1583                .matched_network_condition(condition)
1584                .build()
1585            {
1586                self.push_cdp_request(network);
1587            }
1588        }
1589    }
1590
1591    /// Request interception doesn't happen for data URLs with Network Service.
1592    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1593        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1594            if let Some((interception_id, _)) = self
1595                .request_id_to_interception_id
1596                .remove(event.request_id.as_ref())
1597            {
1598                self.on_request(event, Some(interception_id));
1599            } else {
1600                self.requests_will_be_sent
1601                    .insert(event.request_id.clone(), (event.clone(), Instant::now()));
1602            }
1603        } else {
1604            self.on_request(event, None);
1605        }
1606    }
1607
1608    /// The request was served from the cache.
1609    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1610        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1611            request.from_memory_cache = true;
1612        }
1613    }
1614
1615    /// On network response received.
1616    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1617        let mut request_failed = false;
1618
1619        // Track how many bytes we actually deducted from this target.
1620        let mut deducted: u64 = 0;
1621
1622        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1623            let before = *max_bytes;
1624
1625            // encoded_data_length -> saturating cast to u64
1626            let received_bytes: u64 = event.response.encoded_data_length as u64;
1627
1628            // Safe parse of Content-Length
1629            let content_length: Option<u64> = event
1630                .response
1631                .headers
1632                .inner()
1633                .get("content-length")
1634                .and_then(|v| v.as_str())
1635                .and_then(|s| s.trim().parse::<u64>().ok());
1636
1637            // Deduct what we actually received
1638            *max_bytes = max_bytes.saturating_sub(received_bytes);
1639
1640            // If the declared size can't fit, zero out now
1641            if let Some(cl) = content_length {
1642                if cl > *max_bytes {
1643                    *max_bytes = 0;
1644                }
1645            }
1646
1647            request_failed = *max_bytes == 0;
1648
1649            // Compute exact delta deducted on this event
1650            deducted = before.saturating_sub(*max_bytes);
1651        }
1652
1653        // Bubble up the deduction (even if request continues)
1654        if deducted > 0 {
1655            self.queued_events
1656                .push_back(NetworkEvent::BytesConsumed(deducted));
1657        }
1658
1659        // block all network request moving forward.
1660        if request_failed && self.max_bytes_allowed.is_some() {
1661            self.set_block_all(true);
1662        }
1663
1664        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1665            request.set_response(event.response.clone());
1666            self.queued_events.push_back(if request_failed {
1667                NetworkEvent::RequestFailed(request)
1668            } else {
1669                NetworkEvent::RequestFinished(request)
1670            });
1671        }
1672    }
1673
1674    /// On network loading finished.
1675    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1676        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1677            if let Some(interception_id) = request.interception_id.as_ref() {
1678                self.attempted_authentications
1679                    .remove(interception_id.as_ref());
1680            }
1681            self.queued_events
1682                .push_back(NetworkEvent::RequestFinished(request));
1683        }
1684    }
1685
1686    /// On network loading failed.
1687    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1688        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1689            request.failure_text = Some(event.error_text.clone());
1690            if let Some(interception_id) = request.interception_id.as_ref() {
1691                self.attempted_authentications
1692                    .remove(interception_id.as_ref());
1693            }
1694            self.queued_events
1695                .push_back(NetworkEvent::RequestFailed(request));
1696        }
1697    }
1698
1699    /// On request will be sent.
1700    fn on_request(
1701        &mut self,
1702        event: &EventRequestWillBeSent,
1703        interception_id: Option<InterceptionId>,
1704    ) {
1705        let mut redirect_chain = Vec::new();
1706        let mut redirect_location = None;
1707
1708        if let Some(redirect_resp) = &event.redirect_response {
1709            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1710                if is_redirect_status(redirect_resp.status) {
1711                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1712                        if redirect_resp.url != location {
1713                            let fixed_location = location.replace(&redirect_resp.url, "");
1714
1715                            if !fixed_location.is_empty() {
1716                                if let Some(resp) = request.response.as_mut() {
1717                                    resp.headers.0["Location"] =
1718                                        serde_json::Value::String(fixed_location.clone());
1719                                }
1720                            }
1721
1722                            redirect_location = Some(fixed_location);
1723                        }
1724                    }
1725                }
1726
1727                {
1728                    let mut redirect_resp = redirect_resp.clone();
1729
1730                    if let Some(redirect_location) = redirect_location {
1731                        if !redirect_location.is_empty() {
1732                            redirect_resp.headers.0["Location"] =
1733                                serde_json::Value::String(redirect_location);
1734                        }
1735                    }
1736
1737                    self.handle_request_redirect(&mut request, redirect_resp);
1738                }
1739
1740                redirect_chain = std::mem::take(&mut request.redirect_chain);
1741                redirect_chain.push(request);
1742            }
1743        }
1744
1745        // Redirect cap: applies only to Document-type hops and only when
1746        // `max_redirects` is set. Sub-resource chains are untouched.
1747        if let Some(cap) = self.max_redirects {
1748            let is_document = matches!(event.r#type, Some(ResourceType::Document));
1749            if is_document && redirect_chain.len() > cap {
1750                let mut failed = HttpRequest::new(
1751                    event.request_id.clone(),
1752                    event.frame_id.clone(),
1753                    interception_id,
1754                    self.user_request_interception_enabled,
1755                    redirect_chain,
1756                );
1757                failed.url = Some(event.request.url.clone());
1758                failed.method = Some(event.request.method.clone());
1759                failed.failure_text = Some("net::ERR_TOO_MANY_REDIRECTS".into());
1760                self.push_cdp_request(
1761                    chromiumoxide_cdp::cdp::browser_protocol::page::StopLoadingParams::default(),
1762                );
1763                self.queued_events
1764                    .push_back(NetworkEvent::RequestFailed(failed));
1765                return;
1766            }
1767        }
1768
1769        let request = HttpRequest::new(
1770            event.request_id.clone(),
1771            event.frame_id.clone(),
1772            interception_id,
1773            self.user_request_interception_enabled,
1774            redirect_chain,
1775        );
1776
1777        let rid = event.request_id.clone();
1778        self.queued_events
1779            .push_back(NetworkEvent::Request(rid.clone()));
1780        self.requests.insert(rid, request);
1781    }
1782
1783    /// Handle request redirect.
1784    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1785        request.set_response(response);
1786        if let Some(interception_id) = request.interception_id.as_ref() {
1787            self.attempted_authentications
1788                .remove(interception_id.as_ref());
1789        }
1790    }
1791}
1792
1793#[derive(Debug)]
1794pub enum NetworkEvent {
1795    /// Send a CDP request.
1796    SendCdpRequest((MethodId, serde_json::Value)),
1797    /// Request.
1798    Request(RequestId),
1799    /// Response
1800    Response(RequestId),
1801    /// Request failed.
1802    RequestFailed(HttpRequest),
1803    /// Request finished.
1804    RequestFinished(HttpRequest),
1805    /// Bytes consumed.
1806    BytesConsumed(u64),
1807}
1808
1809#[cfg(test)]
1810mod tests {
1811    use super::ALLOWED_MATCHER_3RD_PARTY;
1812    use crate::handler::network::NetworkManager;
1813    use std::time::Duration;
1814
1815    #[test]
1816    fn test_allowed_matcher_3rd_party() {
1817        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
1818        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
1819        assert!(
1820            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
1821            "expected Cloudflare challenge script to be allowed"
1822        );
1823
1824        // Should NOT be allowed (not in allow-list)
1825        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
1826        assert!(
1827            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
1828            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
1829        );
1830
1831        // A couple sanity checks for existing allow patterns
1832        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
1833        assert!(ALLOWED_MATCHER_3RD_PARTY
1834            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
1835        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
1836    }
1837
1838    #[test]
1839    fn test_script_allowed_by_default_when_not_blocklisted() {
1840        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1841        nm.set_page_url(
1842            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
1843        );
1844
1845        // A random script that should not match your block tries.
1846        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
1847        assert!(
1848            !nm.should_block_script_blocklist_only(ok),
1849            "expected non-blocklisted script to be allowed"
1850        );
1851    }
1852
1853    #[test]
1854    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
1855        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1856        nm.set_page_url(
1857            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
1858        );
1859
1860        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
1861        let bad = "https://cdn.example.net/js/analytics.js";
1862        assert!(
1863            nm.should_block_script_blocklist_only(bad),
1864            "expected analytics.js to be blocklisted"
1865        );
1866    }
1867
1868    #[test]
1869    fn test_allowed_matcher_3rd_party_sanity() {
1870        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
1871        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
1872        assert!(
1873            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
1874            "expected Cloudflare challenge script to be allowed"
1875        );
1876
1877        // Should NOT be allowed (not in allow-list)
1878        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
1879        assert!(
1880            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
1881            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
1882        );
1883
1884        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
1885        assert!(ALLOWED_MATCHER_3RD_PARTY
1886            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
1887        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
1888    }
1889    #[test]
1890    fn test_dynamic_blacklist_blocks_url() {
1891        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1892        nm.set_page_url("https://example.com/".to_string());
1893
1894        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
1895        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
1896        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));
1897
1898        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
1899    }
1900
1901    #[test]
1902    fn test_blacklist_strict_wins_over_whitelist() {
1903        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1904        nm.set_page_url("https://example.com/".to_string());
1905
1906        // Same URL in both lists.
1907        nm.set_blacklist_patterns(["beacon.min.js"]);
1908        nm.set_whitelist_patterns(["beacon.min.js"]);
1909
1910        nm.set_blacklist_strict(true);
1911
1912        let u = "https://static.cloudflareinsights.com/beacon.min.js";
1913        assert!(nm.is_whitelisted(u));
1914        assert!(nm.is_blacklisted(u));
1915
1916        // In strict mode, it should still be considered blocked at decision time.
1917        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
1918        assert!(nm.blacklist_strict);
1919    }
1920
1921    #[cfg(feature = "adblock")]
1922    fn make_request_paused(
1923        url: &str,
1924        resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType,
1925        is_same_site: bool,
1926    ) -> chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused {
1927        use chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused;
1928        use chromiumoxide_cdp::cdp::browser_protocol::network::{
1929            Headers, Request, RequestReferrerPolicy, ResourcePriority,
1930        };
1931
1932        EventRequestPaused {
1933            request_id: chromiumoxide_cdp::cdp::browser_protocol::network::RequestId::from(
1934                "test-req".to_string(),
1935            )
1936            .into(),
1937            request: Request {
1938                url: url.to_string(),
1939                method: "GET".to_string(),
1940                headers: Headers::new(serde_json::Value::Object(Default::default())),
1941                initial_priority: ResourcePriority::Medium,
1942                referrer_policy: RequestReferrerPolicy::NoReferrer,
1943                url_fragment: None,
1944                has_post_data: None,
1945                post_data_entries: None,
1946                mixed_content_type: None,
1947                is_link_preload: None,
1948                trust_token_params: None,
1949                is_same_site: Some(is_same_site),
1950                is_ad_related: None,
1951            },
1952            frame_id: chromiumoxide_cdp::cdp::browser_protocol::page::FrameId::from(
1953                "frame1".to_string(),
1954            ),
1955            resource_type,
1956            response_error_reason: None,
1957            response_status_code: None,
1958            response_status_text: None,
1959            response_headers: None,
1960            network_id: None,
1961            redirected_request_id: None,
1962        }
1963    }
1964
1965    #[cfg(feature = "adblock")]
1966    #[test]
1967    fn test_detect_ad_blocks_known_tracker_scripts() {
1968        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1969
1970        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1971        nm.set_page_url("https://www.wine-searcher.com/".to_string());
1972
1973        let event = make_request_paused(
1974            "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
1975            ResourceType::Script,
1976            false,
1977        );
1978
1979        assert!(
1980            nm.detect_ad(&event),
1981            "googletagmanager.com script should be detected as ad"
1982        );
1983    }
1984
1985    #[cfg(feature = "adblock")]
1986    #[test]
1987    fn test_detect_ad_allows_legitimate_scripts() {
1988        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1989
1990        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1991        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
1992
1993        let event = make_request_paused(
1994            "https://www.mylegitsite-test.com/static/js/app-bundle.js",
1995            ResourceType::Script,
1996            true,
1997        );
1998
1999        assert!(
2000            !nm.detect_ad(&event),
2001            "legitimate first-party app bundle should not be blocked"
2002        );
2003    }
2004
2005    #[cfg(feature = "adblock")]
2006    #[test]
2007    fn test_detect_ad_uses_source_domain() {
2008        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2009
2010        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2011        nm.set_page_url("https://www.wine-searcher.com/some-page".to_string());
2012
2013        assert!(
2014            !nm.document_target_domain.is_empty(),
2015            "document_target_domain should be set after set_page_url"
2016        );
2017
2018        let event = make_request_paused(
2019            "https://www.google-analytics.com/analytics.js",
2020            ResourceType::Script,
2021            false,
2022        );
2023
2024        assert!(
2025            nm.detect_ad(&event),
2026            "google-analytics.com should be blocked as tracker"
2027        );
2028    }
2029
2030    #[cfg(feature = "adblock")]
2031    #[test]
2032    fn test_custom_adblock_engine_takes_precedence() {
2033        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2034
2035        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2036        nm.set_page_url("https://example.com/".to_string());
2037
2038        // Build a custom engine with a specific rule.
2039        let mut filter_set = adblock::lists::FilterSet::new(false);
2040        let mut opts = adblock::lists::ParseOptions::default();
2041        opts.rule_types = adblock::lists::RuleTypes::All;
2042        filter_set.add_filters(["||custom-tracker.example.net^"], opts);
2043        let engine = adblock::Engine::from_filter_set(filter_set, true);
2044        nm.set_adblock_engine(std::sync::Arc::new(engine));
2045
2046        let event = make_request_paused(
2047            "https://custom-tracker.example.net/pixel.js",
2048            ResourceType::Script,
2049            false,
2050        );
2051
2052        assert!(
2053            nm.detect_ad(&event),
2054            "custom engine rule should block custom-tracker.example.net"
2055        );
2056    }
2057
2058    /// Helper: run a URL through the full `on_fetch_request_paused` pipeline
2059    /// and return whether it was blocked (true) or allowed (false).
2060    #[cfg(feature = "adblock")]
2061    fn run_full_interception(
2062        nm: &mut NetworkManager,
2063        url: &str,
2064        resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType,
2065        is_same_site: bool,
2066    ) -> bool {
2067        use super::NetworkEvent;
2068
2069        // Drain any prior events.
2070        while nm.poll().is_some() {}
2071
2072        let event = make_request_paused(url, resource_type, is_same_site);
2073        nm.on_fetch_request_paused(&event);
2074
2075        // Check what was emitted: Fetch.fulfillRequest = blocked, Fetch.continueRequest = allowed.
2076        let mut blocked = false;
2077        while let Some(ev) = nm.poll() {
2078            if let NetworkEvent::SendCdpRequest((method, _)) = &ev {
2079                let m: &str = method.as_ref();
2080                if m == "Fetch.fulfillRequest" || m == "Fetch.failRequest" {
2081                    blocked = true;
2082                }
2083            }
2084        }
2085        blocked
2086    }
2087
2088    // ── End-to-end interception tests ───────────────────────────────────
2089
2090    #[cfg(feature = "adblock")]
2091    #[test]
2092    fn test_e2e_tracker_script_blocked() {
2093        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2094
2095        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2096        nm.set_page_url("https://www.wine-searcher.com/".to_string());
2097
2098        assert!(
2099            run_full_interception(
2100                &mut nm,
2101                "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
2102                ResourceType::Script,
2103                false,
2104            ),
2105            "GTM script should be blocked through full pipeline"
2106        );
2107    }
2108
2109    #[cfg(feature = "adblock")]
2110    #[test]
2111    fn test_e2e_legitimate_script_allowed() {
2112        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2113
2114        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2115        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
2116
2117        assert!(
2118            !run_full_interception(
2119                &mut nm,
2120                "https://www.mylegitsite-test.com/static/js/app-bundle.js",
2121                ResourceType::Script,
2122                true,
2123            ),
2124            "legitimate first-party script should be allowed through full pipeline"
2125        );
2126    }
2127
2128    #[cfg(feature = "adblock")]
2129    #[test]
2130    fn test_e2e_analytics_xhr_blocked() {
2131        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2132
2133        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2134        nm.set_page_url("https://example.org/".to_string());
2135
2136        assert!(
2137            run_full_interception(
2138                &mut nm,
2139                "https://www.google-analytics.com/g/collect?v=2&tid=UA-123",
2140                ResourceType::Xhr,
2141                false,
2142            ),
2143            "Google Analytics XHR should be blocked through full pipeline"
2144        );
2145    }
2146
2147    #[cfg(feature = "adblock")]
2148    #[test]
2149    fn test_e2e_whitelisted_overrides_adblock() {
2150        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2151
2152        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2153        nm.set_page_url("https://example.org/".to_string());
2154        nm.set_whitelist_patterns(["googletagmanager.com"]);
2155
2156        // GTM would normally be blocked by adblock, but whitelist overrides.
2157        assert!(
2158            !run_full_interception(
2159                &mut nm,
2160                "https://www.googletagmanager.com/gtm.js?id=GTM-TEST",
2161                ResourceType::Script,
2162                false,
2163            ),
2164            "whitelisted tracker should be allowed even when adblock would block it"
2165        );
2166    }
2167
2168    #[cfg(feature = "adblock")]
2169    #[test]
2170    fn test_e2e_blacklist_strict_overrides_whitelist() {
2171        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2172
2173        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2174        nm.set_page_url("https://example.org/".to_string());
2175        nm.set_blacklist_patterns(["cdn.example.net/evil.js"]);
2176        nm.set_whitelist_patterns(["cdn.example.net/evil.js"]);
2177        nm.set_blacklist_strict(true);
2178
2179        assert!(
2180            run_full_interception(
2181                &mut nm,
2182                "https://cdn.example.net/evil.js",
2183                ResourceType::Script,
2184                false,
2185            ),
2186            "strict blacklist should win over whitelist"
2187        );
2188    }
2189
2190    #[cfg(feature = "adblock")]
2191    #[test]
2192    fn test_e2e_first_party_stylesheet_passes_when_block_stylesheets_on() {
2193        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2194
2195        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2196        nm.set_page_url("https://developer.intuit.com/".to_string());
2197        nm.block_stylesheets = true;
2198
2199        assert!(
2200            !run_full_interception(
2201                &mut nm,
2202                "https://developer.intuit.com/static/app.css",
2203                ResourceType::Stylesheet,
2204                true,
2205            ),
2206            "first-party CSS must pass when allow_first_party_stylesheets default-true"
2207        );
2208    }
2209
2210    #[cfg(feature = "adblock")]
2211    #[test]
2212    fn test_e2e_first_party_stylesheet_blocked_when_allow_disabled() {
2213        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2214
2215        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2216        nm.set_page_url("https://developer.intuit.com/".to_string());
2217        nm.block_stylesheets = true;
2218        nm.allow_first_party_stylesheets = false;
2219
2220        assert!(
2221            run_full_interception(
2222                &mut nm,
2223                "https://developer.intuit.com/static/app.css",
2224                ResourceType::Stylesheet,
2225                true,
2226            ),
2227            "first-party CSS must be blocked when allow_first_party_stylesheets=false"
2228        );
2229    }
2230
2231    #[cfg(feature = "adblock")]
2232    #[test]
2233    fn test_e2e_third_party_stylesheet_still_blocked_with_default_allow() {
2234        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2235
2236        // requestWillBeSent fired with `initiator.type = "script"` —
2237        // disqualifies the legacy heuristic fallback. Default first-party
2238        // allow is on but this URL is third-party, so it should still block.
2239        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2240        nm.set_page_url("https://developer.intuit.com/".to_string());
2241        nm.block_stylesheets = true;
2242        // Required for `on_request_will_be_sent` to actually cache the
2243        // RWBS event into `requests_will_be_sent` (otherwise it dispatches
2244        // straight to `on_request` and the initiator lookup misses).
2245        nm.protocol_request_interception_enabled = true;
2246
2247        let rwbs_url = "https://tracker.evil.example/track.css";
2248        let rwbs_json = serde_json::json!({
2249            "requestId": "tp-css-1",
2250            "loaderId": "test-loader",
2251            "documentURL": "https://developer.intuit.com/",
2252            "request": {
2253                "url": rwbs_url,
2254                "method": "GET",
2255                "headers": {},
2256                "initialPriority": "Medium",
2257                "referrerPolicy": "no-referrer"
2258            },
2259            "timestamp": 0.0,
2260            "wallTime": 0.0,
2261            "initiator": { "type": "script" },
2262            "redirectHasExtraInfo": false,
2263            "type": "Stylesheet",
2264            "frameId": "frame1"
2265        });
2266        let rwbs_event: chromiumoxide_cdp::cdp::browser_protocol::network::EventRequestWillBeSent =
2267            serde_json::from_value(rwbs_json).unwrap();
2268        nm.on_request_will_be_sent(&rwbs_event);
2269
2270        // Use the same requestId in the requestPaused event so the
2271        // initiator capture finds the cached RWBS entry.
2272        use super::NetworkEvent;
2273        while nm.poll().is_some() {}
2274        let mut paused_event = make_request_paused(rwbs_url, ResourceType::Stylesheet, false);
2275        paused_event.network_id = Some(
2276            chromiumoxide_cdp::cdp::browser_protocol::network::RequestId::from(
2277                "tp-css-1".to_string(),
2278            ),
2279        );
2280        nm.on_fetch_request_paused(&paused_event);
2281
2282        let mut blocked = false;
2283        while let Some(ev) = nm.poll() {
2284            if let NetworkEvent::SendCdpRequest((method, _)) = &ev {
2285                let m: &str = method.as_ref();
2286                if m == "Fetch.fulfillRequest" || m == "Fetch.failRequest" {
2287                    blocked = true;
2288                }
2289            }
2290        }
2291        assert!(
2292            blocked,
2293            "third-party Script-initiated CSS must remain blocked"
2294        );
2295    }
2296
2297    #[cfg(feature = "adblock")]
2298    #[test]
2299    fn test_e2e_first_party_image_passes_when_ignore_visuals_on() {
2300        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2301
2302        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2303        nm.set_page_url("https://shop.example/".to_string());
2304        nm.ignore_visuals = true;
2305
2306        assert!(
2307            !run_full_interception(
2308                &mut nm,
2309                "https://shop.example/img/hero.png",
2310                ResourceType::Image,
2311                true,
2312            ),
2313            "first-party image must pass when allow_first_party_visuals default-true"
2314        );
2315    }
2316
2317    #[cfg(feature = "adblock")]
2318    #[test]
2319    fn test_e2e_third_party_image_blocked_when_ignore_visuals_on() {
2320        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2321
2322        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2323        nm.set_page_url("https://shop.example/".to_string());
2324        nm.ignore_visuals = true;
2325
2326        assert!(
2327            run_full_interception(
2328                &mut nm,
2329                "https://cdn.thirdparty.io/banner.png",
2330                ResourceType::Image,
2331                false,
2332            ),
2333            "third-party image must remain blocked when ignore_visuals=true"
2334        );
2335    }
2336
2337    #[cfg(feature = "adblock")]
2338    #[test]
2339    fn test_e2e_first_party_document_not_blocked() {
2340        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2341
2342        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2343        nm.set_page_url("https://www.nytimes.com/".to_string());
2344
2345        assert!(
2346            !run_full_interception(
2347                &mut nm,
2348                "https://www.nytimes.com/2024/article.html",
2349                ResourceType::Document,
2350                true,
2351            ),
2352            "first-party document navigation should never be blocked"
2353        );
2354    }
2355
2356    #[cfg(feature = "adblock")]
2357    #[test]
2358    fn test_e2e_custom_engine_blocks_through_pipeline() {
2359        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2360
2361        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2362        nm.set_page_url("https://mysite.com/".to_string());
2363
2364        let mut filter_set = adblock::lists::FilterSet::new(false);
2365        let mut opts = adblock::lists::ParseOptions::default();
2366        opts.rule_types = adblock::lists::RuleTypes::All;
2367        filter_set.add_filters(["||evil-cdn.example.net^$script"], opts);
2368        let engine = adblock::Engine::from_filter_set(filter_set, true);
2369        nm.set_adblock_engine(std::sync::Arc::new(engine));
2370
2371        assert!(
2372            run_full_interception(
2373                &mut nm,
2374                "https://evil-cdn.example.net/tracker.js",
2375                ResourceType::Script,
2376                false,
2377            ),
2378            "custom engine rule should block through full pipeline"
2379        );
2380
2381        // Legitimate script on the same site should still pass.
2382        assert!(
2383            !run_full_interception(
2384                &mut nm,
2385                "https://mysite.com/app.js",
2386                ResourceType::Script,
2387                true,
2388            ),
2389            "first-party script should still be allowed with custom engine"
2390        );
2391    }
2392
2393    #[cfg(feature = "adblock")]
2394    #[test]
2395    fn test_e2e_ad_image_blocked() {
2396        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2397
2398        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2399        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
2400
2401        // Ad tracking pixel should be blocked via adblock pattern or trie.
2402        assert!(
2403            run_full_interception(
2404                &mut nm,
2405                "https://googleads.g.doubleclick.net/pagead/viewthroughconversion/123/?random=456",
2406                ResourceType::Image,
2407                false,
2408            ),
2409            "doubleclick ad image/tracking pixel should be blocked"
2410        );
2411
2412        // Legitimate first-party image should pass.
2413        assert!(
2414            !run_full_interception(
2415                &mut nm,
2416                "https://www.mylegitsite-test.com/images/logo.png",
2417                ResourceType::Image,
2418                true,
2419            ),
2420            "legitimate first-party image should not be blocked"
2421        );
2422    }
2423
2424    #[cfg(feature = "adblock")]
2425    #[test]
2426    fn test_e2e_hostname_with_userinfo() {
2427        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2428
2429        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2430        nm.set_page_url("https://example.org/".to_string());
2431
2432        // URL with userinfo should still correctly identify googletagmanager.com.
2433        assert!(
2434            run_full_interception(
2435                &mut nm,
2436                "https://user:pass@www.googletagmanager.com/gtm.js?id=GTM-XXXX",
2437                ResourceType::Script,
2438                false,
2439            ),
2440            "tracker URL with userinfo should still be blocked"
2441        );
2442    }
2443
2444    #[test]
2445    fn test_blacklist_non_strict_allows_whitelist_override() {
2446        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2447        nm.set_page_url("https://example.com/".to_string());
2448
2449        nm.set_blacklist_patterns(["beacon.min.js"]);
2450        nm.set_whitelist_patterns(["beacon.min.js"]);
2451
2452        nm.set_blacklist_strict(false);
2453
2454        let u = "https://static.cloudflareinsights.com/beacon.min.js";
2455        assert!(nm.is_blacklisted(u));
2456        assert!(nm.is_whitelisted(u));
2457        assert!(!nm.blacklist_strict);
2458    }
2459
2460    // ── max_redirects enforcement ───────────────────────────────────────
2461    //
2462    // The redirect cap short-circuits in NetworkManager::on_request when a
2463    // Document-type chain exceeds the configured limit. We drive it via the
2464    // public on_request_will_be_sent entry point by deserializing synthetic
2465    // events — builder APIs exist but require every non-optional field, and
2466    // JSON is less fragile to cdp schema additions.
2467
2468    fn make_request_will_be_sent(
2469        request_id: &str,
2470        url: &str,
2471        resource_type: &str,
2472        redirect_from_url: Option<&str>,
2473    ) -> chromiumoxide_cdp::cdp::browser_protocol::network::EventRequestWillBeSent {
2474        let mut v = serde_json::json!({
2475            "requestId": request_id,
2476            "loaderId": "test-loader",
2477            "documentURL": url,
2478            "request": {
2479                "url": url,
2480                "method": "GET",
2481                "headers": {},
2482                "initialPriority": "Medium",
2483                "referrerPolicy": "no-referrer"
2484            },
2485            "timestamp": 0.0,
2486            "wallTime": 0.0,
2487            "initiator": { "type": "other" },
2488            "redirectHasExtraInfo": false,
2489            "type": resource_type,
2490            "frameId": "frame1"
2491        });
2492        if let Some(from) = redirect_from_url {
2493            v["redirectResponse"] = serde_json::json!({
2494                "url": from,
2495                "status": 302,
2496                "statusText": "Found",
2497                "headers": { "Location": url },
2498                "mimeType": "text/html",
2499                "charset": "",
2500                "connectionReused": false,
2501                "connectionId": 0.0,
2502                "encodedDataLength": 0.0,
2503                "securityState": "unknown"
2504            });
2505        }
2506        serde_json::from_value(v).expect("EventRequestWillBeSent should deserialize")
2507    }
2508
2509    fn drain_too_many_redirects(nm: &mut NetworkManager) -> Option<super::HttpRequest> {
2510        while let Some(ev) = nm.poll() {
2511            if let super::NetworkEvent::RequestFailed(req) = ev {
2512                if req.failure_text.as_deref() == Some("net::ERR_TOO_MANY_REDIRECTS") {
2513                    return Some(req);
2514                }
2515            }
2516        }
2517        None
2518    }
2519
2520    fn drain_stop_loading(nm: &mut NetworkManager) -> bool {
2521        while let Some(ev) = nm.poll() {
2522            if let super::NetworkEvent::SendCdpRequest((method, _)) = ev {
2523                let m: &str = method.as_ref();
2524                if m == "Page.stopLoading" {
2525                    return true;
2526                }
2527            }
2528        }
2529        false
2530    }
2531
2532    #[test]
2533    fn test_max_redirects_none_allows_unlimited_chain() {
2534        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2535        // max_redirects left at its default (None).
2536
2537        // 10 sequential Document hops sharing the same request_id.
2538        nm.on_request_will_be_sent(&make_request_will_be_sent(
2539            "r1",
2540            "https://example.com/0",
2541            "Document",
2542            None,
2543        ));
2544        for i in 1..10 {
2545            nm.on_request_will_be_sent(&make_request_will_be_sent(
2546                "r1",
2547                &format!("https://example.com/{i}"),
2548                "Document",
2549                Some(&format!("https://example.com/{}", i - 1)),
2550            ));
2551        }
2552
2553        assert!(
2554            drain_too_many_redirects(&mut nm).is_none(),
2555            "no cap set: chain of 10 hops must not emit ERR_TOO_MANY_REDIRECTS"
2556        );
2557    }
2558
2559    #[test]
2560    fn test_max_redirects_caps_document_chain() {
2561        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2562        nm.max_redirects = Some(3);
2563
2564        // Initial request + 4 redirect hops. The 4th redirect (chain length 4 > 3)
2565        // must trip the cap.
2566        nm.on_request_will_be_sent(&make_request_will_be_sent(
2567            "r1",
2568            "https://example.com/0",
2569            "Document",
2570            None,
2571        ));
2572        for i in 1..=4 {
2573            nm.on_request_will_be_sent(&make_request_will_be_sent(
2574                "r1",
2575                &format!("https://example.com/{i}"),
2576                "Document",
2577                Some(&format!("https://example.com/{}", i - 1)),
2578            ));
2579        }
2580
2581        let failed = drain_too_many_redirects(&mut nm)
2582            .expect("cap of 3 on a 4-hop chain must emit ERR_TOO_MANY_REDIRECTS");
2583        assert_eq!(
2584            failed.redirect_chain.len(),
2585            4,
2586            "failed request should preserve the full accumulated chain"
2587        );
2588        assert_eq!(
2589            failed.url.as_deref(),
2590            Some("https://example.com/4"),
2591            "failed request url should be the hop that tripped the cap"
2592        );
2593
2594        // Second navigation after the cap is tripped must also schedule
2595        // Page.stopLoading to actually abort the tab.
2596        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2597        nm.max_redirects = Some(3);
2598        nm.on_request_will_be_sent(&make_request_will_be_sent(
2599            "r2",
2600            "https://example.com/0",
2601            "Document",
2602            None,
2603        ));
2604        for i in 1..=4 {
2605            nm.on_request_will_be_sent(&make_request_will_be_sent(
2606                "r2",
2607                &format!("https://example.com/{i}"),
2608                "Document",
2609                Some(&format!("https://example.com/{}", i - 1)),
2610            ));
2611        }
2612        assert!(
2613            drain_stop_loading(&mut nm),
2614            "cap hit must dispatch Page.stopLoading to abort navigation"
2615        );
2616    }
2617
2618    #[test]
2619    fn test_max_redirects_ignores_subresources() {
2620        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2621        nm.max_redirects = Some(2);
2622
2623        // A 5-hop script redirect chain — sub-resources are exempt by design.
2624        nm.on_request_will_be_sent(&make_request_will_be_sent(
2625            "s1",
2626            "https://cdn.example.com/0.js",
2627            "Script",
2628            None,
2629        ));
2630        for i in 1..=5 {
2631            nm.on_request_will_be_sent(&make_request_will_be_sent(
2632                "s1",
2633                &format!("https://cdn.example.com/{i}.js"),
2634                "Script",
2635                Some(&format!("https://cdn.example.com/{}.js", i - 1)),
2636            ));
2637        }
2638
2639        assert!(
2640            drain_too_many_redirects(&mut nm).is_none(),
2641            "sub-resource redirect chains must never be capped"
2642        );
2643    }
2644}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs