Skip to main content

chromiumoxide/handler/
network.rs

1#[cfg(any(feature = "adblock", feature = "firewall"))]
2use super::blockers::block_websites::block_ads;
3use super::blockers::{
4    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
5    xhr::IGNORE_XHR_ASSETS,
6};
7use crate::auth::Credentials;
8#[cfg(feature = "_cache")]
9use crate::cache::BasicCachePolicy;
10use crate::cmd::CommandChain;
11use crate::handler::http::HttpRequest;
12use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
13use aho_corasick::AhoCorasick;
14use case_insensitive_string::CaseInsensitiveString;
15use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
16use chromiumoxide_cdp::cdp::browser_protocol::network::{
17    EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
18    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
19    InterceptionId, NetworkConditions, RequestId, ResourceType, Response, SetCacheDisabledParams,
20    SetExtraHttpHeadersParams,
21};
22use chromiumoxide_cdp::cdp::browser_protocol::{
23    fetch::{
24        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
25        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
26    },
27    network::SetBypassServiceWorkerParams,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::{
30    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
31};
32use chromiumoxide_types::{Command, Method, MethodId};
33use hashbrown::{HashMap, HashSet};
34use lazy_static::lazy_static;
35use reqwest::header::PROXY_AUTHORIZATION;
36use spider_network_blocker::intercept_manager::NetworkInterceptManager;
37pub use spider_network_blocker::scripts::{
38    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
39};
40use std::borrow::Cow;
41use std::collections::VecDeque;
42use std::time::{Duration, Instant};
43
44lazy_static! {
45    /// URL substrings for popular libraries, app bundles and verified third-party hosts (pattern list behind `ALLOWED_MATCHER`).
46    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
47        "jquery",           // Covers jquery.min.js, jquery.js, etc.
48        "angular",
49        "react",            // Covers all React-related patterns
50        "vue",              // Covers all Vue-related patterns
51        "bootstrap",
52        "d3",
53        "lodash",
54        "ajax",
55        "application",
56        "app",              // Covers general app scripts like app.js
57        "main",
58        "index",
59        "bundle",
60        "vendor",
61        "runtime",
62        "polyfill",
63        "scripts",
64        "es2015.",
65        "es2020.",
66        "webpack",
67        "captcha",
68        "client",
69        "/cdn-cgi/challenge-platform/",
70        "/wp-content/js/",  // Covers Wordpress content
71        // Verified 3rd parties for request
72        "https://m.stripe.network/",
73        "https://challenges.cloudflare.com/",
74        "https://www.google.com/recaptcha/",
75        "https://google.com/recaptcha/api.js",
76        "https://www.gstatic.com/recaptcha/",
77        "https://captcha.px-cloud.net/",
78        "https://geo.captcha-delivery.com/",
79        "https://api.leminnow.com/captcha/",
80        "https://cdn.auth0.com/js/lock/",
81        "https://captcha.gtimg.com",
82        "https://client-api.arkoselabs.com/",
83        "https://www.capy.me/puzzle/",
84        "https://newassets.hcaptcha.com/",
85        "https://cdn.auth0.com/client",
86        "https://js.stripe.com/",
87        "https://cdn.prod.website-files.com/", // webflow cdn scripts
88        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
89        "https://code.jquery.com/jquery-"
90    ];
91
92    /// Substring matcher over `JS_FRAMEWORK_ALLOW`: decides by name whether a script should be rendered in the browser.
93    ///
94    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
95    /// but we keep it for compatibility and other call sites.
96    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
97
98    /// URL substrings for verified third-party script hosts (pattern list behind `ALLOWED_MATCHER_3RD_PARTY`).
99    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
100        // Verified 3rd parties for request
101        "https://m.stripe.network/",
102        "https://challenges.cloudflare.com/",
103        "https://js.stripe.com/",
104        "https://cdn.prod.website-files.com/", // webflow cdn scripts
105        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
106        "https://code.jquery.com/jquery-",
107        "https://ct.captcha-delivery.com/",
108        "https://geo.captcha-delivery.com/",
109        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
110        "https://cdn.auth0.com/client",
111        "https://captcha.px-cloud.net/",
112        "https://www.capy.me/puzzle/",
113        "https://www.gstatic.com/recaptcha/",
114        "https://google.com/recaptcha/",
115        "https://www.google.com/recaptcha/",
116        "https://www.recaptcha.net/recaptcha/",
117        "https://js.hcaptcha.com/1/api.js",
118        "https://hcaptcha.com/1/api.js",
119        "https://js.datadome.co/tags.js",
120        "https://api-js.datadome.co/",
121        "https://client.perimeterx.net/",
122        "https://captcha.px-cdn.net/",
123        "https://newassets.hcaptcha.com/",
124        "https://captcha.px-cloud.net/", // NOTE(review): same pattern already listed earlier in this vec; redundant for substring matching
125        "https://s.perimeterx.net/",
126        "https://api.leminnow.com/captcha/",
127        "https://client-api.arkoselabs.com/",
128        "https://static.geetest.com/v4/gt4.js",
129        "https://static.geetest.com/",
130        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
131        "https://cdn.perfdrive.com/aperture/",
132        "https://assets.queue-it.net/",
133        "discourse-cdn.com/",
134        "hcaptcha.com",
135        "/cdn-cgi/challenge-platform/",
136        "/_Incapsula_Resource"
137    ];
138
139    /// Substring matcher over `JS_FRAMEWORK_ALLOW_3RD_PARTY` (verified third-party scripts).
140    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
141
142    /// Path fragments that identify JS framework asset directories.
143    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
144        phf::phf_set! {
145            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
146            "_astro/", "_app/immutable"
147        }
148    };
149
150    /// Content types to ignore: documents, archives, images, video and audio.
151    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
152        "application/pdf",
153        "application/zip",
154        "application/x-rar-compressed",
155        "application/x-tar",
156        "image/png",
157        "image/jpeg",
158        "image/gif",
159        "image/bmp",
160        "image/webp",
161        "image/svg+xml",
162        "video/mp4",
163        "video/x-msvideo",
164        "video/x-matroska",
165        "video/webm",
166        "audio/mpeg",
167        "audio/ogg",
168        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
169        "application/vnd.ms-excel",
170        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
171        "application/vnd.ms-powerpoint",
172        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
173        "application/x-7z-compressed",
174        "application/x-rpm",
175        "application/x-shockwave-flash",
176        "application/rtf",
177    };
178
179    /// Resource type names to ignore for visual content (images, media, fonts).
180    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
181        "Image",
182        "Media",
183        "Font"
184    };
185
186    /// Resource type names to ignore for networking-only requests (CSP violation reports, pings).
187    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
188        "CspViolationReport",
189        "Ping",
190    };
191
192    /// Case insensitive css matching
193    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
194
195    /// The init command chain: the serialized network `EnableParams` command (empty if serialization fails).
196    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
197        let enable = EnableParams::default();
198
199        if let Ok(c) = serde_json::to_value(&enable) {
200            vec![(enable.identifier(), c)]
201        } else {
202            vec![]
203        }
204    };
205
206    /// The init command chain that additionally sends `SetIgnoreCertificateErrorsParams::new(true)`.
207    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
208        let enable = EnableParams::default();
209        let mut v = vec![];
210        if let Ok(c) = serde_json::to_value(&enable) {
211            v.push((enable.identifier(), c));
212        }
213        let ignore = SetIgnoreCertificateErrorsParams::new(true);
214        if let Ok(ignored) = serde_json::to_value(&ignore) {
215            v.push((ignore.identifier(), ignored));
216        }
217
218        v
219    };
220
221    /// Fetch enable command: handles auth requests and matches every URL (`*`) at the request stage.
222    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
223        fetch::EnableParams::builder()
224        .handle_auth_requests(true)
225        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
226        .build()
227    };
228}
229
/// Whether `status` is one of the HTTP redirect codes 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
234
235/// Maximum age, in seconds, of a buffered `requests_will_be_sent` / `request_id_to_interception_id`
236/// entry before `evict_stale_entries` drops it. 30 seconds is generous — the CDP
237/// round-trip that reconciles the two racing events normally completes in
238/// milliseconds.
239const STALE_BUFFER_SECS: u64 = 30;
240
241/// Maximum age, in seconds, of an in-flight request entry (`requests` map) that has not
242/// been resolved by a `loadingFinished` / `loadingFailed` / `loadingCanceled`
243/// event before `evict_stale_entries` treats it as orphaned and evicts it.  Longer than the
244/// race-condition buffer timeout because real requests can legitimately take
245/// tens of seconds (streaming, slow origins, etc.).
246const STALE_REQUEST_SECS: u64 = 120;
247
/// Newtype over a shared `adblock::Engine` so that structs holding it can derive `Debug`.
#[cfg(feature = "adblock")]
pub struct AdblockEngine(std::sync::Arc<adblock::Engine>);

#[cfg(feature = "adblock")]
impl std::fmt::Debug for AdblockEngine {
    /// Prints only the type name; the wrapped engine is not inspected.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("AdblockEngine")
    }
}

#[cfg(feature = "adblock")]
impl std::ops::Deref for AdblockEngine {
    type Target = adblock::Engine;

    /// Borrow the wrapped engine.
    fn deref(&self) -> &Self::Target {
        self.0.as_ref()
    }
}
266
267#[derive(Debug)]
268/// The base network manager: tracks in-flight requests, interception state and blocking rules, and queues `NetworkEvent`s.
269pub struct NetworkManager {
270    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
271    ///
272    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
273    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
274    /// Consumers pull from this queue via `poll()`.
275    queued_events: VecDeque<NetworkEvent>,
276    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
277    ///
278    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
279    /// certificates (self-signed, expired, MITM proxies, etc.).
280    ignore_httpserrors: bool,
281    /// Active in-flight requests keyed by CDP `RequestId`.
282    ///
283    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
284    /// and final state used to emit `RequestFinished` / `RequestFailed`.
285    requests: HashMap<RequestId, HttpRequest>,
286    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
287    /// `Fetch.requestPaused` arrives later (or vice versa).
288    ///
289    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
290    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
291    /// Entries older than `STALE_BUFFER_SECS` are evicted to prevent unbounded growth.
292    requests_will_be_sent: HashMap<RequestId, (EventRequestWillBeSent, Instant)>,
293    /// Extra HTTP headers to apply to subsequent network requests via CDP.
294    ///
295    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
296    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
297    extra_headers: std::collections::HashMap<String, String>,
298    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
299    ///
300    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
301    /// store the interception id here so it can be attached to the `HttpRequest` once the
302    /// network request is observed.
303    /// Entries older than `STALE_BUFFER_SECS` are evicted to prevent unbounded growth.
304    request_id_to_interception_id: HashMap<RequestId, (InterceptionId, Instant)>,
305    /// Whether the user has disabled the browser cache.
306    ///
307    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
308    /// `set_cache_enabled()`. Internally the field is stored as “disabled” to match the CDP API.
309    user_cache_disabled: bool,
310    /// Tracks which requests have already attempted authentication.
311    ///
312    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
313    /// authentication challenges (407/401). Once a request id is present here, subsequent
314    /// challenges for the same request are canceled.
315    attempted_authentications: HashSet<RequestId>,
316    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
317    ///
318    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
319    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
320    credentials: Option<Credentials>,
321    /// User-facing toggle indicating whether request interception is desired.
322    ///
323    /// This is the “intent” flag controlled by `set_request_interception()`. On its own it does
324    /// not guarantee interception is active; interception is actually enabled/disabled by
325    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
326    ///
327    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
328    /// enabled to satisfy auth challenges.
329    pub(crate) user_request_interception_enabled: bool,
330    /// Hard kill-switch to block all network traffic.
331    ///
332    /// When `true`, the manager immediately blocks requests (typically via
333    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
334    /// and short-circuits most decision logic. This is used for safety conditions such as
335    /// exceeding `max_bytes_allowed` or other runtime protections.
336    block_all: bool,
337    /// Tracks whether the Fetch interception protocol is currently enabled in CDP.
338    ///
339    /// This is the “actual state” flag. `update_protocol_request_interception()` compares it
340    /// with the desired state before queueing `Fetch.enable` / `Fetch.disable`; the flag itself
341    /// is written by `enable_request_intercept()`, `disable_request_intercept()` and `authenticate()`.
342    pub(crate) protocol_request_interception_enabled: bool,
343    /// The network is offline.
344    offline: bool,
345    /// The page request timeout.
346    pub request_timeout: Duration,
347    // made_request: bool,
348    /// Ignore visual resources (no pings, prefetching, etc.).
349    pub ignore_visuals: bool,
350    /// Block CSS stylesheets.
351    pub block_stylesheets: bool,
352    /// Block javascript that is not critical to rendering.
353    ///
354    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
355    /// by itself (it remains for config compatibility).
356    pub block_javascript: bool,
357    /// Block analytics from rendering.
358    pub block_analytics: bool,
359    /// Block pre-fetch requests.
360    pub block_prefetch: bool,
361    /// Only allow HTML to load.
362    pub only_html: bool,
363    /// Whether the document is XML.
364    pub xml_document: bool,
365    /// The custom intercept handle logic to run on the website.
366    pub intercept_manager: NetworkInterceptManager,
367    /// Track the number of times the document reloaded.
368    pub document_reload_tracker: u8,
369    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
370    pub document_target_url: String,
371    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
372    pub document_target_domain: String,
373    /// The max bytes to receive.
374    pub max_bytes_allowed: Option<u64>,
375    #[cfg(feature = "_cache")]
376    /// The cache site_key to use.
377    pub cache_site_key: Option<String>,
378    /// The cache policy to use.
379    #[cfg(feature = "_cache")]
380    pub cache_policy: Option<BasicCachePolicy>,
381    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
382    whitelist_patterns: Vec<String>,
383    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
384    whitelist_matcher: Option<AhoCorasick>,
385    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
386    blacklist_patterns: Vec<String>,
387    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
388    blacklist_matcher: Option<AhoCorasick>,
389    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
390    blacklist_strict: bool,
391    /// Custom adblock engine built from user-supplied filter rules.
392    /// When `Some`, takes precedence over the global default engine.
393    #[cfg(feature = "adblock")]
394    adblock_engine: Option<AdblockEngine>,
395}
396
397impl NetworkManager {
398    /// A new network manager.
399    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
400        Self {
401            queued_events: Default::default(),
402            ignore_httpserrors,
403            requests: Default::default(),
404            requests_will_be_sent: Default::default(),
405            extra_headers: Default::default(),
406            request_id_to_interception_id: Default::default(),
407            user_cache_disabled: false,
408            attempted_authentications: Default::default(),
409            credentials: None,
410            block_all: false,
411            user_request_interception_enabled: false,
412            protocol_request_interception_enabled: false,
413            offline: false,
414            request_timeout,
415            ignore_visuals: false,
416            block_javascript: false,
417            block_stylesheets: false,
418            block_prefetch: true,
419            block_analytics: true,
420            only_html: false,
421            xml_document: false,
422            intercept_manager: NetworkInterceptManager::Unknown,
423            document_reload_tracker: 0,
424            document_target_url: String::new(),
425            document_target_domain: String::new(),
426            whitelist_patterns: Vec::new(),
427            whitelist_matcher: None,
428            blacklist_patterns: Vec::new(),
429            blacklist_matcher: None,
430            blacklist_strict: true,
431            max_bytes_allowed: None,
432            #[cfg(feature = "_cache")]
433            cache_site_key: None,
434            #[cfg(feature = "_cache")]
435            cache_policy: None,
436            #[cfg(feature = "adblock")]
437            adblock_engine: None,
438        }
439    }
440
441    /// Set a custom adblock engine built from user-supplied filter rules.
442    #[cfg(feature = "adblock")]
443    pub fn set_adblock_engine(&mut self, engine: std::sync::Arc<adblock::Engine>) {
444        self.adblock_engine = Some(AdblockEngine(engine));
445    }
446
447    /// Replace the whitelist patterns (compiled once).
448    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
449    where
450        I: IntoIterator<Item = S>,
451        S: Into<String>,
452    {
453        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
454        self.rebuild_whitelist_matcher();
455    }
456
457    /// Replace the blacklist patterns (compiled once).
458    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
459    where
460        I: IntoIterator<Item = S>,
461        S: Into<String>,
462    {
463        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
464        self.rebuild_blacklist_matcher();
465    }
466
467    /// Add one pattern (cheap) and rebuild (call this sparingly).
468    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
469        self.blacklist_patterns.push(pattern.into());
470        self.rebuild_blacklist_matcher();
471    }
472
473    /// Add many patterns and rebuild once.
474    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
475    where
476        I: IntoIterator<Item = S>,
477        S: Into<String>,
478    {
479        self.blacklist_patterns
480            .extend(patterns.into_iter().map(Into::into));
481        self.rebuild_blacklist_matcher();
482    }
483
484    /// Clear blacklist entirely.
485    pub fn clear_blacklist(&mut self) {
486        self.blacklist_patterns.clear();
487        self.blacklist_matcher = None;
488    }
489
490    /// Control precedence: when true, blacklist always wins.
491    pub fn set_blacklist_strict(&mut self, strict: bool) {
492        self.blacklist_strict = strict;
493    }
494
495    #[inline]
496    fn rebuild_blacklist_matcher(&mut self) {
497        if self.blacklist_patterns.is_empty() {
498            self.blacklist_matcher = None;
499            return;
500        }
501
502        self.blacklist_matcher =
503            AhoCorasick::new(self.blacklist_patterns.iter().map(|s| s.as_str())).ok();
504    }
505
506    #[inline]
507    fn is_blacklisted(&self, url: &str) -> bool {
508        self.blacklist_matcher
509            .as_ref()
510            .map(|m| m.is_match(url))
511            .unwrap_or(false)
512    }
513
514    /// Add one pattern (cheap) and rebuild (call this sparingly).
515    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
516        self.whitelist_patterns.push(pattern.into());
517        self.rebuild_whitelist_matcher();
518    }
519
520    /// Add many patterns and rebuild once.
521    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
522    where
523        I: IntoIterator<Item = S>,
524        S: Into<String>,
525    {
526        self.whitelist_patterns
527            .extend(patterns.into_iter().map(Into::into));
528        self.rebuild_whitelist_matcher();
529    }
530
531    #[inline]
532    fn rebuild_whitelist_matcher(&mut self) {
533        if self.whitelist_patterns.is_empty() {
534            self.whitelist_matcher = None;
535            return;
536        }
537
538        // If building fails (shouldn’t for simple patterns), just disable matcher.
539        self.whitelist_matcher =
540            AhoCorasick::new(self.whitelist_patterns.iter().map(|s| s.as_str())).ok();
541    }
542
543    #[inline]
544    fn is_whitelisted(&self, url: &str) -> bool {
545        self.whitelist_matcher
546            .as_ref()
547            .map(|m| m.is_match(url))
548            .unwrap_or(false)
549    }
550
551    /// Commands to init the chain with.
552    pub fn init_commands(&self) -> CommandChain {
553        let cmds = if self.ignore_httpserrors {
554            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
555        } else {
556            INIT_CHAIN.clone()
557        };
558        CommandChain::new(cmds, self.request_timeout)
559    }
560
561    /// Push the CDP request.
562    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
563        let method = cmd.identifier();
564        if let Ok(params) = serde_json::to_value(cmd) {
565            self.queued_events
566                .push_back(NetworkEvent::SendCdpRequest((method, params)));
567        }
568    }
569
570    /// The next event to handle.
571    pub fn poll(&mut self) -> Option<NetworkEvent> {
572        self.queued_events.pop_front()
573    }
574
575    /// Evict stale entries from the race-condition buffers and from
576    /// `attempted_authentications`. Call this periodically (e.g. from the
577    /// handler's eviction tick) so that lost CDP events cannot cause unbounded
578    /// map growth.
579    pub fn evict_stale_entries(&mut self, now: Instant) {
580        let cutoff = now - Duration::from_secs(STALE_BUFFER_SECS);
581
582        self.requests_will_be_sent.retain(|_, (_, ts)| *ts > cutoff);
583        self.request_id_to_interception_id
584            .retain(|_, (_, ts)| *ts > cutoff);
585
586        // Evict orphaned in-flight requests whose completion events
587        // (`loadingFinished` / `loadingFailed` / `loadingCanceled`) were
588        // never received.  Uses a longer timeout than the race-condition
589        // buffers since real requests can legitimately be long-lived.
590        let request_cutoff = now - Duration::from_secs(STALE_REQUEST_SECS);
591        self.requests
592            .retain(|_, req| req.created_at > request_cutoff);
593
594        // `attempted_authentications` entries reference interception IDs that
595        // are cleaned up on loading-finished / loading-failed. If those events
596        // are lost, the set grows forever. Cross-reference with `requests`:
597        // any interception ID that no longer appears in a live request is stale.
598        if !self.attempted_authentications.is_empty() {
599            let live: HashSet<&str> = self
600                .requests
601                .values()
602                .filter_map(|r| r.interception_id.as_ref().map(|id| id.as_ref()))
603                .collect();
604            self.attempted_authentications
605                .retain(|id| live.contains(id.as_ref()));
606        }
607    }
608
609    /// Get the extra headers.
610    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
611        &self.extra_headers
612    }
613
614    /// Set extra HTTP headers.
615    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
616        self.extra_headers = headers;
617        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
618        self.extra_headers.remove("Proxy-Authorization");
619        if !self.extra_headers.is_empty() {
620            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
621                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
622            }
623        }
624    }
625
626    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
627        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
628    }
629
630    pub fn set_block_all(&mut self, block_all: bool) {
631        self.block_all = block_all;
632    }
633
634    pub fn set_request_interception(&mut self, enabled: bool) {
635        self.user_request_interception_enabled = enabled;
636        self.update_protocol_request_interception();
637    }
638
639    pub fn set_cache_enabled(&mut self, enabled: bool) {
640        let run = self.user_cache_disabled == enabled;
641        self.user_cache_disabled = !enabled;
642        if run {
643            self.update_protocol_cache_disabled();
644        }
645    }
646
647    /// Enable fetch interception.
648    pub fn enable_request_intercept(&mut self) {
649        self.protocol_request_interception_enabled = true;
650    }
651
652    /// Disable fetch interception.
653    pub fn disable_request_intercept(&mut self) {
654        self.protocol_request_interception_enabled = false;
655    }
656
657    /// Set the cache site key.
658    #[cfg(feature = "_cache")]
659    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
660        self.cache_site_key = cache_site_key;
661    }
662
663    /// Set the cache policy.
664    #[cfg(feature = "_cache")]
665    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
666        self.cache_policy = cache_policy;
667    }
668
669    pub fn update_protocol_cache_disabled(&mut self) {
670        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
671    }
672
673    pub fn authenticate(&mut self, credentials: Credentials) {
674        self.credentials = Some(credentials);
675        self.update_protocol_request_interception();
676        self.protocol_request_interception_enabled = true;
677    }
678
679    fn update_protocol_request_interception(&mut self) {
680        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
681
682        if enabled == self.protocol_request_interception_enabled {
683            return;
684        }
685
686        if enabled {
687            self.push_cdp_request(ENABLE_FETCH.clone())
688        } else {
689            self.push_cdp_request(DisableParams::default())
690        }
691    }
692
693    /// Blocklist-only script blocking.
694    /// Returns true only when the URL matches an explicit blocklist condition.
695    #[inline]
696    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
697        // If analytics blocking is off, skip all analytics tries.
698        let block_analytics = self.block_analytics;
699
700        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
701        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
702        {
703            return true;
704        }
705
706        // 2) Custom website block list (explicit).
707        if crate::handler::blockers::block_websites::block_website(url) {
708            return true;
709        }
710
711        // 3) Path-based explicit tries / fallbacks.
712        //
713        // We run these on:
714        // - path with leading slash ("/js/app.js")
715        // - path without leading slash ("js/app.js")
716        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
717        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
718            // Remove query/fragment so matching stays stable.
719            let p_slash = Self::strip_query_fragment(path_with_slash);
720            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
721
722            // Basename for filename-only lists.
723            let base = match p_slash.rsplit('/').next() {
724                Some(b) => b,
725                None => p_slash,
726            };
727
728            // ---- Trie checks ----
729            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
730            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
731                return true;
732            }
733            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
734                return true;
735            }
736            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
737                return true;
738            }
739
740            // Base-path ignore tries (framework noise / known ignorable script paths).
741            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
742            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
743                return true;
744            }
745
746            // Style path ignores only when visuals are ignored.
747            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
748                return true;
749            }
750        }
751
752        false
753    }
754
755    /// Extract the absolute URL path portion WITH the leading slash.
756    ///
757    /// Example:
758    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
759    #[inline]
760    fn url_path_with_leading_slash(url: &str) -> Option<&str> {
761        // find scheme separator
762        let bytes = url.as_bytes();
763        let idx = memchr::memmem::find(bytes, b"//")?;
764        let after_slashes = idx + 2;
765
766        // find first slash after host
767        let slash_rel = memchr::memchr(b'/', &bytes[after_slashes..])?;
768        let slash_idx = after_slashes + slash_rel;
769
770        if slash_idx < url.len() {
771            Some(&url[slash_idx..])
772        } else {
773            None
774        }
775    }
776
777    /// Strip query string and fragment from a path-ish string.
778    ///
779    /// Example:
780    /// - "/a/b.js?x=1#y" -> "/a/b.js"
781    #[inline]
782    fn strip_query_fragment(s: &str) -> &str {
783        match memchr::memchr2(b'?', b'#', s.as_bytes()) {
784            Some(i) => &s[..i],
785            None => s,
786        }
787    }
788
789    /// Determine if the request should be skipped.
790    #[inline]
791    fn skip_xhr(
792        &self,
793        skip_networking: bool,
794        event: &EventRequestPaused,
795        network_event: bool,
796    ) -> bool {
797        // XHR check
798        if !skip_networking && network_event {
799            let request_url = event.request.url.as_str();
800
801            // check if part of ignore scripts.
802            let skip_analytics =
803                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
804
805            if skip_analytics {
806                true
807            } else if self.block_stylesheets || self.ignore_visuals {
808                let block_css = self.block_stylesheets;
809                let block_media = self.ignore_visuals;
810
811                let mut block_request = false;
812
813                if let Some(position) = memchr::memrchr(b'.', request_url.as_bytes()) {
814                    let hlen = request_url.len();
815                    let has_asset = hlen - position;
816
817                    if has_asset >= 3 {
818                        let next_position = position + 1;
819
820                        if block_media
821                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
822                                &request_url[next_position..].into(),
823                            )
824                        {
825                            block_request = true;
826                        } else if block_css {
827                            block_request = CaseInsensitiveString::from(
828                                &request_url.as_bytes()[next_position..],
829                            )
830                            .contains(&**CSS_EXTENSION)
831                        }
832                    }
833                }
834
835                if !block_request {
836                    block_request = ignore_script_xhr_media(request_url);
837                }
838
839                block_request
840            } else {
841                skip_networking
842            }
843        } else {
844            skip_networking
845        }
846    }
847
848    #[cfg(feature = "adblock")]
849    #[inline]
850    /// Detect if ad enabled.
851    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
852        if skip_networking {
853            true
854        } else {
855            block_ads(&event.request.url) || self.detect_ad(event)
856        }
857    }
858
859    /// When adblock feature is disabled, this is a no-op.
860    #[cfg(not(feature = "adblock"))]
861    #[inline]
862    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
863        use crate::handler::blockers::block_websites::block_ads;
864        if skip_networking {
865            true
866        } else {
867            block_ads(&event.request.url)
868        }
869    }
870
871    #[inline]
872    /// Fail request
873    fn fail_request_blocked(
874        &mut self,
875        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
876    ) {
877        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
878            request_id.clone(),
879            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
880        );
881        self.push_cdp_request(params);
882    }
883
884    #[inline]
885    /// Fulfill request
886    fn fulfill_request_empty_200(
887        &mut self,
888        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
889    ) {
890        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
891            request_id.clone(),
892            200,
893        );
894        self.push_cdp_request(params);
895    }
896
897    #[cfg(feature = "_cache")]
898    #[inline]
899    /// Fulfill a paused Fetch request from cached bytes + header map.
900    ///
901    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
902    fn fulfill_request_from_cache(
903        &mut self,
904        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
905        body: &[u8],
906        headers: &std::collections::HashMap<String, String>,
907        status: i64,
908    ) {
909        use crate::cdp::browser_protocol::fetch::HeaderEntry;
910        use crate::handler::network::fetch::FulfillRequestParams;
911        use base64::Engine;
912
913        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
914
915        for (k, v) in headers.iter() {
916            resp_headers.push(HeaderEntry {
917                name: k.clone().into(),
918                value: v.clone().into(),
919            });
920        }
921
922        let mut params = FulfillRequestParams::new(request_id.clone(), status);
923
924        // TODO: have this already encoded prior.
925        params.body = Some(
926            base64::engine::general_purpose::STANDARD
927                .encode(body)
928                .into(),
929        );
930
931        params.response_headers = Some(resp_headers);
932
933        self.push_cdp_request(params);
934    }
935
936    #[inline]
937    /// Continue the request url.
938    fn continue_request_with_url(
939        &mut self,
940        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
941        url: Option<&str>,
942        intercept_response: bool,
943    ) {
944        let mut params = ContinueRequestParams::new(request_id.clone());
945        if let Some(url) = url {
946            params.url = Some(url.to_string());
947            params.intercept_response = Some(intercept_response);
948        }
949        self.push_cdp_request(params);
950    }
951
    /// On fetch request paused interception.
    ///
    /// Runs the blocking decision tree for a paused request and answers it in one of
    /// four ways: fail it (`block_all`), fulfill it with an empty 200 (blocked),
    /// fulfill it from the remote session cache (feature `_cache`), or continue it,
    /// optionally with a replaced URL.
    ///
    /// The order of the checks matters: later stages only run when earlier ones have
    /// not already decided to skip, and the allow-lists near the end can undo a skip.
    #[inline]
    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
        // With both user and protocol interception enabled, this handler does nothing
        // and does not answer the paused request itself.
        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
            return;
        }

        // Hard stop: every request is failed while `block_all` is set.
        if self.block_all {
            tracing::debug!(
                "Blocked (block_all): {:?} - {}",
                event.resource_type,
                event.request.url
            );
            return self.fail_request_blocked(&event.request_id);
        }

        // Pair this paused event with its `requestWillBeSent` counterpart. Whichever
        // event arrives first is parked until the other shows up.
        if let Some(network_id) = event.network_id.as_ref() {
            if let Some((request_will_be_sent, _)) =
                self.requests_will_be_sent.remove(network_id.as_ref())
            {
                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
            } else {
                self.request_id_to_interception_id.insert(
                    network_id.clone(),
                    (event.request_id.clone().into(), Instant::now()),
                );
            }
        }

        // From here on, we handle the full decision tree.
        let javascript_resource = event.resource_type == ResourceType::Script;
        let document_resource = event.resource_type == ResourceType::Document;
        let network_resource =
            !document_resource && crate::utils::is_data_resource(&event.resource_type);

        // Start with static / cheap skip checks.
        // NOTE(review): `self.block_all` is always false here because of the early
        // return above, so only the resource-map lookup contributes.
        let mut skip_networking =
            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());

        // NOTE(review): prefetch requests are skipped when `block_prefetch` is FALSE,
        // which reads inverted relative to the field name — confirm the intended meaning.
        if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
            skip_networking = true;
        }

        // Also short-circuit if we've reloaded this document too many times.
        if !skip_networking {
            skip_networking = self.document_reload_tracker >= 3;
        }

        // Handle document redirect / masking and track xml documents.
        let (current_url_cow, had_replacer) =
            self.handle_document_replacement_and_tracking(event, document_resource);

        let current_url: &str = current_url_cow.as_ref();

        let blacklisted = self.is_blacklisted(current_url);

        // Non-strict blacklist: applied now, so the allow-lists below may still undo it.
        if !self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if !skip_networking {
            // Allow XSL for sitemap XML.
            if self.xml_document && current_url.ends_with(".xsl") {
                skip_networking = false;
            } else {
                skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
            }
        }

        skip_networking = self.detect_ad_if_enabled(event, skip_networking);

        // Ignore embedded scripts, tracker stylesheets, and tracker images when only_html or ignore_visuals is set.
        if !skip_networking
            && self.block_javascript
            && (self.only_html || self.ignore_visuals)
            && (javascript_resource
                || document_resource
                || event.resource_type == ResourceType::Stylesheet
                || event.resource_type == ResourceType::Image)
        {
            skip_networking = ignore_script_embedded(current_url);
        }

        // Script policy: allow-by-default.
        // Block only if explicit block list patterns match.
        if !skip_networking && javascript_resource {
            skip_networking = self.should_block_script_blocklist_only(current_url);
        }

        // XHR / data resources.
        skip_networking = self.skip_xhr(skip_networking, event, network_resource);

        // Custom interception layer.
        if !skip_networking && (javascript_resource || network_resource || document_resource) {
            skip_networking = self.intercept_manager.intercept_detection(
                current_url,
                self.ignore_visuals,
                network_resource,
            );
        }

        // Custom website block list.
        if !skip_networking && (javascript_resource || network_resource) {
            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
        }

        // whitelist 3rd party
        // not required unless explicit blocking.
        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
        {
            skip_networking = false;
        }

        // check if the url is in the whitelist.
        if skip_networking && self.is_whitelisted(current_url) {
            skip_networking = false;
        }

        // Strict blacklist: applied last so neither allow-list above can override it.
        if self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if skip_networking {
            // Blocked requests are answered with an empty 200 rather than failed.
            tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
            self.fulfill_request_empty_200(&event.request_id);
        } else {
            #[cfg(feature = "_cache")]
            {
                if let (Some(policy), Some(cache_site_key)) =
                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
                {
                    // Cache key is "<METHOD>:<url>"; this shadows `current_url` inside
                    // the cache block only.
                    let current_url = format!("{}:{}", event.request.method, &current_url);

                    if let Some((res, cache_policy)) =
                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
                    {
                        if policy.allows_cached(&cache_policy) {
                            tracing::debug!(
                                "Remote Cached: {:?} - {}",
                                &event.resource_type,
                                &current_url
                            );
                            let flat_headers = crate::http::headers_from_multi(&res.headers);
                            return self.fulfill_request_from_cache(
                                &event.request_id,
                                &res.body,
                                &flat_headers,
                                res.status as i64,
                            );
                        }
                    }
                }
            }

            // No cached response was used: let the request continue. The URL is only
            // overridden when the document replacement step produced a new one.
            tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
            self.continue_request_with_url(
                &event.request_id,
                if had_replacer {
                    Some(current_url)
                } else {
                    None
                },
                !had_replacer,
            );
        }
    }
1119
1120    /// Shared "visuals + basic blocking" logic.
1121    ///
1122    /// IMPORTANT: Scripts are NOT blocked here anymore.
1123    /// Scripts are allowed by default and only blocked via explicit blocklists
1124    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1125    #[inline]
1126    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1127        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1128            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1129    }
1130
1131    /// Does the network manager have a target domain?
1132    pub fn has_target_domain(&self) -> bool {
1133        !self.document_target_url.is_empty()
1134    }
1135
1136    /// Set the target page url for tracking.
1137    pub fn set_page_url(&mut self, page_target_url: String) {
1138        let host_base = host_and_rest(&page_target_url)
1139            .map(|(h, _)| base_domain_from_host(h))
1140            .unwrap_or("");
1141
1142        self.document_target_domain = host_base.to_string();
1143        self.document_target_url = page_target_url;
1144    }
1145
1146    /// Clear the initial target domain on every navigation.
1147    pub fn clear_target_domain(&mut self) {
1148        self.document_reload_tracker = 0;
1149        self.document_target_url = Default::default();
1150        self.document_target_domain = Default::default();
1151    }
1152
1153    /// Handles:
1154    /// - document reload tracking (`document_reload_tracker`)
1155    /// - redirect masking / replacement
1156    /// - xml document detection (`xml_document`)
1157    /// - `document_target_url` updates
1158    ///
1159    /// Returns (current_url, had_replacer).
1160    #[inline]
1161    fn handle_document_replacement_and_tracking<'a>(
1162        &mut self,
1163        event: &'a EventRequestPaused,
1164        document_resource: bool,
1165    ) -> (Cow<'a, str>, bool) {
1166        let mut replacer: Option<String> = None;
1167        let current_url = event.request.url.as_str();
1168
1169        if document_resource {
1170            if self.document_target_url == current_url {
1171                self.document_reload_tracker += 1;
1172            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1173            {
1174                let (http_document_replacement, mut https_document_replacement) =
1175                    if self.document_target_url.starts_with("http://") {
1176                        (
1177                            self.document_target_url.replacen("http://", "http//", 1),
1178                            self.document_target_url.replacen("http://", "https://", 1),
1179                        )
1180                    } else {
1181                        (
1182                            self.document_target_url.replacen("https://", "https//", 1),
1183                            self.document_target_url.replacen("https://", "http://", 1),
1184                        )
1185                    };
1186
1187                // Track trailing slash to restore later.
1188                let trailing = https_document_replacement.ends_with('/');
1189                if trailing {
1190                    https_document_replacement.pop();
1191                }
1192                if https_document_replacement.ends_with('/') {
1193                    https_document_replacement.pop();
1194                }
1195
1196                let redirect_mask = format!(
1197                    "{}{}",
1198                    https_document_replacement, http_document_replacement
1199                );
1200
1201                if current_url == redirect_mask {
1202                    replacer = Some(if trailing {
1203                        format!("{}/", https_document_replacement)
1204                    } else {
1205                        https_document_replacement
1206                    });
1207                }
1208            }
1209
1210            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1211                self.xml_document = true;
1212            }
1213
1214            // Track last seen document URL.
1215            self.document_target_url = event.request.url.clone();
1216            self.document_target_domain = host_and_rest(&self.document_target_url)
1217                .map(|(h, _)| base_domain_from_host(h).to_string())
1218                .unwrap_or_default();
1219        }
1220
1221        let current_url_cow = match replacer {
1222            Some(r) => Cow::Owned(r),
1223            None => Cow::Borrowed(event.request.url.as_str()),
1224        };
1225
1226        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1227        (current_url_cow, had_replacer)
1228    }
1229
1230    /// Perform a page intercept for chrome using the adblock engine.
1231    /// Uses the custom engine when user-supplied filter rules are configured,
1232    /// otherwise falls back to the global default engine with built-in patterns.
1233    #[cfg(feature = "adblock")]
1234    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1235        use adblock::{
1236            lists::{FilterSet, ParseOptions, RuleTypes},
1237            Engine,
1238        };
1239
1240        lazy_static::lazy_static! {
1241            static ref AD_ENGINE: Engine = {
1242                let mut filter_set = FilterSet::new(false);
1243                let mut rules = ParseOptions::default();
1244                rules.rule_types = RuleTypes::All;
1245
1246                filter_set.add_filters(
1247                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1248                    rules.clone(),
1249                );
1250
1251                // When adblock_easylist is enabled, EasyList + EasyPrivacy are
1252                // embedded at build time for zero-cost runtime loading.
1253                #[cfg(feature = "adblock_easylist")]
1254                {
1255                    static EASYLIST: &str = include_str!(concat!(env!("OUT_DIR"), "/easylist.txt"));
1256                    static EASYPRIVACY: &str = include_str!(concat!(env!("OUT_DIR"), "/easyprivacy.txt"));
1257
1258                    if !EASYLIST.is_empty() {
1259                        filter_set.add_filter_list(EASYLIST, rules.clone());
1260                    }
1261                    if !EASYPRIVACY.is_empty() {
1262                        filter_set.add_filter_list(EASYPRIVACY, rules);
1263                    }
1264                }
1265
1266                Engine::from_filter_set(filter_set, true)
1267            };
1268        }
1269
1270        let blockable = event.resource_type == ResourceType::Script
1271            || event.resource_type == ResourceType::Image
1272            || event.resource_type == ResourceType::Media
1273            || event.resource_type == ResourceType::Stylesheet
1274            || event.resource_type == ResourceType::Document
1275            || event.resource_type == ResourceType::Fetch
1276            || event.resource_type == ResourceType::Xhr;
1277
1278        if !blockable {
1279            return false;
1280        }
1281
1282        let u = &event.request.url;
1283
1284        let source_domain = if self.document_target_domain.is_empty() {
1285            "example.com"
1286        } else {
1287            &self.document_target_domain
1288        };
1289
1290        // Fast hostname extraction without full URL parsing.
1291        // preparsed(url, request_hostname, source_hostname, type, third_party)
1292        let hostname = u
1293            .strip_prefix("https://")
1294            .or_else(|| u.strip_prefix("http://"))
1295            .and_then(|rest| rest.split('/').next())
1296            // Strip userinfo (user:pass@) if present.
1297            .map(
1298                |authority| match memchr::memrchr(b'@', authority.as_bytes()) {
1299                    Some(i) => &authority[i + 1..],
1300                    None => authority,
1301                },
1302            )
1303            // Strip port (:8080) if present.
1304            .and_then(|host_port| host_port.split(':').next())
1305            .unwrap_or(source_domain);
1306
1307        let resource_type_str = match event.resource_type {
1308            ResourceType::Script => "script",
1309            ResourceType::Image => "image",
1310            ResourceType::Media => "media",
1311            ResourceType::Stylesheet => "stylesheet",
1312            ResourceType::Document => "document",
1313            ResourceType::Fetch => "fetch",
1314            ResourceType::Xhr => "xhr",
1315            _ => "other",
1316        };
1317
1318        let request = adblock::request::Request::preparsed(
1319            u,
1320            hostname,
1321            source_domain,
1322            resource_type_str,
1323            !event.request.is_same_site.unwrap_or_default(),
1324        );
1325
1326        let engine: &Engine = match self.adblock_engine.as_ref() {
1327            Some(custom) => custom,
1328            None => &AD_ENGINE,
1329        };
1330
1331        engine.check_network_request(&request).matched
1332    }
1333
1334    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1335        let response = if self
1336            .attempted_authentications
1337            .contains(event.request_id.as_ref())
1338        {
1339            AuthChallengeResponseResponse::CancelAuth
1340        } else if self.credentials.is_some() {
1341            self.attempted_authentications
1342                .insert(event.request_id.clone().into());
1343            AuthChallengeResponseResponse::ProvideCredentials
1344        } else {
1345            AuthChallengeResponseResponse::Default
1346        };
1347
1348        let mut auth = AuthChallengeResponse::new(response);
1349        if let Some(creds) = self.credentials.clone() {
1350            auth.username = Some(creds.username);
1351            auth.password = Some(creds.password);
1352        }
1353        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1354    }
1355
1356    /// Set the page offline network emulation condition.
1357    pub fn set_offline_mode(&mut self, value: bool) {
1358        if self.offline == value {
1359            return;
1360        }
1361        self.offline = value;
1362        if let Ok(condition) = NetworkConditions::builder()
1363            .url_pattern("")
1364            .latency(0)
1365            .download_throughput(-1.)
1366            .upload_throughput(-1.)
1367            .build()
1368        {
1369            if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1370                .offline(self.offline)
1371                .matched_network_condition(condition)
1372                .build()
1373            {
1374                self.push_cdp_request(network);
1375            }
1376        }
1377    }
1378
1379    /// Request interception doesn't happen for data URLs with Network Service.
1380    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1381        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1382            if let Some((interception_id, _)) = self
1383                .request_id_to_interception_id
1384                .remove(event.request_id.as_ref())
1385            {
1386                self.on_request(event, Some(interception_id));
1387            } else {
1388                self.requests_will_be_sent
1389                    .insert(event.request_id.clone(), (event.clone(), Instant::now()));
1390            }
1391        } else {
1392            self.on_request(event, None);
1393        }
1394    }
1395
1396    /// The request was served from the cache.
1397    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1398        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1399            request.from_memory_cache = true;
1400        }
1401    }
1402
1403    /// On network response received.
1404    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1405        let mut request_failed = false;
1406
1407        // Track how many bytes we actually deducted from this target.
1408        let mut deducted: u64 = 0;
1409
1410        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1411            let before = *max_bytes;
1412
1413            // encoded_data_length -> saturating cast to u64
1414            let received_bytes: u64 = event.response.encoded_data_length as u64;
1415
1416            // Safe parse of Content-Length
1417            let content_length: Option<u64> = event
1418                .response
1419                .headers
1420                .inner()
1421                .get("content-length")
1422                .and_then(|v| v.as_str())
1423                .and_then(|s| s.trim().parse::<u64>().ok());
1424
1425            // Deduct what we actually received
1426            *max_bytes = max_bytes.saturating_sub(received_bytes);
1427
1428            // If the declared size can't fit, zero out now
1429            if let Some(cl) = content_length {
1430                if cl > *max_bytes {
1431                    *max_bytes = 0;
1432                }
1433            }
1434
1435            request_failed = *max_bytes == 0;
1436
1437            // Compute exact delta deducted on this event
1438            deducted = before.saturating_sub(*max_bytes);
1439        }
1440
1441        // Bubble up the deduction (even if request continues)
1442        if deducted > 0 {
1443            self.queued_events
1444                .push_back(NetworkEvent::BytesConsumed(deducted));
1445        }
1446
1447        // block all network request moving forward.
1448        if request_failed && self.max_bytes_allowed.is_some() {
1449            self.set_block_all(true);
1450        }
1451
1452        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1453            request.set_response(event.response.clone());
1454            self.queued_events.push_back(if request_failed {
1455                NetworkEvent::RequestFailed(request)
1456            } else {
1457                NetworkEvent::RequestFinished(request)
1458            });
1459        }
1460    }
1461
1462    /// On network loading finished.
1463    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1464        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1465            if let Some(interception_id) = request.interception_id.as_ref() {
1466                self.attempted_authentications
1467                    .remove(interception_id.as_ref());
1468            }
1469            self.queued_events
1470                .push_back(NetworkEvent::RequestFinished(request));
1471        }
1472    }
1473
1474    /// On network loading failed.
1475    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1476        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1477            request.failure_text = Some(event.error_text.clone());
1478            if let Some(interception_id) = request.interception_id.as_ref() {
1479                self.attempted_authentications
1480                    .remove(interception_id.as_ref());
1481            }
1482            self.queued_events
1483                .push_back(NetworkEvent::RequestFailed(request));
1484        }
1485    }
1486
1487    /// On request will be sent.
1488    fn on_request(
1489        &mut self,
1490        event: &EventRequestWillBeSent,
1491        interception_id: Option<InterceptionId>,
1492    ) {
1493        let mut redirect_chain = Vec::new();
1494        let mut redirect_location = None;
1495
1496        if let Some(redirect_resp) = &event.redirect_response {
1497            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1498                if is_redirect_status(redirect_resp.status) {
1499                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1500                        if redirect_resp.url != location {
1501                            let fixed_location = location.replace(&redirect_resp.url, "");
1502
1503                            if !fixed_location.is_empty() {
1504                                if let Some(resp) = request.response.as_mut() {
1505                                    resp.headers.0["Location"] =
1506                                        serde_json::Value::String(fixed_location.clone());
1507                                }
1508                            }
1509
1510                            redirect_location = Some(fixed_location);
1511                        }
1512                    }
1513                }
1514
1515                {
1516                    let mut redirect_resp = redirect_resp.clone();
1517
1518                    if let Some(redirect_location) = redirect_location {
1519                        if !redirect_location.is_empty() {
1520                            redirect_resp.headers.0["Location"] =
1521                                serde_json::Value::String(redirect_location);
1522                        }
1523                    }
1524
1525                    self.handle_request_redirect(&mut request, redirect_resp);
1526                }
1527
1528                redirect_chain = std::mem::take(&mut request.redirect_chain);
1529                redirect_chain.push(request);
1530            }
1531        }
1532
1533        let request = HttpRequest::new(
1534            event.request_id.clone(),
1535            event.frame_id.clone(),
1536            interception_id,
1537            self.user_request_interception_enabled,
1538            redirect_chain,
1539        );
1540
1541        let rid = event.request_id.clone();
1542        self.queued_events
1543            .push_back(NetworkEvent::Request(rid.clone()));
1544        self.requests.insert(rid, request);
1545    }
1546
1547    /// Handle request redirect.
1548    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1549        request.set_response(response);
1550        if let Some(interception_id) = request.interception_id.as_ref() {
1551            self.attempted_authentications
1552                .remove(interception_id.as_ref());
1553        }
1554    }
1555}
1556
1557#[derive(Debug)]
1558pub enum NetworkEvent {
1559    /// Send a CDP request.
1560    SendCdpRequest((MethodId, serde_json::Value)),
1561    /// Request.
1562    Request(RequestId),
1563    /// Response
1564    Response(RequestId),
1565    /// Request failed.
1566    RequestFailed(HttpRequest),
1567    /// Request finished.
1568    RequestFinished(HttpRequest),
1569    /// Bytes consumed.
1570    BytesConsumed(u64),
1571}
1572
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    #[cfg(feature = "adblock")]
    use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
    use std::time::Duration;

    /// Builds a `NetworkManager` with the constructor arguments shared by every test.
    fn new_manager() -> NetworkManager {
        NetworkManager::new(false, Duration::from_secs(30))
    }

    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list)
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = new_manager();
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // A random script that should not match any of the block tries.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = new_manager();
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.com/".to_string());

        // Same pattern in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        // In strict mode, it should still be considered blocked at decision time.
        // (We can only directly assert the matchers here; the decision logic is exercised
        // by `test_e2e_blacklist_strict_overrides_whitelist` when `adblock` is enabled.)
        assert!(nm.blacklist_strict);
    }

    /// Helper: build a `Fetch.requestPaused` event for a `GET` of `url` with the given
    /// resource type and same-site flag; every optional field is left unset.
    #[cfg(feature = "adblock")]
    fn make_request_paused(
        url: &str,
        resource_type: ResourceType,
        is_same_site: bool,
    ) -> chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused {
        use chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused;
        use chromiumoxide_cdp::cdp::browser_protocol::network::{
            Headers, Request, RequestId, RequestReferrerPolicy, ResourcePriority,
        };
        use chromiumoxide_cdp::cdp::browser_protocol::page::FrameId;

        EventRequestPaused {
            request_id: RequestId::from("test-req".to_string()).into(),
            request: Request {
                url: url.to_string(),
                method: "GET".to_string(),
                headers: Headers::new(serde_json::Value::Object(Default::default())),
                initial_priority: ResourcePriority::Medium,
                referrer_policy: RequestReferrerPolicy::NoReferrer,
                url_fragment: None,
                has_post_data: None,
                post_data_entries: None,
                mixed_content_type: None,
                is_link_preload: None,
                trust_token_params: None,
                is_same_site: Some(is_same_site),
                is_ad_related: None,
            },
            frame_id: FrameId::from("frame1".to_string()),
            resource_type,
            response_error_reason: None,
            response_status_code: None,
            response_status_text: None,
            response_headers: None,
            network_id: None,
            redirected_request_id: None,
        }
    }

    /// Helper: build a custom adblock engine that contains only `rule`.
    #[cfg(feature = "adblock")]
    fn adblock_engine_with_rule(rule: &str) -> std::sync::Arc<adblock::Engine> {
        let mut filter_set = adblock::lists::FilterSet::new(false);
        let mut opts = adblock::lists::ParseOptions::default();
        opts.rule_types = adblock::lists::RuleTypes::All;
        filter_set.add_filters([rule], opts);
        std::sync::Arc::new(adblock::Engine::from_filter_set(filter_set, true))
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_detect_ad_blocks_known_tracker_scripts() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.wine-searcher.com/".to_string());

        let event = make_request_paused(
            "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
            ResourceType::Script,
            false,
        );

        assert!(
            nm.detect_ad(&event),
            "googletagmanager.com script should be detected as ad"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_detect_ad_allows_legitimate_scripts() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());

        let event = make_request_paused(
            "https://www.mylegitsite-test.com/static/js/app-bundle.js",
            ResourceType::Script,
            true,
        );

        assert!(
            !nm.detect_ad(&event),
            "legitimate first-party app bundle should not be blocked"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_detect_ad_uses_source_domain() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.wine-searcher.com/some-page".to_string());

        assert!(
            !nm.document_target_domain.is_empty(),
            "document_target_domain should be set after set_page_url"
        );

        let event = make_request_paused(
            "https://www.google-analytics.com/analytics.js",
            ResourceType::Script,
            false,
        );

        assert!(
            nm.detect_ad(&event),
            "google-analytics.com should be blocked as tracker"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_custom_adblock_engine_takes_precedence() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.com/".to_string());

        // Install a custom engine with a specific rule.
        nm.set_adblock_engine(adblock_engine_with_rule("||custom-tracker.example.net^"));

        let event = make_request_paused(
            "https://custom-tracker.example.net/pixel.js",
            ResourceType::Script,
            false,
        );

        assert!(
            nm.detect_ad(&event),
            "custom engine rule should block custom-tracker.example.net"
        );
    }

    /// Helper: run a URL through the full `on_fetch_request_paused` pipeline
    /// and return whether it was blocked (true) or allowed (false).
    #[cfg(feature = "adblock")]
    fn run_full_interception(
        nm: &mut NetworkManager,
        url: &str,
        resource_type: ResourceType,
        is_same_site: bool,
    ) -> bool {
        use super::NetworkEvent;

        // Drain any prior events.
        while nm.poll().is_some() {}

        let event = make_request_paused(url, resource_type, is_same_site);
        nm.on_fetch_request_paused(&event);

        // Inspect everything that was emitted: a `Fetch.fulfillRequest` or
        // `Fetch.failRequest` command means the request was blocked; anything else
        // counts as allowed. The queue is drained completely so the manager can be reused.
        let mut blocked = false;
        while let Some(ev) = nm.poll() {
            if let NetworkEvent::SendCdpRequest((method, _)) = &ev {
                let m: &str = method.as_ref();
                if m == "Fetch.fulfillRequest" || m == "Fetch.failRequest" {
                    blocked = true;
                }
            }
        }
        blocked
    }

    // ── End-to-end interception tests ───────────────────────────────────

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_tracker_script_blocked() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.wine-searcher.com/".to_string());

        assert!(
            run_full_interception(
                &mut nm,
                "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
                ResourceType::Script,
                false,
            ),
            "GTM script should be blocked through full pipeline"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_legitimate_script_allowed() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());

        assert!(
            !run_full_interception(
                &mut nm,
                "https://www.mylegitsite-test.com/static/js/app-bundle.js",
                ResourceType::Script,
                true,
            ),
            "legitimate first-party script should be allowed through full pipeline"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_analytics_xhr_blocked() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.org/".to_string());

        assert!(
            run_full_interception(
                &mut nm,
                "https://www.google-analytics.com/g/collect?v=2&tid=UA-123",
                ResourceType::Xhr,
                false,
            ),
            "Google Analytics XHR should be blocked through full pipeline"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_whitelisted_overrides_adblock() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.org/".to_string());
        nm.set_whitelist_patterns(["googletagmanager.com"]);

        // GTM would normally be blocked by adblock, but whitelist overrides.
        assert!(
            !run_full_interception(
                &mut nm,
                "https://www.googletagmanager.com/gtm.js?id=GTM-TEST",
                ResourceType::Script,
                false,
            ),
            "whitelisted tracker should be allowed even when adblock would block it"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_blacklist_strict_overrides_whitelist() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.org/".to_string());
        nm.set_blacklist_patterns(["cdn.example.net/evil.js"]);
        nm.set_whitelist_patterns(["cdn.example.net/evil.js"]);
        nm.set_blacklist_strict(true);

        assert!(
            run_full_interception(
                &mut nm,
                "https://cdn.example.net/evil.js",
                ResourceType::Script,
                false,
            ),
            "strict blacklist should win over whitelist"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_first_party_document_not_blocked() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.nytimes.com/".to_string());

        assert!(
            !run_full_interception(
                &mut nm,
                "https://www.nytimes.com/2024/article.html",
                ResourceType::Document,
                true,
            ),
            "first-party document navigation should never be blocked"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_custom_engine_blocks_through_pipeline() {
        let mut nm = new_manager();
        nm.set_page_url("https://mysite.com/".to_string());
        nm.set_adblock_engine(adblock_engine_with_rule("||evil-cdn.example.net^$script"));

        assert!(
            run_full_interception(
                &mut nm,
                "https://evil-cdn.example.net/tracker.js",
                ResourceType::Script,
                false,
            ),
            "custom engine rule should block through full pipeline"
        );

        // Legitimate script on the same site should still pass.
        assert!(
            !run_full_interception(
                &mut nm,
                "https://mysite.com/app.js",
                ResourceType::Script,
                true,
            ),
            "first-party script should still be allowed with custom engine"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_ad_image_blocked() {
        let mut nm = new_manager();
        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());

        // Ad tracking pixel should be blocked via adblock pattern or trie.
        assert!(
            run_full_interception(
                &mut nm,
                "https://googleads.g.doubleclick.net/pagead/viewthroughconversion/123/?random=456",
                ResourceType::Image,
                false,
            ),
            "doubleclick ad image/tracking pixel should be blocked"
        );

        // Legitimate first-party image should pass.
        assert!(
            !run_full_interception(
                &mut nm,
                "https://www.mylegitsite-test.com/images/logo.png",
                ResourceType::Image,
                true,
            ),
            "legitimate first-party image should not be blocked"
        );
    }

    #[cfg(feature = "adblock")]
    #[test]
    fn test_e2e_hostname_with_userinfo() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.org/".to_string());

        // URL with userinfo should still correctly identify googletagmanager.com.
        assert!(
            run_full_interception(
                &mut nm,
                "https://user:pass@www.googletagmanager.com/gtm.js?id=GTM-XXXX",
                ResourceType::Script,
                false,
            ),
            "tracker URL with userinfo should still be blocked"
        );
    }

    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = new_manager();
        nm.set_page_url("https://example.com/".to_string());

        // Same pattern in both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}