Skip to main content

chromiumoxide/handler/
network.rs

1#[cfg(any(feature = "adblock", feature = "firewall"))]
2use super::blockers::block_websites::block_ads;
3use super::blockers::{
4    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
5    xhr::IGNORE_XHR_ASSETS,
6};
7use crate::auth::Credentials;
8#[cfg(feature = "_cache")]
9use crate::cache::BasicCachePolicy;
10use crate::cmd::CommandChain;
11use crate::handler::http::HttpRequest;
12use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
13use aho_corasick::AhoCorasick;
14use case_insensitive_string::CaseInsensitiveString;
15use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
16use chromiumoxide_cdp::cdp::browser_protocol::network::{
17    EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
18    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
19    InterceptionId, NetworkConditions, RequestId, ResourceType, Response, SetCacheDisabledParams,
20    SetExtraHttpHeadersParams,
21};
22use chromiumoxide_cdp::cdp::browser_protocol::{
23    fetch::{
24        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
25        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
26    },
27    network::SetBypassServiceWorkerParams,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::{
30    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
31};
32use chromiumoxide_types::{Command, Method, MethodId};
33use hashbrown::{HashMap, HashSet};
34use lazy_static::lazy_static;
35use reqwest::header::PROXY_AUTHORIZATION;
36use spider_network_blocker::intercept_manager::NetworkInterceptManager;
37pub use spider_network_blocker::scripts::{
38    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
39};
40use std::borrow::Cow;
41use std::collections::VecDeque;
42use std::time::{Duration, Instant};
43
44lazy_static! {
    /// Patterns identifying popular libraries and resources: common framework/bundle
    /// names, a few well-known paths, and verified third-party URL prefixes.
    ///
    /// The list is compiled into `ALLOWED_MATCHER`, so every entry is matched as a plain
    /// substring anywhere in the haystack. Short entries such as `"app"` or `"d3"` are
    /// therefore very broad.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",  // Covers WordPress content
        // Verified third parties, listed as URL prefixes
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/",
        "https://google.com/recaptcha/api.js",
        "https://www.gstatic.com/recaptcha/",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://api.leminnow.com/captcha/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://client-api.arkoselabs.com/",
        "https://www.capy.me/puzzle/",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];

    /// Matcher over `JS_FRAMEWORK_ALLOW`, used to decide by name whether a script should
    /// be rendered in the browser.
    ///
    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
    /// but we keep it for compatibility and other call sites.
    ///
    /// Building the matcher panics (`expect`) if the pattern set cannot be compiled.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
97
    /// Patterns for verified third-party scripts (captcha/challenge providers, payment
    /// scripts, CDNs).
    ///
    /// Most entries are full URL prefixes; the last few are bare host or path fragments.
    /// The list is compiled into `ALLOWED_MATCHER_3RD_PARTY`, so each entry matches as a
    /// substring anywhere in the haystack.
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.capy.me/puzzle/",
        "https://www.gstatic.com/recaptcha/",
        "https://google.com/recaptcha/",
        "https://www.google.com/recaptcha/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://newassets.hcaptcha.com/",
        // NOTE(review): exact duplicate of the px-cloud entry earlier in this list.
        // Harmless for `is_match`; removing it would shift later pattern indices.
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://api.leminnow.com/captcha/",
        "https://client-api.arkoselabs.com/",
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        // Bare host / path fragments (no scheme)
        "discourse-cdn.com/",
        "hcaptcha.com",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];

    /// Matcher over `JS_FRAMEWORK_ALLOW_3RD_PARTY`, used to decide whether a third-party
    /// script should be rendered in the browser.
    ///
    /// Building the matcher panics (`expect`) if the pattern set cannot be compiled.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
141
    /// Path fragments that identify JS framework assets.
    ///
    /// NOTE(review): the entries look like framework build-output directories; confirm
    /// against the call sites how they are matched (exact lookup vs. prefix).
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
            "_astro/", "_app/immutable"
        }
    };
149
    /// MIME content types to ignore: documents, archives, images, video, audio and
    /// other binary formats.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        // Documents / archives
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        // Images
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        // Video
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        // Audio
        "audio/mpeg",
        "audio/ogg",
        // Office formats and miscellaneous binaries
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };
178
    /// Resource type names treated as visual content (images, media, fonts) that can be
    /// ignored.
    ///
    /// NOTE(review): the names presumably mirror the CDP `ResourceType` variants — confirm.
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };
185
    /// Resource type names for background networking requests that can be ignored
    /// (CSP violation reports and pings).
    ///
    /// NOTE(review): the names presumably mirror the CDP `ResourceType` variants — confirm.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Ping",
    };
191
    /// The `css` file extension as a case-insensitive string, for case-insensitive matching.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
194
195    /// The command chain.
196    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
197        let enable = EnableParams::default();
198
199        if let Ok(c) = serde_json::to_value(&enable) {
200            vec![(enable.identifier(), c)]
201        } else {
202            vec![]
203        }
204    };
205
206    /// The command chain with https ignore.
207    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
208        let enable = EnableParams::default();
209        let mut v = vec![];
210        if let Ok(c) = serde_json::to_value(&enable) {
211            v.push((enable.identifier(), c));
212        }
213        let ignore = SetIgnoreCertificateErrorsParams::new(true);
214        if let Ok(ignored) = serde_json::to_value(&ignore) {
215            v.push((ignore.identifier(), ignored));
216        }
217
218        v
219    };
220
221    /// Enable the fetch intercept command
222    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
223        fetch::EnableParams::builder()
224        .handle_auth_requests(true)
225        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
226        .build()
227    };
228}
229
/// Returns `true` when `status` is one of the redirect codes 301, 302, 303, 307 or 308.
///
/// Other 3xx codes (e.g. 300 and 304) are not treated as redirects.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
234
/// How long, in seconds, a buffered `requests_will_be_sent` /
/// `request_id_to_interception_id` entry may linger before being evicted by
/// `evict_stale_entries`. 30 seconds is generous — the CDP round-trip that
/// reconciles the two racing events normally completes in milliseconds.
const STALE_BUFFER_SECS: u64 = 30;

/// How long, in seconds, an in-flight request entry (`requests` map) can live
/// without being resolved by a `loadingFinished` / `loadingFailed` /
/// `loadingCanceled` event before it is considered orphaned and evicted by
/// `evict_stale_entries`. Longer than the race-condition buffer timeout because
/// real requests can legitimately take tens of seconds (streaming, slow
/// origins, etc.).
const STALE_REQUEST_SECS: u64 = 120;
247
/// Newtype around a shared `adblock::Engine` that provides a `Debug` impl, so it can be
/// stored in types that derive `Debug`.
#[cfg(feature = "adblock")]
pub struct AdblockEngine(std::sync::Arc<adblock::Engine>);

#[cfg(feature = "adblock")]
impl std::fmt::Debug for AdblockEngine {
    /// Writes only the type name; nothing about the wrapped engine is printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("AdblockEngine")
    }
}

#[cfg(feature = "adblock")]
impl std::ops::Deref for AdblockEngine {
    type Target = adblock::Engine;

    /// Borrows the wrapped engine.
    fn deref(&self) -> &Self::Target {
        &*self.0
    }
}
266
/// The base network manager.
///
/// Holds the per-page network state: queued outgoing events, in-flight requests, the
/// buffers used to reconcile racing CDP events, interception/auth state, and the
/// blocking configuration.
#[derive(Debug)]
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    /// Entries older than `STALE_BUFFER_SECS` are evicted to prevent unbounded growth.
    requests_will_be_sent: HashMap<RequestId, (EventRequestWillBeSent, Instant)>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    /// Entries older than `STALE_BUFFER_SECS` are evicted to prevent unbounded growth.
    request_id_to_interception_id: HashMap<RequestId, (InterceptionId, Instant)>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; interception is actually enabled/disabled by
    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is currently enabled in CDP.
    ///
    /// This is the "actual state" flag that reflects whether we have sent `Fetch.enable` or
    /// `Fetch.disable` to the browser. It is updated by `update_protocol_request_interception()`
    /// when `user_request_interception_enabled` or `credentials` change.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Whether the network is offline.
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    /// Ignore visuals (no pings, prefetching, etc.).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics from rendering.
    pub block_analytics: bool,
    /// Block pre-fetch requests.
    pub block_prefetch: bool,
    /// Load only the HTML.
    ///
    /// NOTE(review): presumably blocks every non-document resource — confirm at the use sites.
    pub only_html: bool,
    /// Whether the document is an XML document.
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Tracks the number of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    /// The cache site_key to use.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
    whitelist_matcher: Option<AhoCorasick>,
    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
    blacklist_patterns: Vec<String>,
    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
    blacklist_matcher: Option<AhoCorasick>,
    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
    blacklist_strict: bool,
    /// Custom adblock engine built from user-supplied filter rules.
    /// When `Some`, takes precedence over the global default engine.
    #[cfg(feature = "adblock")]
    adblock_engine: Option<AdblockEngine>,
}
396
397impl NetworkManager {
398    /// A new network manager.
399    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
400        Self {
401            queued_events: Default::default(),
402            ignore_httpserrors,
403            requests: Default::default(),
404            requests_will_be_sent: Default::default(),
405            extra_headers: Default::default(),
406            request_id_to_interception_id: Default::default(),
407            user_cache_disabled: false,
408            attempted_authentications: Default::default(),
409            credentials: None,
410            block_all: false,
411            user_request_interception_enabled: false,
412            protocol_request_interception_enabled: false,
413            offline: false,
414            request_timeout,
415            ignore_visuals: false,
416            block_javascript: false,
417            block_stylesheets: false,
418            block_prefetch: true,
419            block_analytics: true,
420            only_html: false,
421            xml_document: false,
422            intercept_manager: NetworkInterceptManager::Unknown,
423            document_reload_tracker: 0,
424            document_target_url: String::new(),
425            document_target_domain: String::new(),
426            whitelist_patterns: Vec::new(),
427            whitelist_matcher: None,
428            blacklist_patterns: Vec::new(),
429            blacklist_matcher: None,
430            blacklist_strict: true,
431            max_bytes_allowed: None,
432            #[cfg(feature = "_cache")]
433            cache_site_key: None,
434            #[cfg(feature = "_cache")]
435            cache_policy: None,
436            #[cfg(feature = "adblock")]
437            adblock_engine: None,
438        }
439    }
440
441    /// Set a custom adblock engine built from user-supplied filter rules.
442    #[cfg(feature = "adblock")]
443    pub fn set_adblock_engine(&mut self, engine: std::sync::Arc<adblock::Engine>) {
444        self.adblock_engine = Some(AdblockEngine(engine));
445    }
446
447    /// Replace the whitelist patterns (compiled once).
448    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
449    where
450        I: IntoIterator<Item = S>,
451        S: Into<String>,
452    {
453        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
454        self.rebuild_whitelist_matcher();
455    }
456
457    /// Replace the blacklist patterns (compiled once).
458    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
459    where
460        I: IntoIterator<Item = S>,
461        S: Into<String>,
462    {
463        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
464        self.rebuild_blacklist_matcher();
465    }
466
467    /// Add one pattern (cheap) and rebuild (call this sparingly).
468    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
469        self.blacklist_patterns.push(pattern.into());
470        self.rebuild_blacklist_matcher();
471    }
472
473    /// Add many patterns and rebuild once.
474    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
475    where
476        I: IntoIterator<Item = S>,
477        S: Into<String>,
478    {
479        self.blacklist_patterns
480            .extend(patterns.into_iter().map(Into::into));
481        self.rebuild_blacklist_matcher();
482    }
483
484    /// Clear blacklist entirely.
485    pub fn clear_blacklist(&mut self) {
486        self.blacklist_patterns.clear();
487        self.blacklist_matcher = None;
488    }
489
490    /// Control precedence: when true, blacklist always wins.
491    pub fn set_blacklist_strict(&mut self, strict: bool) {
492        self.blacklist_strict = strict;
493    }
494
495    #[inline]
496    fn rebuild_blacklist_matcher(&mut self) {
497        if self.blacklist_patterns.is_empty() {
498            self.blacklist_matcher = None;
499            return;
500        }
501
502        self.blacklist_matcher =
503            AhoCorasick::new(self.blacklist_patterns.iter().map(|s| s.as_str())).ok();
504    }
505
506    #[inline]
507    fn is_blacklisted(&self, url: &str) -> bool {
508        self.blacklist_matcher
509            .as_ref()
510            .map(|m| m.is_match(url))
511            .unwrap_or(false)
512    }
513
514    /// Add one pattern (cheap) and rebuild (call this sparingly).
515    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
516        self.whitelist_patterns.push(pattern.into());
517        self.rebuild_whitelist_matcher();
518    }
519
520    /// Add many patterns and rebuild once.
521    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
522    where
523        I: IntoIterator<Item = S>,
524        S: Into<String>,
525    {
526        self.whitelist_patterns
527            .extend(patterns.into_iter().map(Into::into));
528        self.rebuild_whitelist_matcher();
529    }
530
531    #[inline]
532    fn rebuild_whitelist_matcher(&mut self) {
533        if self.whitelist_patterns.is_empty() {
534            self.whitelist_matcher = None;
535            return;
536        }
537
538        // If building fails (shouldn’t for simple patterns), just disable matcher.
539        self.whitelist_matcher =
540            AhoCorasick::new(self.whitelist_patterns.iter().map(|s| s.as_str())).ok();
541    }
542
543    #[inline]
544    fn is_whitelisted(&self, url: &str) -> bool {
545        self.whitelist_matcher
546            .as_ref()
547            .map(|m| m.is_match(url))
548            .unwrap_or(false)
549    }
550
551    /// Commands to init the chain with.
552    pub fn init_commands(&self) -> CommandChain {
553        let cmds = if self.ignore_httpserrors {
554            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
555        } else {
556            INIT_CHAIN.clone()
557        };
558        CommandChain::new(cmds, self.request_timeout)
559    }
560
561    /// Push the CDP request.
562    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
563        let method = cmd.identifier();
564        if let Ok(params) = serde_json::to_value(cmd) {
565            self.queued_events
566                .push_back(NetworkEvent::SendCdpRequest((method, params)));
567        }
568    }
569
570    /// The next event to handle.
571    pub fn poll(&mut self) -> Option<NetworkEvent> {
572        self.queued_events.pop_front()
573    }
574
575    /// Evict stale entries from the race-condition buffers and from
576    /// `attempted_authentications`. Call this periodically (e.g. from the
577    /// handler's eviction tick) so that lost CDP events cannot cause unbounded
578    /// map growth.
579    pub fn evict_stale_entries(&mut self, now: Instant) {
580        let cutoff = now - Duration::from_secs(STALE_BUFFER_SECS);
581
582        self.requests_will_be_sent.retain(|_, (_, ts)| *ts > cutoff);
583        self.request_id_to_interception_id
584            .retain(|_, (_, ts)| *ts > cutoff);
585
586        // Evict orphaned in-flight requests whose completion events
587        // (`loadingFinished` / `loadingFailed` / `loadingCanceled`) were
588        // never received.  Uses a longer timeout than the race-condition
589        // buffers since real requests can legitimately be long-lived.
590        let request_cutoff = now - Duration::from_secs(STALE_REQUEST_SECS);
591        self.requests
592            .retain(|_, req| req.created_at > request_cutoff);
593
594        // `attempted_authentications` entries reference interception IDs that
595        // are cleaned up on loading-finished / loading-failed. If those events
596        // are lost, the set grows forever. Cross-reference with `requests`:
597        // any interception ID that no longer appears in a live request is stale.
598        if !self.attempted_authentications.is_empty() {
599            let live: HashSet<&str> = self
600                .requests
601                .values()
602                .filter_map(|r| r.interception_id.as_ref().map(|id| id.as_ref()))
603                .collect();
604            self.attempted_authentications
605                .retain(|id| live.contains(id.as_ref()));
606        }
607    }
608
609    /// Get the extra headers.
610    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
611        &self.extra_headers
612    }
613
614    /// Set extra HTTP headers.
615    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
616        self.extra_headers = headers;
617        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
618        self.extra_headers.remove("Proxy-Authorization");
619        if !self.extra_headers.is_empty() {
620            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
621                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
622            }
623        }
624    }
625
626    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
627        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
628    }
629
630    pub fn set_block_all(&mut self, block_all: bool) {
631        self.block_all = block_all;
632    }
633
634    pub fn set_request_interception(&mut self, enabled: bool) {
635        self.user_request_interception_enabled = enabled;
636        self.update_protocol_request_interception();
637    }
638
639    pub fn set_cache_enabled(&mut self, enabled: bool) {
640        let run = self.user_cache_disabled == enabled;
641        self.user_cache_disabled = !enabled;
642        if run {
643            self.update_protocol_cache_disabled();
644        }
645    }
646
647    /// Enable fetch interception.
648    pub fn enable_request_intercept(&mut self) {
649        self.protocol_request_interception_enabled = true;
650    }
651
652    /// Disable fetch interception.
653    pub fn disable_request_intercept(&mut self) {
654        self.protocol_request_interception_enabled = false;
655    }
656
657    /// Set the cache site key.
658    #[cfg(feature = "_cache")]
659    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
660        self.cache_site_key = cache_site_key;
661    }
662
663    /// Set the cache policy.
664    #[cfg(feature = "_cache")]
665    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
666        self.cache_policy = cache_policy;
667    }
668
669    pub fn update_protocol_cache_disabled(&mut self) {
670        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
671    }
672
673    pub fn authenticate(&mut self, credentials: Credentials) {
674        self.credentials = Some(credentials);
675        self.update_protocol_request_interception();
676        self.protocol_request_interception_enabled = true;
677    }
678
679    fn update_protocol_request_interception(&mut self) {
680        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
681
682        if enabled == self.protocol_request_interception_enabled {
683            return;
684        }
685
686        if enabled {
687            self.push_cdp_request(ENABLE_FETCH.clone())
688        } else {
689            self.push_cdp_request(DisableParams::default())
690        }
691    }
692
693    /// Blocklist-only script blocking.
694    /// Returns true only when the URL matches an explicit blocklist condition.
695    #[inline]
696    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
697        // If analytics blocking is off, skip all analytics tries.
698        let block_analytics = self.block_analytics;
699
700        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
701        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
702        {
703            return true;
704        }
705
706        // 2) Custom website block list (explicit).
707        if crate::handler::blockers::block_websites::block_website(url) {
708            return true;
709        }
710
711        // 3) Path-based explicit tries / fallbacks.
712        //
713        // We run these on:
714        // - path with leading slash ("/js/app.js")
715        // - path without leading slash ("js/app.js")
716        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
717        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
718            // Remove query/fragment so matching stays stable.
719            let p_slash = Self::strip_query_fragment(path_with_slash);
720            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
721
722            // Basename for filename-only lists.
723            let base = match p_slash.rsplit('/').next() {
724                Some(b) => b,
725                None => p_slash,
726            };
727
728            // ---- Trie checks ----
729            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
730            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
731                return true;
732            }
733            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
734                return true;
735            }
736            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
737                return true;
738            }
739
740            // Base-path ignore tries (framework noise / known ignorable script paths).
741            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
742            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
743                return true;
744            }
745
746            // Style path ignores only when visuals are ignored.
747            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
748                return true;
749            }
750        }
751
752        false
753    }
754
755    /// Extract the absolute URL path portion WITH the leading slash.
756    ///
757    /// Example:
758    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
759    #[inline]
760    fn url_path_with_leading_slash(url: &str) -> Option<&str> {
761        // find scheme separator
762        let bytes = url.as_bytes();
763        let idx = memchr::memmem::find(bytes, b"//")?;
764        let after_slashes = idx + 2;
765
766        // find first slash after host
767        let slash_rel = memchr::memchr(b'/', &bytes[after_slashes..])?;
768        let slash_idx = after_slashes + slash_rel;
769
770        if slash_idx < url.len() {
771            Some(&url[slash_idx..])
772        } else {
773            None
774        }
775    }
776
777    /// Strip query string and fragment from a path-ish string.
778    ///
779    /// Example:
780    /// - "/a/b.js?x=1#y" -> "/a/b.js"
781    #[inline]
782    fn strip_query_fragment(s: &str) -> &str {
783        match memchr::memchr2(b'?', b'#', s.as_bytes()) {
784            Some(i) => &s[..i],
785            None => s,
786        }
787    }
788
789    /// Determine if the request should be skipped.
790    #[inline]
791    fn skip_xhr(
792        &self,
793        skip_networking: bool,
794        event: &EventRequestPaused,
795        network_event: bool,
796    ) -> bool {
797        // XHR check
798        if !skip_networking && network_event {
799            let request_url = event.request.url.as_str();
800
801            // check if part of ignore scripts.
802            let skip_analytics =
803                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
804
805            if skip_analytics {
806                true
807            } else if self.block_stylesheets || self.ignore_visuals {
808                let block_css = self.block_stylesheets;
809                let block_media = self.ignore_visuals;
810
811                let mut block_request = false;
812
813                if let Some(position) = memchr::memrchr(b'.', request_url.as_bytes()) {
814                    let hlen = request_url.len();
815                    let has_asset = hlen - position;
816
817                    if has_asset >= 3 {
818                        let next_position = position + 1;
819
820                        if block_media
821                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
822                                &request_url[next_position..].into(),
823                            )
824                        {
825                            block_request = true;
826                        } else if block_css {
827                            block_request = CaseInsensitiveString::from(
828                                &request_url.as_bytes()[next_position..],
829                            )
830                            .contains(&**CSS_EXTENSION)
831                        }
832                    }
833                }
834
835                if !block_request {
836                    block_request = ignore_script_xhr_media(request_url);
837                }
838
839                block_request
840            } else {
841                skip_networking
842            }
843        } else {
844            skip_networking
845        }
846    }
847
848    #[cfg(feature = "adblock")]
849    #[inline]
850    /// Detect if ad enabled.
851    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
852        if skip_networking {
853            true
854        } else {
855            block_ads(&event.request.url) || self.detect_ad(event)
856        }
857    }
858
859    /// When adblock feature is disabled, this is a no-op.
860    #[cfg(not(feature = "adblock"))]
861    #[inline]
862    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
863        use crate::handler::blockers::block_websites::block_ads;
864        if skip_networking {
865            true
866        } else {
867            block_ads(&event.request.url)
868        }
869    }
870
871    #[inline]
872    /// Fail request
873    fn fail_request_blocked(
874        &mut self,
875        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
876    ) {
877        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
878            request_id.clone(),
879            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
880        );
881        self.push_cdp_request(params);
882    }
883
884    #[inline]
885    /// Fulfill request
886    fn fulfill_request_empty_200(
887        &mut self,
888        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
889    ) {
890        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
891            request_id.clone(),
892            200,
893        );
894        self.push_cdp_request(params);
895    }
896
897    #[cfg(feature = "_cache")]
898    #[inline]
899    /// Fulfill a paused Fetch request from cached bytes + header map.
900    ///
901    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
902    fn fulfill_request_from_cache(
903        &mut self,
904        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
905        body: &[u8],
906        headers: &std::collections::HashMap<String, String>,
907        status: i64,
908    ) {
909        use crate::cdp::browser_protocol::fetch::HeaderEntry;
910        use crate::handler::network::fetch::FulfillRequestParams;
911        use base64::Engine;
912
913        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
914
915        for (k, v) in headers.iter() {
916            resp_headers.push(HeaderEntry {
917                name: k.clone().into(),
918                value: v.clone().into(),
919            });
920        }
921
922        let mut params = FulfillRequestParams::new(request_id.clone(), status);
923
924        // TODO: have this already encoded prior.
925        params.body = Some(
926            base64::engine::general_purpose::STANDARD
927                .encode(body)
928                .into(),
929        );
930
931        params.response_headers = Some(resp_headers);
932
933        self.push_cdp_request(params);
934    }
935
936    #[inline]
937    /// Continue the request url.
938    fn continue_request_with_url(
939        &mut self,
940        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
941        url: Option<&str>,
942        intercept_response: bool,
943    ) {
944        let mut params = ContinueRequestParams::new(request_id.clone());
945        if let Some(url) = url {
946            params.url = Some(url.to_string());
947            params.intercept_response = Some(intercept_response);
948        }
949        self.push_cdp_request(params);
950    }
951
    /// On fetch request paused interception.
    ///
    /// Resolves the paused request in one of four ways:
    /// - fails it with `BlockedByClient` when `block_all` is set,
    /// - fulfills it with an empty `200` when any block rule matches,
    /// - fulfills it from the remote session cache (`_cache` feature) on a cache hit
    ///   the policy allows,
    /// - otherwise continues it (with a replacement URL if one was computed).
    ///
    /// The block checks are cumulative and order-dependent: each stage only runs
    /// while `skip_networking` is still `false`, and the allow-lists near the end can
    /// flip the decision back before the strict blacklist gets the final word.
    #[inline]
    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
        // When both user-level and protocol-level interception are enabled this
        // handler does not resolve the request at all.
        // NOTE(review): presumably the user-facing interception path resolves it — confirm.
        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
            return;
        }

        if self.block_all {
            tracing::debug!(
                "Blocked (block_all): {:?} - {}",
                event.resource_type,
                event.request.url
            );
            return self.fail_request_blocked(&event.request_id);
        }

        // Pair this Fetch pause with its `requestWillBeSent` event: if that event
        // was already stored, register the request now; otherwise remember the
        // interception id so `on_request_will_be_sent` can pick it up later.
        if let Some(network_id) = event.network_id.as_ref() {
            if let Some((request_will_be_sent, _)) =
                self.requests_will_be_sent.remove(network_id.as_ref())
            {
                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
            } else {
                self.request_id_to_interception_id.insert(
                    network_id.clone(),
                    (event.request_id.clone().into(), Instant::now()),
                );
            }
        }

        // From here on, we handle the full decision tree.
        let javascript_resource = event.resource_type == ResourceType::Script;
        let document_resource = event.resource_type == ResourceType::Document;
        let network_resource =
            !document_resource && crate::utils::is_data_resource(&event.resource_type);

        // Start with static / cheap skip checks.
        // (`block_all` already returned above, so only the resource map matters here.)
        let mut skip_networking =
            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());

        // NOTE(review): Prefetch requests are skipped when `block_prefetch` is *false*;
        // confirm the flag's polarity is intended.
        if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
            skip_networking = true;
        }

        // Also short-circuit if we've reloaded this document too many times.
        if !skip_networking {
            skip_networking = self.document_reload_tracker >= 3;
        }

        // Handle document redirect / masking and track xml documents.
        let (current_url_cow, had_replacer) =
            self.handle_document_replacement_and_tracking(event, document_resource);

        let current_url: &str = current_url_cow.as_ref();

        let blacklisted = self.is_blacklisted(current_url);

        // Non-strict blacklist: block now, but the allow-lists below may still override it.
        if !self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if !skip_networking {
            // Allow XSL for sitemap XML.
            if self.xml_document && current_url.ends_with(".xsl") {
                skip_networking = false;
            } else {
                skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
            }
        }

        skip_networking = self.detect_ad_if_enabled(event, skip_networking);

        // Ignore embedded scripts, tracker stylesheets, and tracker images when only_html or ignore_visuals is set.
        if !skip_networking
            && self.block_javascript
            && (self.only_html || self.ignore_visuals)
            && (javascript_resource
                || document_resource
                || event.resource_type == ResourceType::Stylesheet
                || event.resource_type == ResourceType::Image)
        {
            skip_networking = ignore_script_embedded(current_url);
        }

        // Script policy: allow-by-default.
        // Block only if explicit block list patterns match.
        if !skip_networking && javascript_resource {
            skip_networking = self.should_block_script_blocklist_only(current_url);
        }

        // XHR / data resources.
        skip_networking = self.skip_xhr(skip_networking, event, network_resource);

        // Custom interception layer.
        if !skip_networking && (javascript_resource || network_resource || document_resource) {
            skip_networking = self.intercept_manager.intercept_detection(
                current_url,
                self.ignore_visuals,
                network_resource,
            );
        }

        // Custom website block list.
        if !skip_networking && (javascript_resource || network_resource) {
            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
        }

        // whitelist 3rd party
        // not required unless explicit blocking.
        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
        {
            skip_networking = false;
        }

        // check if the url is in the whitelist.
        if skip_networking && self.is_whitelisted(current_url) {
            skip_networking = false;
        }

        // Strict blacklist is applied last so no allow-list can override it.
        if self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if skip_networking {
            // Blocked requests are answered with an empty 200 rather than failed.
            tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
            self.fulfill_request_empty_200(&event.request_id);
        } else {
            #[cfg(feature = "_cache")]
            {
                if let (Some(policy), Some(cache_site_key)) =
                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
                {
                    // Cache lookup key is "<METHOD>:<url>" (shadows the outer `current_url`).
                    let current_url = format!("{}:{}", event.request.method, &current_url);

                    if let Some((res, cache_policy)) =
                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
                    {
                        if policy.allows_cached(&cache_policy) {
                            tracing::debug!(
                                "Remote Cached: {:?} - {}",
                                &event.resource_type,
                                &current_url
                            );
                            let flat_headers = crate::http::headers_from_multi(&res.headers);
                            return self.fulfill_request_from_cache(
                                &event.request_id,
                                &res.body,
                                &flat_headers,
                                res.status as i64,
                            );
                        }
                    }
                }
            }

            // Not blocked and not served from cache: let the request continue,
            // passing the replacement URL only when one was computed.
            tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
            self.continue_request_with_url(
                &event.request_id,
                if had_replacer {
                    Some(current_url)
                } else {
                    None
                },
                !had_replacer,
            );
        }
    }
1119
1120    /// Shared "visuals + basic blocking" logic.
1121    ///
1122    /// IMPORTANT: Scripts are NOT blocked here anymore.
1123    /// Scripts are allowed by default and only blocked via explicit blocklists
1124    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1125    #[inline]
1126    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1127        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1128            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1129    }
1130
1131    /// Does the network manager have a target domain?
1132    pub fn has_target_domain(&self) -> bool {
1133        !self.document_target_url.is_empty()
1134    }
1135
1136    /// Set the target page url for tracking.
1137    pub fn set_page_url(&mut self, page_target_url: String) {
1138        let host_base = host_and_rest(&page_target_url)
1139            .map(|(h, _)| base_domain_from_host(h))
1140            .unwrap_or("");
1141
1142        self.document_target_domain = host_base.to_string();
1143        self.document_target_url = page_target_url;
1144    }
1145
1146    /// Clear the initial target domain on every navigation.
1147    pub fn clear_target_domain(&mut self) {
1148        self.document_reload_tracker = 0;
1149        self.document_target_url = Default::default();
1150        self.document_target_domain = Default::default();
1151    }
1152
1153    /// Handles:
1154    /// - document reload tracking (`document_reload_tracker`)
1155    /// - redirect masking / replacement
1156    /// - xml document detection (`xml_document`)
1157    /// - `document_target_url` updates
1158    ///
1159    /// Returns (current_url, had_replacer).
1160    #[inline]
1161    fn handle_document_replacement_and_tracking<'a>(
1162        &mut self,
1163        event: &'a EventRequestPaused,
1164        document_resource: bool,
1165    ) -> (Cow<'a, str>, bool) {
1166        let mut replacer: Option<String> = None;
1167        let current_url = event.request.url.as_str();
1168
1169        if document_resource {
1170            if self.document_target_url == current_url {
1171                self.document_reload_tracker += 1;
1172            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1173            {
1174                let (http_document_replacement, mut https_document_replacement) =
1175                    if self.document_target_url.starts_with("http://") {
1176                        (
1177                            self.document_target_url.replacen("http://", "http//", 1),
1178                            self.document_target_url.replacen("http://", "https://", 1),
1179                        )
1180                    } else {
1181                        (
1182                            self.document_target_url.replacen("https://", "https//", 1),
1183                            self.document_target_url.replacen("https://", "http://", 1),
1184                        )
1185                    };
1186
1187                // Track trailing slash to restore later.
1188                let trailing = https_document_replacement.ends_with('/');
1189                if trailing {
1190                    https_document_replacement.pop();
1191                }
1192                if https_document_replacement.ends_with('/') {
1193                    https_document_replacement.pop();
1194                }
1195
1196                let redirect_mask = format!(
1197                    "{}{}",
1198                    https_document_replacement, http_document_replacement
1199                );
1200
1201                if current_url == redirect_mask {
1202                    replacer = Some(if trailing {
1203                        format!("{}/", https_document_replacement)
1204                    } else {
1205                        https_document_replacement
1206                    });
1207                }
1208            }
1209
1210            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1211                self.xml_document = true;
1212            }
1213
1214            // Track last seen document URL.
1215            self.document_target_url = event.request.url.clone();
1216            self.document_target_domain = host_and_rest(&self.document_target_url)
1217                .map(|(h, _)| base_domain_from_host(h).to_string())
1218                .unwrap_or_default();
1219        }
1220
1221        let current_url_cow = match replacer {
1222            Some(r) => Cow::Owned(r),
1223            None => Cow::Borrowed(event.request.url.as_str()),
1224        };
1225
1226        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1227        (current_url_cow, had_replacer)
1228    }
1229
1230    /// Perform a page intercept for chrome using the adblock engine.
1231    /// Uses the custom engine when user-supplied filter rules are configured,
1232    /// otherwise falls back to the global default engine with built-in patterns.
1233    #[cfg(feature = "adblock")]
1234    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1235        use adblock::{
1236            lists::{FilterSet, ParseOptions, RuleTypes},
1237            Engine,
1238        };
1239
1240        lazy_static::lazy_static! {
1241            static ref AD_ENGINE: Engine = {
1242                let mut filter_set = FilterSet::new(false);
1243                let mut rules = ParseOptions::default();
1244                rules.rule_types = RuleTypes::All;
1245
1246                filter_set.add_filters(
1247                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1248                    rules.clone(),
1249                );
1250
1251                // When adblock_easylist is enabled, EasyList + EasyPrivacy are
1252                // embedded at build time for zero-cost runtime loading.
1253                #[cfg(feature = "adblock_easylist")]
1254                {
1255                    static EASYLIST: &str = include_str!(concat!(env!("OUT_DIR"), "/easylist.txt"));
1256                    static EASYPRIVACY: &str = include_str!(concat!(env!("OUT_DIR"), "/easyprivacy.txt"));
1257
1258                    if !EASYLIST.is_empty() {
1259                        filter_set.add_filter_list(EASYLIST, rules.clone());
1260                    }
1261                    if !EASYPRIVACY.is_empty() {
1262                        filter_set.add_filter_list(EASYPRIVACY, rules);
1263                    }
1264                }
1265
1266                Engine::from_filter_set(filter_set, true)
1267            };
1268        }
1269
1270        let blockable = event.resource_type == ResourceType::Script
1271            || event.resource_type == ResourceType::Image
1272            || event.resource_type == ResourceType::Media
1273            || event.resource_type == ResourceType::Stylesheet
1274            || event.resource_type == ResourceType::Document
1275            || event.resource_type == ResourceType::Fetch
1276            || event.resource_type == ResourceType::Xhr;
1277
1278        if !blockable {
1279            return false;
1280        }
1281
1282        let u = &event.request.url;
1283
1284        let source_domain = if self.document_target_domain.is_empty() {
1285            "example.com"
1286        } else {
1287            &self.document_target_domain
1288        };
1289
1290        // Fast hostname extraction without full URL parsing.
1291        // preparsed(url, request_hostname, source_hostname, type, third_party)
1292        let hostname = u
1293            .strip_prefix("https://")
1294            .or_else(|| u.strip_prefix("http://"))
1295            .and_then(|rest| rest.split('/').next())
1296            // Strip userinfo (user:pass@) if present.
1297            .map(
1298                |authority| match memchr::memrchr(b'@', authority.as_bytes()) {
1299                    Some(i) => &authority[i + 1..],
1300                    None => authority,
1301                },
1302            )
1303            // Strip port (:8080) if present.
1304            .and_then(|host_port| host_port.split(':').next())
1305            .unwrap_or(source_domain);
1306
1307        let resource_type_str = match event.resource_type {
1308            ResourceType::Script => "script",
1309            ResourceType::Image => "image",
1310            ResourceType::Media => "media",
1311            ResourceType::Stylesheet => "stylesheet",
1312            ResourceType::Document => "document",
1313            ResourceType::Fetch => "fetch",
1314            ResourceType::Xhr => "xhr",
1315            _ => "other",
1316        };
1317
1318        let request = adblock::request::Request::preparsed(
1319            u,
1320            hostname,
1321            source_domain,
1322            resource_type_str,
1323            !event.request.is_same_site.unwrap_or_default(),
1324        );
1325
1326        let engine: &Engine = match self.adblock_engine.as_ref() {
1327            Some(custom) => custom,
1328            None => &AD_ENGINE,
1329        };
1330
1331        engine.check_network_request(&request).matched
1332    }
1333
1334    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1335        let response = if self
1336            .attempted_authentications
1337            .contains(event.request_id.as_ref())
1338        {
1339            AuthChallengeResponseResponse::CancelAuth
1340        } else if self.credentials.is_some() {
1341            self.attempted_authentications
1342                .insert(event.request_id.clone().into());
1343            AuthChallengeResponseResponse::ProvideCredentials
1344        } else {
1345            AuthChallengeResponseResponse::Default
1346        };
1347
1348        let mut auth = AuthChallengeResponse::new(response);
1349        if let Some(creds) = self.credentials.clone() {
1350            auth.username = Some(creds.username);
1351            auth.password = Some(creds.password);
1352        }
1353        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1354    }
1355
1356    /// Set the page offline network emulation condition.
1357    pub fn set_offline_mode(&mut self, value: bool) {
1358        if self.offline == value {
1359            return;
1360        }
1361        self.offline = value;
1362        if let Ok(condition) = NetworkConditions::builder()
1363            .url_pattern("")
1364            .latency(0)
1365            .download_throughput(-1.)
1366            .upload_throughput(-1.)
1367            .build()
1368        {
1369            if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1370                .offline(self.offline)
1371                .matched_network_condition(condition)
1372                .build()
1373            {
1374                self.push_cdp_request(network);
1375            }
1376        }
1377    }
1378
1379    /// Request interception doesn't happen for data URLs with Network Service.
1380    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1381        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1382            if let Some((interception_id, _)) = self
1383                .request_id_to_interception_id
1384                .remove(event.request_id.as_ref())
1385            {
1386                self.on_request(event, Some(interception_id));
1387            } else {
1388                self.requests_will_be_sent
1389                    .insert(event.request_id.clone(), (event.clone(), Instant::now()));
1390            }
1391        } else {
1392            self.on_request(event, None);
1393        }
1394    }
1395
1396    /// The request was served from the cache.
1397    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1398        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1399            request.from_memory_cache = true;
1400        }
1401    }
1402
1403    /// On network response received.
1404    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1405        let mut request_failed = false;
1406
1407        // Track how many bytes we actually deducted from this target.
1408        let mut deducted: u64 = 0;
1409
1410        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1411            let before = *max_bytes;
1412
1413            // encoded_data_length -> saturating cast to u64
1414            let received_bytes: u64 = event.response.encoded_data_length as u64;
1415
1416            // Safe parse of Content-Length
1417            let content_length: Option<u64> = event
1418                .response
1419                .headers
1420                .inner()
1421                .get("content-length")
1422                .and_then(|v| v.as_str())
1423                .and_then(|s| s.trim().parse::<u64>().ok());
1424
1425            // Deduct what we actually received
1426            *max_bytes = max_bytes.saturating_sub(received_bytes);
1427
1428            // If the declared size can't fit, zero out now
1429            if let Some(cl) = content_length {
1430                if cl > *max_bytes {
1431                    *max_bytes = 0;
1432                }
1433            }
1434
1435            request_failed = *max_bytes == 0;
1436
1437            // Compute exact delta deducted on this event
1438            deducted = before.saturating_sub(*max_bytes);
1439        }
1440
1441        // Bubble up the deduction (even if request continues)
1442        if deducted > 0 {
1443            self.queued_events
1444                .push_back(NetworkEvent::BytesConsumed(deducted));
1445        }
1446
1447        // block all network request moving forward.
1448        if request_failed && self.max_bytes_allowed.is_some() {
1449            self.set_block_all(true);
1450        }
1451
1452        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1453            request.set_response(event.response.clone());
1454            self.queued_events.push_back(if request_failed {
1455                NetworkEvent::RequestFailed(request)
1456            } else {
1457                NetworkEvent::RequestFinished(request)
1458            });
1459        }
1460    }
1461
1462    /// On network loading finished.
1463    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1464        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1465            if let Some(interception_id) = request.interception_id.as_ref() {
1466                self.attempted_authentications
1467                    .remove(interception_id.as_ref());
1468            }
1469            self.queued_events
1470                .push_back(NetworkEvent::RequestFinished(request));
1471        }
1472    }
1473
1474    /// On network loading failed.
1475    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1476        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1477            request.failure_text = Some(event.error_text.clone());
1478            if let Some(interception_id) = request.interception_id.as_ref() {
1479                self.attempted_authentications
1480                    .remove(interception_id.as_ref());
1481            }
1482            self.queued_events
1483                .push_back(NetworkEvent::RequestFailed(request));
1484        }
1485    }
1486
1487    /// On request will be sent.
1488    fn on_request(
1489        &mut self,
1490        event: &EventRequestWillBeSent,
1491        interception_id: Option<InterceptionId>,
1492    ) {
1493        let mut redirect_chain = Vec::new();
1494        let mut redirect_location = None;
1495
1496        if let Some(redirect_resp) = &event.redirect_response {
1497            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1498                if is_redirect_status(redirect_resp.status) {
1499                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1500                        if redirect_resp.url != location {
1501                            let fixed_location = location.replace(&redirect_resp.url, "");
1502
1503                            if !fixed_location.is_empty() {
1504                                if let Some(resp) = request.response.as_mut() {
1505                                    resp.headers.0["Location"] =
1506                                        serde_json::Value::String(fixed_location.clone());
1507                                }
1508                            }
1509
1510                            redirect_location = Some(fixed_location);
1511                        }
1512                    }
1513                }
1514
1515                {
1516                    let mut redirect_resp = redirect_resp.clone();
1517
1518                    if let Some(redirect_location) = redirect_location {
1519                        if !redirect_location.is_empty() {
1520                            redirect_resp.headers.0["Location"] =
1521                                serde_json::Value::String(redirect_location);
1522                        }
1523                    }
1524
1525                    self.handle_request_redirect(&mut request, redirect_resp);
1526                }
1527
1528                redirect_chain = std::mem::take(&mut request.redirect_chain);
1529                redirect_chain.push(request);
1530            }
1531        }
1532
1533        let request = HttpRequest::new(
1534            event.request_id.clone(),
1535            event.frame_id.clone(),
1536            interception_id,
1537            self.user_request_interception_enabled,
1538            redirect_chain,
1539        );
1540
1541        self.requests.insert(event.request_id.clone(), request);
1542        self.queued_events
1543            .push_back(NetworkEvent::Request(event.request_id.clone()));
1544    }
1545
1546    /// Handle request redirect.
1547    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1548        request.set_response(response);
1549        if let Some(interception_id) = request.interception_id.as_ref() {
1550            self.attempted_authentications
1551                .remove(interception_id.as_ref());
1552        }
1553    }
1554}
1555
/// Events the network manager pushes onto its `queued_events` queue.
#[derive(Debug)]
pub enum NetworkEvent {
    /// Send a CDP request: the method identifier (for example `Fetch.fulfillRequest`)
    /// paired with a JSON value — presumably the serialized command params; confirm
    /// against the code that builds this variant.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request was registered; carries its id. Queued by `on_request` once the
    /// request has been stored in `requests`.
    Request(RequestId),
    /// Response; carries a request id (presumably the request the response belongs to).
    Response(RequestId),
    /// Request failed; carries the request itself.
    RequestFailed(HttpRequest),
    /// Request finished; carries the request itself.
    RequestFinished(HttpRequest),
    /// Bytes consumed.
    BytesConsumed(u64),
}
1571
1572#[cfg(test)]
1573mod tests {
1574    use super::ALLOWED_MATCHER_3RD_PARTY;
1575    use crate::handler::network::NetworkManager;
1576    use std::time::Duration;
1577
1578    #[test]
1579    fn test_allowed_matcher_3rd_party() {
1580        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
1581        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
1582        assert!(
1583            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
1584            "expected Cloudflare challenge script to be allowed"
1585        );
1586
1587        // Should NOT be allowed (not in allow-list)
1588        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
1589        assert!(
1590            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
1591            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
1592        );
1593
1594        // A couple sanity checks for existing allow patterns
1595        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
1596        assert!(ALLOWED_MATCHER_3RD_PARTY
1597            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
1598        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
1599    }
1600
1601    #[test]
1602    fn test_script_allowed_by_default_when_not_blocklisted() {
1603        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1604        nm.set_page_url(
1605            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
1606        );
1607
1608        // A random script that should not match your block tries.
1609        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
1610        assert!(
1611            !nm.should_block_script_blocklist_only(ok),
1612            "expected non-blocklisted script to be allowed"
1613        );
1614    }
1615
1616    #[test]
1617    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
1618        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1619        nm.set_page_url(
1620            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
1621        );
1622
1623        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
1624        let bad = "https://cdn.example.net/js/analytics.js";
1625        assert!(
1626            nm.should_block_script_blocklist_only(bad),
1627            "expected analytics.js to be blocklisted"
1628        );
1629    }
1630
1631    #[test]
1632    fn test_allowed_matcher_3rd_party_sanity() {
1633        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
1634        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
1635        assert!(
1636            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
1637            "expected Cloudflare challenge script to be allowed"
1638        );
1639
1640        // Should NOT be allowed (not in allow-list)
1641        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
1642        assert!(
1643            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
1644            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
1645        );
1646
1647        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
1648        assert!(ALLOWED_MATCHER_3RD_PARTY
1649            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
1650        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
1651    }
1652    #[test]
1653    fn test_dynamic_blacklist_blocks_url() {
1654        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1655        nm.set_page_url("https://example.com/".to_string());
1656
1657        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
1658        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
1659        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));
1660
1661        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
1662    }
1663
1664    #[test]
1665    fn test_blacklist_strict_wins_over_whitelist() {
1666        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1667        nm.set_page_url("https://example.com/".to_string());
1668
1669        // Same URL in both lists.
1670        nm.set_blacklist_patterns(["beacon.min.js"]);
1671        nm.set_whitelist_patterns(["beacon.min.js"]);
1672
1673        nm.set_blacklist_strict(true);
1674
1675        let u = "https://static.cloudflareinsights.com/beacon.min.js";
1676        assert!(nm.is_whitelisted(u));
1677        assert!(nm.is_blacklisted(u));
1678
1679        // In strict mode, it should still be considered blocked at decision time.
1680        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
1681        assert!(nm.blacklist_strict);
1682    }
1683
1684    #[cfg(feature = "adblock")]
1685    fn make_request_paused(
1686        url: &str,
1687        resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType,
1688        is_same_site: bool,
1689    ) -> chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused {
1690        use chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused;
1691        use chromiumoxide_cdp::cdp::browser_protocol::network::{
1692            Headers, Request, RequestReferrerPolicy, ResourcePriority,
1693        };
1694
1695        EventRequestPaused {
1696            request_id: chromiumoxide_cdp::cdp::browser_protocol::network::RequestId::from(
1697                "test-req".to_string(),
1698            )
1699            .into(),
1700            request: Request {
1701                url: url.to_string(),
1702                method: "GET".to_string(),
1703                headers: Headers::new(serde_json::Value::Object(Default::default())),
1704                initial_priority: ResourcePriority::Medium,
1705                referrer_policy: RequestReferrerPolicy::NoReferrer,
1706                url_fragment: None,
1707                has_post_data: None,
1708                post_data_entries: None,
1709                mixed_content_type: None,
1710                is_link_preload: None,
1711                trust_token_params: None,
1712                is_same_site: Some(is_same_site),
1713                is_ad_related: None,
1714            },
1715            frame_id: chromiumoxide_cdp::cdp::browser_protocol::page::FrameId::from(
1716                "frame1".to_string(),
1717            ),
1718            resource_type,
1719            response_error_reason: None,
1720            response_status_code: None,
1721            response_status_text: None,
1722            response_headers: None,
1723            network_id: None,
1724            redirected_request_id: None,
1725        }
1726    }
1727
1728    #[cfg(feature = "adblock")]
1729    #[test]
1730    fn test_detect_ad_blocks_known_tracker_scripts() {
1731        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1732
1733        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1734        nm.set_page_url("https://www.wine-searcher.com/".to_string());
1735
1736        let event = make_request_paused(
1737            "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
1738            ResourceType::Script,
1739            false,
1740        );
1741
1742        assert!(
1743            nm.detect_ad(&event),
1744            "googletagmanager.com script should be detected as ad"
1745        );
1746    }
1747
1748    #[cfg(feature = "adblock")]
1749    #[test]
1750    fn test_detect_ad_allows_legitimate_scripts() {
1751        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1752
1753        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1754        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
1755
1756        let event = make_request_paused(
1757            "https://www.mylegitsite-test.com/static/js/app-bundle.js",
1758            ResourceType::Script,
1759            true,
1760        );
1761
1762        assert!(
1763            !nm.detect_ad(&event),
1764            "legitimate first-party app bundle should not be blocked"
1765        );
1766    }
1767
1768    #[cfg(feature = "adblock")]
1769    #[test]
1770    fn test_detect_ad_uses_source_domain() {
1771        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1772
1773        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1774        nm.set_page_url("https://www.wine-searcher.com/some-page".to_string());
1775
1776        assert!(
1777            !nm.document_target_domain.is_empty(),
1778            "document_target_domain should be set after set_page_url"
1779        );
1780
1781        let event = make_request_paused(
1782            "https://www.google-analytics.com/analytics.js",
1783            ResourceType::Script,
1784            false,
1785        );
1786
1787        assert!(
1788            nm.detect_ad(&event),
1789            "google-analytics.com should be blocked as tracker"
1790        );
1791    }
1792
1793    #[cfg(feature = "adblock")]
1794    #[test]
1795    fn test_custom_adblock_engine_takes_precedence() {
1796        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1797
1798        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1799        nm.set_page_url("https://example.com/".to_string());
1800
1801        // Build a custom engine with a specific rule.
1802        let mut filter_set = adblock::lists::FilterSet::new(false);
1803        let mut opts = adblock::lists::ParseOptions::default();
1804        opts.rule_types = adblock::lists::RuleTypes::All;
1805        filter_set.add_filters(["||custom-tracker.example.net^"], opts);
1806        let engine = adblock::Engine::from_filter_set(filter_set, true);
1807        nm.set_adblock_engine(std::sync::Arc::new(engine));
1808
1809        let event = make_request_paused(
1810            "https://custom-tracker.example.net/pixel.js",
1811            ResourceType::Script,
1812            false,
1813        );
1814
1815        assert!(
1816            nm.detect_ad(&event),
1817            "custom engine rule should block custom-tracker.example.net"
1818        );
1819    }
1820
1821    /// Helper: run a URL through the full `on_fetch_request_paused` pipeline
1822    /// and return whether it was blocked (true) or allowed (false).
1823    #[cfg(feature = "adblock")]
1824    fn run_full_interception(
1825        nm: &mut NetworkManager,
1826        url: &str,
1827        resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType,
1828        is_same_site: bool,
1829    ) -> bool {
1830        use super::NetworkEvent;
1831
1832        // Drain any prior events.
1833        while nm.poll().is_some() {}
1834
1835        let event = make_request_paused(url, resource_type, is_same_site);
1836        nm.on_fetch_request_paused(&event);
1837
1838        // Check what was emitted: Fetch.fulfillRequest = blocked, Fetch.continueRequest = allowed.
1839        let mut blocked = false;
1840        while let Some(ev) = nm.poll() {
1841            if let NetworkEvent::SendCdpRequest((method, _)) = &ev {
1842                let m: &str = method.as_ref();
1843                if m == "Fetch.fulfillRequest" || m == "Fetch.failRequest" {
1844                    blocked = true;
1845                }
1846            }
1847        }
1848        blocked
1849    }
1850
1851    // ── End-to-end interception tests ───────────────────────────────────
1852
1853    #[cfg(feature = "adblock")]
1854    #[test]
1855    fn test_e2e_tracker_script_blocked() {
1856        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1857
1858        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1859        nm.set_page_url("https://www.wine-searcher.com/".to_string());
1860
1861        assert!(
1862            run_full_interception(
1863                &mut nm,
1864                "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
1865                ResourceType::Script,
1866                false,
1867            ),
1868            "GTM script should be blocked through full pipeline"
1869        );
1870    }
1871
1872    #[cfg(feature = "adblock")]
1873    #[test]
1874    fn test_e2e_legitimate_script_allowed() {
1875        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1876
1877        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1878        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
1879
1880        assert!(
1881            !run_full_interception(
1882                &mut nm,
1883                "https://www.mylegitsite-test.com/static/js/app-bundle.js",
1884                ResourceType::Script,
1885                true,
1886            ),
1887            "legitimate first-party script should be allowed through full pipeline"
1888        );
1889    }
1890
1891    #[cfg(feature = "adblock")]
1892    #[test]
1893    fn test_e2e_analytics_xhr_blocked() {
1894        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1895
1896        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1897        nm.set_page_url("https://example.org/".to_string());
1898
1899        assert!(
1900            run_full_interception(
1901                &mut nm,
1902                "https://www.google-analytics.com/g/collect?v=2&tid=UA-123",
1903                ResourceType::Xhr,
1904                false,
1905            ),
1906            "Google Analytics XHR should be blocked through full pipeline"
1907        );
1908    }
1909
1910    #[cfg(feature = "adblock")]
1911    #[test]
1912    fn test_e2e_whitelisted_overrides_adblock() {
1913        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1914
1915        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1916        nm.set_page_url("https://example.org/".to_string());
1917        nm.set_whitelist_patterns(["googletagmanager.com"]);
1918
1919        // GTM would normally be blocked by adblock, but whitelist overrides.
1920        assert!(
1921            !run_full_interception(
1922                &mut nm,
1923                "https://www.googletagmanager.com/gtm.js?id=GTM-TEST",
1924                ResourceType::Script,
1925                false,
1926            ),
1927            "whitelisted tracker should be allowed even when adblock would block it"
1928        );
1929    }
1930
1931    #[cfg(feature = "adblock")]
1932    #[test]
1933    fn test_e2e_blacklist_strict_overrides_whitelist() {
1934        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1935
1936        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1937        nm.set_page_url("https://example.org/".to_string());
1938        nm.set_blacklist_patterns(["cdn.example.net/evil.js"]);
1939        nm.set_whitelist_patterns(["cdn.example.net/evil.js"]);
1940        nm.set_blacklist_strict(true);
1941
1942        assert!(
1943            run_full_interception(
1944                &mut nm,
1945                "https://cdn.example.net/evil.js",
1946                ResourceType::Script,
1947                false,
1948            ),
1949            "strict blacklist should win over whitelist"
1950        );
1951    }
1952
1953    #[cfg(feature = "adblock")]
1954    #[test]
1955    fn test_e2e_first_party_document_not_blocked() {
1956        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1957
1958        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1959        nm.set_page_url("https://www.nytimes.com/".to_string());
1960
1961        assert!(
1962            !run_full_interception(
1963                &mut nm,
1964                "https://www.nytimes.com/2024/article.html",
1965                ResourceType::Document,
1966                true,
1967            ),
1968            "first-party document navigation should never be blocked"
1969        );
1970    }
1971
1972    #[cfg(feature = "adblock")]
1973    #[test]
1974    fn test_e2e_custom_engine_blocks_through_pipeline() {
1975        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1976
1977        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1978        nm.set_page_url("https://mysite.com/".to_string());
1979
1980        let mut filter_set = adblock::lists::FilterSet::new(false);
1981        let mut opts = adblock::lists::ParseOptions::default();
1982        opts.rule_types = adblock::lists::RuleTypes::All;
1983        filter_set.add_filters(["||evil-cdn.example.net^$script"], opts);
1984        let engine = adblock::Engine::from_filter_set(filter_set, true);
1985        nm.set_adblock_engine(std::sync::Arc::new(engine));
1986
1987        assert!(
1988            run_full_interception(
1989                &mut nm,
1990                "https://evil-cdn.example.net/tracker.js",
1991                ResourceType::Script,
1992                false,
1993            ),
1994            "custom engine rule should block through full pipeline"
1995        );
1996
1997        // Legitimate script on the same site should still pass.
1998        assert!(
1999            !run_full_interception(
2000                &mut nm,
2001                "https://mysite.com/app.js",
2002                ResourceType::Script,
2003                true,
2004            ),
2005            "first-party script should still be allowed with custom engine"
2006        );
2007    }
2008
2009    #[cfg(feature = "adblock")]
2010    #[test]
2011    fn test_e2e_ad_image_blocked() {
2012        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2013
2014        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2015        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
2016
2017        // Ad tracking pixel should be blocked via adblock pattern or trie.
2018        assert!(
2019            run_full_interception(
2020                &mut nm,
2021                "https://googleads.g.doubleclick.net/pagead/viewthroughconversion/123/?random=456",
2022                ResourceType::Image,
2023                false,
2024            ),
2025            "doubleclick ad image/tracking pixel should be blocked"
2026        );
2027
2028        // Legitimate first-party image should pass.
2029        assert!(
2030            !run_full_interception(
2031                &mut nm,
2032                "https://www.mylegitsite-test.com/images/logo.png",
2033                ResourceType::Image,
2034                true,
2035            ),
2036            "legitimate first-party image should not be blocked"
2037        );
2038    }
2039
2040    #[cfg(feature = "adblock")]
2041    #[test]
2042    fn test_e2e_hostname_with_userinfo() {
2043        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2044
2045        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2046        nm.set_page_url("https://example.org/".to_string());
2047
2048        // URL with userinfo should still correctly identify googletagmanager.com.
2049        assert!(
2050            run_full_interception(
2051                &mut nm,
2052                "https://user:pass@www.googletagmanager.com/gtm.js?id=GTM-XXXX",
2053                ResourceType::Script,
2054                false,
2055            ),
2056            "tracker URL with userinfo should still be blocked"
2057        );
2058    }
2059
2060    #[test]
2061    fn test_blacklist_non_strict_allows_whitelist_override() {
2062        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2063        nm.set_page_url("https://example.com/".to_string());
2064
2065        nm.set_blacklist_patterns(["beacon.min.js"]);
2066        nm.set_whitelist_patterns(["beacon.min.js"]);
2067
2068        nm.set_blacklist_strict(false);
2069
2070        let u = "https://static.cloudflareinsights.com/beacon.min.js";
2071        assert!(nm.is_blacklisted(u));
2072        assert!(nm.is_whitelisted(u));
2073        assert!(!nm.blacklist_strict);
2074    }
2075}