chromiumoxide/handler/
network.rs

1#[cfg(any(feature = "adblock", feature = "firewall"))]
2use super::blockers::block_websites::block_ads;
3use super::blockers::{
4    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
5    xhr::IGNORE_XHR_ASSETS,
6};
7use crate::auth::Credentials;
8#[cfg(feature = "_cache")]
9use crate::cache::BasicCachePolicy;
10use crate::cmd::CommandChain;
11use crate::handler::http::HttpRequest;
12use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
13use aho_corasick::AhoCorasick;
14use case_insensitive_string::CaseInsensitiveString;
15use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
16use chromiumoxide_cdp::cdp::browser_protocol::network::{
17    EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
18    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
19    InterceptionId, NetworkConditions, RequestId, ResourceType, Response, SetCacheDisabledParams,
20    SetExtraHttpHeadersParams,
21};
22use chromiumoxide_cdp::cdp::browser_protocol::{
23    fetch::{
24        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
25        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
26    },
27    network::SetBypassServiceWorkerParams,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::{
30    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
31};
32use chromiumoxide_types::{Command, Method, MethodId};
33use hashbrown::{HashMap, HashSet};
34use lazy_static::lazy_static;
35use reqwest::header::PROXY_AUTHORIZATION;
36use spider_network_blocker::intercept_manager::NetworkInterceptManager;
37pub use spider_network_blocker::scripts::{
38    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
39};
40use std::borrow::Cow;
41use std::collections::VecDeque;
42use std::time::{Duration, Instant};
43
lazy_static! {
    /// Substring patterns for popular libraries and resources: framework names,
    /// common bundle/file names, and captcha / payment / CDN origins.
    ///
    /// Compiled into [`ALLOWED_MATCHER`].
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",  // Covers Wordpress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/",
        "https://google.com/recaptcha/api.js",
        "https://www.gstatic.com/recaptcha/",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://api.leminnow.com/captcha/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://client-api.arkoselabs.com/",
        "https://www.capy.me/puzzle/",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];

    /// Multi-pattern matcher built from `JS_FRAMEWORK_ALLOW`; used to decide by
    /// name whether a script should be rendered in the browser.
    ///
    /// NOTE: with "allow all scripts unless blocklisted", this is not used as a gate anymore,
    /// but we keep it for compatibility and other call sites.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// Substring patterns for third-party script origins and paths (captcha,
    /// bot-challenge, payment, and CDN providers).
    ///
    /// Compiled into [`ALLOWED_MATCHER_3RD_PARTY`].
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.capy.me/puzzle/",
        "https://www.gstatic.com/recaptcha/",
        "https://google.com/recaptcha/",
        "https://www.google.com/recaptcha/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://newassets.hcaptcha.com/",
        // NOTE(review): duplicate of the `captcha.px-cloud.net` entry above.
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://api.leminnow.com/captcha/",
        "https://client-api.arkoselabs.com/",
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "discourse-cdn.com/",
        "hcaptcha.com",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];

    /// Multi-pattern matcher built from `JS_FRAMEWORK_ALLOW_3RD_PARTY`; used to
    /// decide by name whether a third-party script should be rendered in the browser.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path fragments that identify JS framework assets.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
            "_astro/", "_app/immutable"
        }
    };

    /// MIME content types to ignore (documents, archives, images, audio and video).
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names to ignore as visual content (images, media, fonts).
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names to ignore as networking-only traffic
    /// (CSP violation reports and pings).
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Ping",
    };

    /// Case-insensitive `css` extension used for stylesheet matching.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// The default init command chain: the network-domain enable command paired
    /// with its serialized parameters. Empty if serialization fails.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// The init command chain that additionally tells the browser to ignore
    /// certificate errors (`SetIgnoreCertificateErrorsParams::new(true)`).
    ///
    /// Despite the name, this concerns TLS certificate errors, not HTTP status
    /// errors. A command whose parameters fail to serialize is left out.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// Parameters for enabling fetch interception: auth requests are handled and
    /// a single pattern matches every URL (`*`) at the `Request` stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
        .build()
    };
}
229
/// Report whether `status` is one of the HTTP redirect codes this module
/// follows: 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
234
/// Maximum age, in seconds, of a buffered `requests_will_be_sent` /
/// `request_id_to_interception_id` entry before `evict_stale_entries` drops it.
/// 30 seconds is generous: the CDP round-trip that reconciles the two racing
/// events normally completes in milliseconds.
const STALE_BUFFER_SECS: u64 = 30;

/// Maximum age, in seconds, of an in-flight request entry (`requests` map) that
/// has not been resolved by a `loadingFinished` / `loadingFailed` / `loadingCanceled`
/// event before `evict_stale_entries` considers it orphaned and drops it.
/// Longer than the race-condition buffer timeout because real requests can
/// legitimately take tens of seconds (streaming, slow origins, etc.).
const STALE_REQUEST_SECS: u64 = 120;
247
/// Newtype around a shared (`Arc`) `adblock::Engine`.
///
/// It exists so the engine can be stored in a struct that derives `Debug`;
/// the wrapper supplies its own `Debug` implementation.
#[cfg(feature = "adblock")]
pub struct AdblockEngine(std::sync::Arc<adblock::Engine>);
251
#[cfg(feature = "adblock")]
impl std::fmt::Debug for AdblockEngine {
    /// Print only the type name; the wrapped engine's contents are not shown.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("AdblockEngine")
    }
}
258
#[cfg(feature = "adblock")]
impl std::ops::Deref for AdblockEngine {
    type Target = adblock::Engine;

    /// Borrow the wrapped engine so its methods can be called on the wrapper.
    fn deref(&self) -> &Self::Target {
        self.0.as_ref()
    }
}
266
/// The base network manager.
///
/// Holds per-target request bookkeeping, interception state, blocking options,
/// and the queue of `NetworkEvent`s that consumers drain via `poll()`.
#[derive(Debug)]
pub struct NetworkManager {
    /// FIFO queue of internal `NetworkEvent`s emitted by the manager.
    ///
    /// The manager pushes events here as CDP commands are scheduled (e.g. `SendCdpRequest`)
    /// and as request lifecycle transitions occur (`RequestFinished`, `RequestFailed`, etc.).
    /// Consumers pull from this queue via `poll()`.
    queued_events: VecDeque<NetworkEvent>,
    /// If `true`, the init command chain includes `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// This is used to allow navigation / resource loading to proceed on sites with invalid TLS
    /// certificates (self-signed, expired, MITM proxies, etc.).
    ignore_httpserrors: bool,
    /// Active in-flight requests keyed by CDP `RequestId`.
    ///
    /// Each entry tracks request/response metadata, redirect chain, optional interception id,
    /// and final state used to emit `RequestFinished` / `RequestFailed`.
    /// Entries older than `STALE_REQUEST_SECS` are evicted by `evict_stale_entries()`.
    requests: HashMap<RequestId, HttpRequest>,
    /// Temporary storage for `Network.requestWillBeSent` events when the corresponding
    /// `Fetch.requestPaused` arrives later (or vice versa).
    ///
    /// When Fetch interception is enabled, `requestPaused` and `requestWillBeSent` can race.
    /// We buffer `requestWillBeSent` here until we can attach the `InterceptionId`.
    /// Entries older than `STALE_BUFFER_SECS` are evicted to prevent unbounded growth.
    requests_will_be_sent: HashMap<RequestId, (EventRequestWillBeSent, Instant)>,
    /// Extra HTTP headers to apply to subsequent network requests via CDP.
    ///
    /// This map is mirrored from user-supplied headers but stripped of proxy auth headers
    /// (`Proxy-Authorization`) to avoid accidental leakage / incorrect forwarding.
    extra_headers: std::collections::HashMap<String, String>,
    /// Mapping from Network `RequestId` to Fetch `InterceptionId`.
    ///
    /// When `Fetch.requestPaused` fires before `Network.requestWillBeSent`, we temporarily
    /// store the interception id here so it can be attached to the `HttpRequest` once the
    /// network request is observed.
    /// Entries older than `STALE_BUFFER_SECS` are evicted to prevent unbounded growth.
    request_id_to_interception_id: HashMap<RequestId, (InterceptionId, Instant)>,
    /// Whether the user has disabled the browser cache.
    ///
    /// This is surfaced via `Network.setCacheDisabled(true/false)` and toggled through
    /// `set_cache_enabled()`. Internally the field is stored as "disabled" to match the CDP API.
    user_cache_disabled: bool,
    /// Tracks which requests have already attempted authentication.
    ///
    /// Used to prevent infinite auth retry loops when the origin repeatedly issues
    /// authentication challenges (407/401). Once a request id is present here, subsequent
    /// challenges for the same request are canceled.
    ///
    /// NOTE(review): `evict_stale_entries()` compares these ids against the
    /// `interception_id`s of live requests; confirm which id space is stored here.
    attempted_authentications: HashSet<RequestId>,
    /// Optional credentials used to respond to `Fetch.authRequired` challenges.
    ///
    /// When set, the manager will answer challenges with `ProvideCredentials` once per request
    /// (guarded by `attempted_authentications`), otherwise it falls back to default handling.
    credentials: Option<Credentials>,
    /// User-facing toggle indicating whether request interception is desired.
    ///
    /// This is the "intent" flag controlled by `set_request_interception()`. On its own it does
    /// not guarantee interception is active; interception is actually enabled/disabled by
    /// `update_protocol_request_interception()` which reconciles this flag with `credentials`.
    ///
    /// In other words: if this is `false` but `credentials.is_some()`, interception may still be
    /// enabled to satisfy auth challenges.
    pub(crate) user_request_interception_enabled: bool,
    /// Hard kill-switch to block all network traffic.
    ///
    /// When `true`, the manager immediately blocks requests (typically via
    /// `FailRequest(BlockedByClient)` or fulfillment with an empty response depending on path),
    /// and short-circuits most decision logic. This is used for safety conditions such as
    /// exceeding `max_bytes_allowed` or other runtime protections.
    block_all: bool,
    /// Tracks whether the Fetch interception protocol is considered enabled in CDP.
    ///
    /// `update_protocol_request_interception()` reads this flag to decide whether a
    /// `Fetch.enable` / `Fetch.disable` command needs to be queued, but does not write it.
    /// In the part of the file reviewed here it is written only by
    /// `enable_request_intercept()`, `disable_request_intercept()` and `authenticate()`.
    pub(crate) protocol_request_interception_enabled: bool,
    /// The network is offline.
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    /// Ignore visuals (no pings, prefetching, etc.).
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    ///
    /// NOTE: With "allow all scripts unless blocklisted", this no longer blocks scripts
    /// by itself (it remains for config compatibility).
    pub block_javascript: bool,
    /// Block analytics from rendering.
    pub block_analytics: bool,
    /// Block pre-fetch requests.
    pub block_prefetch: bool,
    /// Load only HTML.
    pub only_html: bool,
    /// Is xml document?
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the amount of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_url: String,
    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    /// Cap on main-frame Document redirect hops before the navigation is aborted.
    ///
    /// `None` disables enforcement (default, preserves prior behavior). When `Some(n)`,
    /// the (n+1)th Document redirect short-circuits: a synthetic `RequestFailed` event
    /// is emitted with `failure_text = "net::ERR_TOO_MANY_REDIRECTS"` and
    /// `Page.stopLoading` is dispatched to abort in-flight navigation. The accumulated
    /// `redirect_chain` is preserved on the failed request so consumers can inspect it.
    pub max_redirects: Option<usize>,
    /// The cache site_key to use.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
    whitelist_patterns: Vec<String>,
    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
    whitelist_matcher: Option<AhoCorasick>,
    /// Optional per-run/per-site blacklist of URL substrings (scripts/resources).
    blacklist_patterns: Vec<String>,
    /// Compiled matcher for blacklist_patterns (rebuilt when patterns change).
    blacklist_matcher: Option<AhoCorasick>,
    /// If true, blacklist always wins (cannot be unblocked by whitelist/3p allow).
    blacklist_strict: bool,
    /// Custom adblock engine built from user-supplied filter rules.
    /// When `Some`, takes precedence over the global default engine.
    #[cfg(feature = "adblock")]
    adblock_engine: Option<AdblockEngine>,
}
404
405impl NetworkManager {
406    /// A new network manager.
407    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
408        Self {
409            queued_events: Default::default(),
410            ignore_httpserrors,
411            requests: Default::default(),
412            requests_will_be_sent: Default::default(),
413            extra_headers: Default::default(),
414            request_id_to_interception_id: Default::default(),
415            user_cache_disabled: false,
416            attempted_authentications: Default::default(),
417            credentials: None,
418            block_all: false,
419            user_request_interception_enabled: false,
420            protocol_request_interception_enabled: false,
421            offline: false,
422            request_timeout,
423            ignore_visuals: false,
424            block_javascript: false,
425            block_stylesheets: false,
426            block_prefetch: true,
427            block_analytics: true,
428            only_html: false,
429            xml_document: false,
430            intercept_manager: NetworkInterceptManager::Unknown,
431            document_reload_tracker: 0,
432            document_target_url: String::new(),
433            document_target_domain: String::new(),
434            whitelist_patterns: Vec::new(),
435            whitelist_matcher: None,
436            blacklist_patterns: Vec::new(),
437            blacklist_matcher: None,
438            blacklist_strict: true,
439            max_bytes_allowed: None,
440            max_redirects: None,
441            #[cfg(feature = "_cache")]
442            cache_site_key: None,
443            #[cfg(feature = "_cache")]
444            cache_policy: None,
445            #[cfg(feature = "adblock")]
446            adblock_engine: None,
447        }
448    }
449
450    /// Set a custom adblock engine built from user-supplied filter rules.
451    #[cfg(feature = "adblock")]
452    pub fn set_adblock_engine(&mut self, engine: std::sync::Arc<adblock::Engine>) {
453        self.adblock_engine = Some(AdblockEngine(engine));
454    }
455
456    /// Replace the whitelist patterns (compiled once).
457    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
458    where
459        I: IntoIterator<Item = S>,
460        S: Into<String>,
461    {
462        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
463        self.rebuild_whitelist_matcher();
464    }
465
466    /// Replace the blacklist patterns (compiled once).
467    pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
468    where
469        I: IntoIterator<Item = S>,
470        S: Into<String>,
471    {
472        self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
473        self.rebuild_blacklist_matcher();
474    }
475
476    /// Add one pattern (cheap) and rebuild (call this sparingly).
477    pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
478        self.blacklist_patterns.push(pattern.into());
479        self.rebuild_blacklist_matcher();
480    }
481
482    /// Add many patterns and rebuild once.
483    pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
484    where
485        I: IntoIterator<Item = S>,
486        S: Into<String>,
487    {
488        self.blacklist_patterns
489            .extend(patterns.into_iter().map(Into::into));
490        self.rebuild_blacklist_matcher();
491    }
492
493    /// Clear blacklist entirely.
494    pub fn clear_blacklist(&mut self) {
495        self.blacklist_patterns.clear();
496        self.blacklist_matcher = None;
497    }
498
499    /// Control precedence: when true, blacklist always wins.
500    pub fn set_blacklist_strict(&mut self, strict: bool) {
501        self.blacklist_strict = strict;
502    }
503
504    #[inline]
505    fn rebuild_blacklist_matcher(&mut self) {
506        if self.blacklist_patterns.is_empty() {
507            self.blacklist_matcher = None;
508            return;
509        }
510
511        self.blacklist_matcher =
512            AhoCorasick::new(self.blacklist_patterns.iter().map(|s| s.as_str())).ok();
513    }
514
515    #[inline]
516    fn is_blacklisted(&self, url: &str) -> bool {
517        self.blacklist_matcher
518            .as_ref()
519            .map(|m| m.is_match(url))
520            .unwrap_or(false)
521    }
522
523    /// Add one pattern (cheap) and rebuild (call this sparingly).
524    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
525        self.whitelist_patterns.push(pattern.into());
526        self.rebuild_whitelist_matcher();
527    }
528
529    /// Add many patterns and rebuild once.
530    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
531    where
532        I: IntoIterator<Item = S>,
533        S: Into<String>,
534    {
535        self.whitelist_patterns
536            .extend(patterns.into_iter().map(Into::into));
537        self.rebuild_whitelist_matcher();
538    }
539
540    #[inline]
541    fn rebuild_whitelist_matcher(&mut self) {
542        if self.whitelist_patterns.is_empty() {
543            self.whitelist_matcher = None;
544            return;
545        }
546
547        // If building fails (shouldn’t for simple patterns), just disable matcher.
548        self.whitelist_matcher =
549            AhoCorasick::new(self.whitelist_patterns.iter().map(|s| s.as_str())).ok();
550    }
551
552    #[inline]
553    fn is_whitelisted(&self, url: &str) -> bool {
554        self.whitelist_matcher
555            .as_ref()
556            .map(|m| m.is_match(url))
557            .unwrap_or(false)
558    }
559
560    /// Commands to init the chain with.
561    pub fn init_commands(&self) -> CommandChain {
562        let cmds = if self.ignore_httpserrors {
563            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
564        } else {
565            INIT_CHAIN.clone()
566        };
567        CommandChain::new(cmds, self.request_timeout)
568    }
569
570    /// Push the CDP request.
571    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
572        let method = cmd.identifier();
573        if let Ok(params) = serde_json::to_value(cmd) {
574            self.queued_events
575                .push_back(NetworkEvent::SendCdpRequest((method, params)));
576        }
577    }
578
579    /// The next event to handle.
580    pub fn poll(&mut self) -> Option<NetworkEvent> {
581        self.queued_events.pop_front()
582    }
583
584    /// Evict stale entries from the race-condition buffers and from
585    /// `attempted_authentications`. Call this periodically (e.g. from the
586    /// handler's eviction tick) so that lost CDP events cannot cause unbounded
587    /// map growth.
588    pub fn evict_stale_entries(&mut self, now: Instant) {
589        let cutoff = now - Duration::from_secs(STALE_BUFFER_SECS);
590
591        self.requests_will_be_sent.retain(|_, (_, ts)| *ts > cutoff);
592        self.request_id_to_interception_id
593            .retain(|_, (_, ts)| *ts > cutoff);
594
595        // Evict orphaned in-flight requests whose completion events
596        // (`loadingFinished` / `loadingFailed` / `loadingCanceled`) were
597        // never received.  Uses a longer timeout than the race-condition
598        // buffers since real requests can legitimately be long-lived.
599        let request_cutoff = now - Duration::from_secs(STALE_REQUEST_SECS);
600        self.requests
601            .retain(|_, req| req.created_at > request_cutoff);
602
603        // `attempted_authentications` entries reference interception IDs that
604        // are cleaned up on loading-finished / loading-failed. If those events
605        // are lost, the set grows forever. Cross-reference with `requests`:
606        // any interception ID that no longer appears in a live request is stale.
607        if !self.attempted_authentications.is_empty() {
608            let live: HashSet<&str> = self
609                .requests
610                .values()
611                .filter_map(|r| r.interception_id.as_ref().map(|id| id.as_ref()))
612                .collect();
613            self.attempted_authentications
614                .retain(|id| live.contains(id.as_ref()));
615        }
616    }
617
618    /// Get the extra headers.
619    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
620        &self.extra_headers
621    }
622
623    /// Set extra HTTP headers.
624    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
625        self.extra_headers = headers;
626        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
627        self.extra_headers.remove("Proxy-Authorization");
628        if !self.extra_headers.is_empty() {
629            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
630                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
631            }
632        }
633    }
634
635    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
636        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
637    }
638
639    pub fn set_block_all(&mut self, block_all: bool) {
640        self.block_all = block_all;
641    }
642
643    pub fn set_request_interception(&mut self, enabled: bool) {
644        self.user_request_interception_enabled = enabled;
645        self.update_protocol_request_interception();
646    }
647
648    pub fn set_cache_enabled(&mut self, enabled: bool) {
649        let run = self.user_cache_disabled == enabled;
650        self.user_cache_disabled = !enabled;
651        if run {
652            self.update_protocol_cache_disabled();
653        }
654    }
655
656    /// Enable fetch interception.
657    pub fn enable_request_intercept(&mut self) {
658        self.protocol_request_interception_enabled = true;
659    }
660
661    /// Disable fetch interception.
662    pub fn disable_request_intercept(&mut self) {
663        self.protocol_request_interception_enabled = false;
664    }
665
666    /// Set the cache site key.
667    #[cfg(feature = "_cache")]
668    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
669        self.cache_site_key = cache_site_key;
670    }
671
672    /// Set the cache policy.
673    #[cfg(feature = "_cache")]
674    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
675        self.cache_policy = cache_policy;
676    }
677
678    pub fn update_protocol_cache_disabled(&mut self) {
679        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
680    }
681
682    pub fn authenticate(&mut self, credentials: Credentials) {
683        self.credentials = Some(credentials);
684        self.update_protocol_request_interception();
685        self.protocol_request_interception_enabled = true;
686    }
687
688    fn update_protocol_request_interception(&mut self) {
689        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
690
691        if enabled == self.protocol_request_interception_enabled {
692            return;
693        }
694
695        if enabled {
696            self.push_cdp_request(ENABLE_FETCH.clone())
697        } else {
698            self.push_cdp_request(DisableParams::default())
699        }
700    }
701
702    /// Blocklist-only script blocking.
703    /// Returns true only when the URL matches an explicit blocklist condition.
704    #[inline]
705    fn should_block_script_blocklist_only(&self, url: &str) -> bool {
706        // If analytics blocking is off, skip all analytics tries.
707        let block_analytics = self.block_analytics;
708
709        // 1) Explicit full-URL prefix trie (some rules are full URL prefixes).
710        if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
711        {
712            return true;
713        }
714
715        // 2) Custom website block list (explicit).
716        if crate::handler::blockers::block_websites::block_website(url) {
717            return true;
718        }
719
720        // 3) Path-based explicit tries / fallbacks.
721        //
722        // We run these on:
723        // - path with leading slash ("/js/app.js")
724        // - path without leading slash ("js/app.js")
725        // - basename ("app.js") for filename-only rules (this is the fast "analytics.js" fallback)
726        if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
727            // Remove query/fragment so matching stays stable.
728            let p_slash = Self::strip_query_fragment(path_with_slash);
729            let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
730
731            // Basename for filename-only lists.
732            let base = match p_slash.rsplit('/').next() {
733                Some(b) => b,
734                None => p_slash,
735            };
736
737            // ---- Trie checks ----
738            // Some tries store prefixes like "/cdn-cgi/..." (leading slash) OR "cdn-cgi/..." (no slash).
739            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
740                return true;
741            }
742            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
743                return true;
744            }
745            if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
746                return true;
747            }
748
749            // Base-path ignore tries (framework noise / known ignorable script paths).
750            // Note: these are explicit tries, so they are valid “blocklist-only” checks.
751            if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
752                return true;
753            }
754
755            // Style path ignores only when visuals are ignored.
756            if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
757                return true;
758            }
759        }
760
761        false
762    }
763
764    /// Extract the absolute URL path portion WITH the leading slash.
765    ///
766    /// Example:
767    /// - "https://cdn.example.net/js/app.js?x=y" -> Some("/js/app.js?x=y")
768    #[inline]
769    fn url_path_with_leading_slash(url: &str) -> Option<&str> {
770        // find scheme separator
771        let bytes = url.as_bytes();
772        let idx = memchr::memmem::find(bytes, b"//")?;
773        let after_slashes = idx + 2;
774
775        // find first slash after host
776        let slash_rel = memchr::memchr(b'/', &bytes[after_slashes..])?;
777        let slash_idx = after_slashes + slash_rel;
778
779        if slash_idx < url.len() {
780            Some(&url[slash_idx..])
781        } else {
782            None
783        }
784    }
785
786    /// Strip query string and fragment from a path-ish string.
787    ///
788    /// Example:
789    /// - "/a/b.js?x=1#y" -> "/a/b.js"
790    #[inline]
791    fn strip_query_fragment(s: &str) -> &str {
792        match memchr::memchr2(b'?', b'#', s.as_bytes()) {
793            Some(i) => &s[..i],
794            None => s,
795        }
796    }
797
798    /// Determine if the request should be skipped.
799    #[inline]
800    fn skip_xhr(
801        &self,
802        skip_networking: bool,
803        event: &EventRequestPaused,
804        network_event: bool,
805    ) -> bool {
806        // XHR check
807        if !skip_networking && network_event {
808            let request_url = event.request.url.as_str();
809
810            // check if part of ignore scripts.
811            let skip_analytics =
812                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
813
814            if skip_analytics {
815                true
816            } else if self.block_stylesheets || self.ignore_visuals {
817                let block_css = self.block_stylesheets;
818                let block_media = self.ignore_visuals;
819
820                let mut block_request = false;
821
822                if let Some(position) = memchr::memrchr(b'.', request_url.as_bytes()) {
823                    let hlen = request_url.len();
824                    let has_asset = hlen - position;
825
826                    if has_asset >= 3 {
827                        let next_position = position + 1;
828
829                        if block_media
830                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
831                                &request_url[next_position..].into(),
832                            )
833                        {
834                            block_request = true;
835                        } else if block_css {
836                            block_request = CaseInsensitiveString::from(
837                                &request_url.as_bytes()[next_position..],
838                            )
839                            .contains(&**CSS_EXTENSION)
840                        }
841                    }
842                }
843
844                if !block_request {
845                    block_request = ignore_script_xhr_media(request_url);
846                }
847
848                block_request
849            } else {
850                skip_networking
851            }
852        } else {
853            skip_networking
854        }
855    }
856
857    #[cfg(feature = "adblock")]
858    #[inline]
859    /// Detect if ad enabled.
860    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
861        if skip_networking {
862            true
863        } else {
864            block_ads(&event.request.url) || self.detect_ad(event)
865        }
866    }
867
868    /// When adblock feature is disabled, this is a no-op.
869    #[cfg(not(feature = "adblock"))]
870    #[inline]
871    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
872        use crate::handler::blockers::block_websites::block_ads;
873        if skip_networking {
874            true
875        } else {
876            block_ads(&event.request.url)
877        }
878    }
879
880    #[inline]
881    /// Fail request
882    fn fail_request_blocked(
883        &mut self,
884        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
885    ) {
886        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
887            request_id.clone(),
888            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
889        );
890        self.push_cdp_request(params);
891    }
892
893    #[inline]
894    /// Fulfill request
895    fn fulfill_request_empty_200(
896        &mut self,
897        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
898    ) {
899        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
900            request_id.clone(),
901            200,
902        );
903        self.push_cdp_request(params);
904    }
905
906    #[cfg(feature = "_cache")]
907    #[inline]
908    /// Fulfill a paused Fetch request from cached bytes + header map.
909    ///
910    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
911    fn fulfill_request_from_cache(
912        &mut self,
913        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
914        body: &[u8],
915        headers: &std::collections::HashMap<String, String>,
916        status: i64,
917    ) {
918        use crate::cdp::browser_protocol::fetch::HeaderEntry;
919        use crate::handler::network::fetch::FulfillRequestParams;
920        use base64::Engine;
921
922        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
923
924        for (k, v) in headers.iter() {
925            resp_headers.push(HeaderEntry {
926                name: k.clone().into(),
927                value: v.clone().into(),
928            });
929        }
930
931        let mut params = FulfillRequestParams::new(request_id.clone(), status);
932
933        // TODO: have this already encoded prior.
934        params.body = Some(
935            base64::engine::general_purpose::STANDARD
936                .encode(body)
937                .into(),
938        );
939
940        params.response_headers = Some(resp_headers);
941
942        self.push_cdp_request(params);
943    }
944
945    #[inline]
946    /// Continue the request url.
947    fn continue_request_with_url(
948        &mut self,
949        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
950        url: Option<&str>,
951        intercept_response: bool,
952    ) {
953        let mut params = ContinueRequestParams::new(request_id.clone());
954        if let Some(url) = url {
955            params.url = Some(url.to_string());
956            params.intercept_response = Some(intercept_response);
957        }
958        self.push_cdp_request(params);
959    }
960
    /// On fetch request paused interception.
    ///
    /// Decides what happens to a request paused by the Fetch domain and queues exactly
    /// one answer for it (unless user interception is active, in which case nothing is
    /// queued here):
    /// - `block_all` set: the request is failed via `fail_request_blocked`.
    /// - The decision chain below ends with `skip_networking == true`: the request is
    ///   answered with an empty 200 via `fulfill_request_empty_200`.
    /// - Otherwise (with the `_cache` feature) a matching session-cache entry allowed by
    ///   the cache policy is served via `fulfill_request_from_cache`.
    /// - Otherwise the request is continued via `continue_request_with_url`.
    ///
    /// Along the way the paused request is paired with its `requestWillBeSent` event
    /// (or parked until that event arrives), and document tracking state is updated by
    /// `handle_document_replacement_and_tracking`.
    #[inline]
    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
        // Nothing is decided here while user interception is active on top of protocol
        // interception.
        // NOTE(review): presumably the paused request is then resolved by the user-facing
        // interception path — confirm.
        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
            return;
        }

        // Hard stop: everything is failed (not fulfilled) once `block_all` is set.
        if self.block_all {
            tracing::debug!(
                "Blocked (block_all): {:?} - {}",
                event.resource_type,
                event.request.url
            );
            return self.fail_request_blocked(&event.request_id);
        }

        // Pair this paused request with its `requestWillBeSent` event. If that event was
        // already seen, process the request now; otherwise remember the interception id
        // (with a timestamp) so `on_request_will_be_sent` can pick it up later.
        if let Some(network_id) = event.network_id.as_ref() {
            if let Some((request_will_be_sent, _)) =
                self.requests_will_be_sent.remove(network_id.as_ref())
            {
                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
            } else {
                self.request_id_to_interception_id.insert(
                    network_id.clone(),
                    (event.request_id.clone().into(), Instant::now()),
                );
            }
        }

        // From here on, we handle the full decision tree.
        let javascript_resource = event.resource_type == ResourceType::Script;
        let document_resource = event.resource_type == ResourceType::Document;
        let network_resource =
            !document_resource && crate::utils::is_data_resource(&event.resource_type);

        // Start with static / cheap skip checks.
        // NOTE(review): `self.block_all` is always false here because of the early return
        // above, so only the resource-type map decides.
        let mut skip_networking =
            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());

        // NOTE(review): as written, prefetch requests are skipped when `block_prefetch`
        // is NOT set. That reads inverted relative to the flag's name — confirm intent.
        if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
            skip_networking = true;
        }

        // Also short-circuit if we've reloaded this document too many times.
        if !skip_networking {
            skip_networking = self.document_reload_tracker >= 3;
        }

        // Handle document redirect / masking and track xml documents.
        // `current_url_cow` is the replacement URL when one was computed, otherwise the
        // request URL; `had_replacer` tells the two cases apart.
        let (current_url_cow, had_replacer) =
            self.handle_document_replacement_and_tracking(event, document_resource);

        let current_url: &str = current_url_cow.as_ref();

        let blacklisted = self.is_blacklisted(current_url);

        // Non-strict blacklist: applied now, so the allow-lists further down can still
        // lift it. The strict variant is enforced last (see below).
        if !self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if !skip_networking {
            // Allow XSL for sitemap XML.
            // (The assignment is a no-op — `skip_networking` is already false in this
            // branch; its effect is to bypass the visuals/basic check for `.xsl`.)
            if self.xml_document && current_url.ends_with(".xsl") {
                skip_networking = false;
            } else {
                skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
            }
        }

        // Ad checks; an already-skipped request stays skipped.
        skip_networking = self.detect_ad_if_enabled(event, skip_networking);

        // Ignore embedded scripts, tracker stylesheets, and tracker images when only_html or ignore_visuals is set.
        // This only runs when `block_javascript` is set as well.
        if !skip_networking
            && self.block_javascript
            && (self.only_html || self.ignore_visuals)
            && (javascript_resource
                || document_resource
                || event.resource_type == ResourceType::Stylesheet
                || event.resource_type == ResourceType::Image)
        {
            skip_networking = ignore_script_embedded(current_url);
        }

        // Script policy: allow-by-default.
        // Block only if explicit block list patterns match.
        if !skip_networking && javascript_resource {
            skip_networking = self.should_block_script_blocklist_only(current_url);
        }

        // XHR / data resources.
        skip_networking = self.skip_xhr(skip_networking, event, network_resource);

        // Custom interception layer.
        if !skip_networking && (javascript_resource || network_resource || document_resource) {
            skip_networking = self.intercept_manager.intercept_detection(
                current_url,
                self.ignore_visuals,
                network_resource,
            );
        }

        // Custom website block list.
        if !skip_networking && (javascript_resource || network_resource) {
            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
        }

        // whitelist 3rd party
        // not required unless explicit blocking.
        // A script that matches the allowed third-party matcher is un-skipped again.
        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
        {
            skip_networking = false;
        }

        // check if the url is in the whitelist.
        if skip_networking && self.is_whitelisted(current_url) {
            skip_networking = false;
        }

        // Strict blacklist is enforced after the allow-lists, so it overrides them.
        if self.blacklist_strict && blacklisted {
            skip_networking = true;
        }

        if skip_networking {
            // Blocked requests are answered with an empty 200 rather than failed.
            tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
            self.fulfill_request_empty_200(&event.request_id);
        } else {
            #[cfg(feature = "_cache")]
            {
                // Serve from the session cache when both a policy and a site key are set.
                if let (Some(policy), Some(cache_site_key)) =
                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
                {
                    // Lookup key is "<method>:<url>"; this shadows `current_url` inside
                    // the cache block only.
                    let current_url = format!("{}:{}", event.request.method, &current_url);

                    if let Some((res, cache_policy)) =
                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
                    {
                        if policy.allows_cached(&cache_policy) {
                            tracing::debug!(
                                "Remote Cached: {:?} - {}",
                                &event.resource_type,
                                &current_url
                            );
                            let flat_headers = crate::http::headers_from_multi(&res.headers);
                            return self.fulfill_request_from_cache(
                                &event.request_id,
                                &res.body,
                                &flat_headers,
                                res.status as i64,
                            );
                        }
                    }
                }
            }

            // No block and no cache hit: let the request continue. The URL override is
            // sent only when a replacement was computed; `continue_request_with_url`
            // applies `intercept_response` only together with an override, so it is
            // effectively sent as `false` or not at all.
            tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
            self.continue_request_with_url(
                &event.request_id,
                if had_replacer {
                    Some(current_url)
                } else {
                    None
                },
                !had_replacer,
            );
        }
    }
1128
1129    /// Shared "visuals + basic blocking" logic.
1130    ///
1131    /// IMPORTANT: Scripts are NOT blocked here anymore.
1132    /// Scripts are allowed by default and only blocked via explicit blocklists
1133    /// (should_block_script_blocklist_only / adblock / block_websites / intercept_manager).
1134    #[inline]
1135    fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1136        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1137            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1138    }
1139
1140    /// Does the network manager have a target domain?
1141    pub fn has_target_domain(&self) -> bool {
1142        !self.document_target_url.is_empty()
1143    }
1144
1145    /// Set the target page url for tracking.
1146    pub fn set_page_url(&mut self, page_target_url: String) {
1147        let host_base = host_and_rest(&page_target_url)
1148            .map(|(h, _)| base_domain_from_host(h))
1149            .unwrap_or("");
1150
1151        self.document_target_domain = host_base.to_string();
1152        self.document_target_url = page_target_url;
1153    }
1154
1155    /// Clear the initial target domain on every navigation.
1156    pub fn clear_target_domain(&mut self) {
1157        self.document_reload_tracker = 0;
1158        self.document_target_url = Default::default();
1159        self.document_target_domain = Default::default();
1160    }
1161
1162    /// Handles:
1163    /// - document reload tracking (`document_reload_tracker`)
1164    /// - redirect masking / replacement
1165    /// - xml document detection (`xml_document`)
1166    /// - `document_target_url` updates
1167    ///
1168    /// Returns (current_url, had_replacer).
1169    #[inline]
1170    fn handle_document_replacement_and_tracking<'a>(
1171        &mut self,
1172        event: &'a EventRequestPaused,
1173        document_resource: bool,
1174    ) -> (Cow<'a, str>, bool) {
1175        let mut replacer: Option<String> = None;
1176        let current_url = event.request.url.as_str();
1177
1178        if document_resource {
1179            if self.document_target_url == current_url {
1180                self.document_reload_tracker += 1;
1181            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1182            {
1183                let (http_document_replacement, mut https_document_replacement) =
1184                    if self.document_target_url.starts_with("http://") {
1185                        (
1186                            self.document_target_url.replacen("http://", "http//", 1),
1187                            self.document_target_url.replacen("http://", "https://", 1),
1188                        )
1189                    } else {
1190                        (
1191                            self.document_target_url.replacen("https://", "https//", 1),
1192                            self.document_target_url.replacen("https://", "http://", 1),
1193                        )
1194                    };
1195
1196                // Track trailing slash to restore later.
1197                let trailing = https_document_replacement.ends_with('/');
1198                if trailing {
1199                    https_document_replacement.pop();
1200                }
1201                if https_document_replacement.ends_with('/') {
1202                    https_document_replacement.pop();
1203                }
1204
1205                let redirect_mask = format!(
1206                    "{}{}",
1207                    https_document_replacement, http_document_replacement
1208                );
1209
1210                if current_url == redirect_mask {
1211                    replacer = Some(if trailing {
1212                        format!("{}/", https_document_replacement)
1213                    } else {
1214                        https_document_replacement
1215                    });
1216                }
1217            }
1218
1219            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1220                self.xml_document = true;
1221            }
1222
1223            // Track last seen document URL.
1224            self.document_target_url = event.request.url.clone();
1225            self.document_target_domain = host_and_rest(&self.document_target_url)
1226                .map(|(h, _)| base_domain_from_host(h).to_string())
1227                .unwrap_or_default();
1228        }
1229
1230        let current_url_cow = match replacer {
1231            Some(r) => Cow::Owned(r),
1232            None => Cow::Borrowed(event.request.url.as_str()),
1233        };
1234
1235        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1236        (current_url_cow, had_replacer)
1237    }
1238
1239    /// Perform a page intercept for chrome using the adblock engine.
1240    /// Uses the custom engine when user-supplied filter rules are configured,
1241    /// otherwise falls back to the global default engine with built-in patterns.
1242    #[cfg(feature = "adblock")]
1243    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1244        use adblock::{
1245            lists::{FilterSet, ParseOptions, RuleTypes},
1246            Engine,
1247        };
1248
1249        lazy_static::lazy_static! {
1250            static ref AD_ENGINE: Engine = {
1251                let mut filter_set = FilterSet::new(false);
1252                let mut rules = ParseOptions::default();
1253                rules.rule_types = RuleTypes::All;
1254
1255                filter_set.add_filters(
1256                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1257                    rules.clone(),
1258                );
1259
1260                // When adblock_easylist is enabled, EasyList + EasyPrivacy are
1261                // embedded at build time for zero-cost runtime loading.
1262                #[cfg(feature = "adblock_easylist")]
1263                {
1264                    static EASYLIST: &str = include_str!(concat!(env!("OUT_DIR"), "/easylist.txt"));
1265                    static EASYPRIVACY: &str = include_str!(concat!(env!("OUT_DIR"), "/easyprivacy.txt"));
1266
1267                    if !EASYLIST.is_empty() {
1268                        filter_set.add_filter_list(EASYLIST, rules.clone());
1269                    }
1270                    if !EASYPRIVACY.is_empty() {
1271                        filter_set.add_filter_list(EASYPRIVACY, rules);
1272                    }
1273                }
1274
1275                Engine::from_filter_set(filter_set, true)
1276            };
1277        }
1278
1279        let blockable = event.resource_type == ResourceType::Script
1280            || event.resource_type == ResourceType::Image
1281            || event.resource_type == ResourceType::Media
1282            || event.resource_type == ResourceType::Stylesheet
1283            || event.resource_type == ResourceType::Document
1284            || event.resource_type == ResourceType::Fetch
1285            || event.resource_type == ResourceType::Xhr;
1286
1287        if !blockable {
1288            return false;
1289        }
1290
1291        let u = &event.request.url;
1292
1293        let source_domain = if self.document_target_domain.is_empty() {
1294            "example.com"
1295        } else {
1296            &self.document_target_domain
1297        };
1298
1299        // Fast hostname extraction without full URL parsing.
1300        // preparsed(url, request_hostname, source_hostname, type, third_party)
1301        let hostname = u
1302            .strip_prefix("https://")
1303            .or_else(|| u.strip_prefix("http://"))
1304            .and_then(|rest| rest.split('/').next())
1305            // Strip userinfo (user:pass@) if present.
1306            .map(
1307                |authority| match memchr::memrchr(b'@', authority.as_bytes()) {
1308                    Some(i) => &authority[i + 1..],
1309                    None => authority,
1310                },
1311            )
1312            // Strip port (:8080) if present.
1313            .and_then(|host_port| host_port.split(':').next())
1314            .unwrap_or(source_domain);
1315
1316        let resource_type_str = match event.resource_type {
1317            ResourceType::Script => "script",
1318            ResourceType::Image => "image",
1319            ResourceType::Media => "media",
1320            ResourceType::Stylesheet => "stylesheet",
1321            ResourceType::Document => "document",
1322            ResourceType::Fetch => "fetch",
1323            ResourceType::Xhr => "xhr",
1324            _ => "other",
1325        };
1326
1327        let request = adblock::request::Request::preparsed(
1328            u,
1329            hostname,
1330            source_domain,
1331            resource_type_str,
1332            !event.request.is_same_site.unwrap_or_default(),
1333        );
1334
1335        let engine: &Engine = match self.adblock_engine.as_ref() {
1336            Some(custom) => custom,
1337            None => &AD_ENGINE,
1338        };
1339
1340        engine.check_network_request(&request).matched
1341    }
1342
1343    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1344        let response = if self
1345            .attempted_authentications
1346            .contains(event.request_id.as_ref())
1347        {
1348            AuthChallengeResponseResponse::CancelAuth
1349        } else if self.credentials.is_some() {
1350            self.attempted_authentications
1351                .insert(event.request_id.clone().into());
1352            AuthChallengeResponseResponse::ProvideCredentials
1353        } else {
1354            AuthChallengeResponseResponse::Default
1355        };
1356
1357        let mut auth = AuthChallengeResponse::new(response);
1358        if let Some(creds) = self.credentials.clone() {
1359            auth.username = Some(creds.username);
1360            auth.password = Some(creds.password);
1361        }
1362        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1363    }
1364
1365    /// Set the page offline network emulation condition.
1366    pub fn set_offline_mode(&mut self, value: bool) {
1367        if self.offline == value {
1368            return;
1369        }
1370        self.offline = value;
1371        if let Ok(condition) = NetworkConditions::builder()
1372            .url_pattern("")
1373            .latency(0)
1374            .download_throughput(-1.)
1375            .upload_throughput(-1.)
1376            .build()
1377        {
1378            if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1379                .offline(self.offline)
1380                .matched_network_condition(condition)
1381                .build()
1382            {
1383                self.push_cdp_request(network);
1384            }
1385        }
1386    }
1387
1388    /// Request interception doesn't happen for data URLs with Network Service.
1389    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1390        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1391            if let Some((interception_id, _)) = self
1392                .request_id_to_interception_id
1393                .remove(event.request_id.as_ref())
1394            {
1395                self.on_request(event, Some(interception_id));
1396            } else {
1397                self.requests_will_be_sent
1398                    .insert(event.request_id.clone(), (event.clone(), Instant::now()));
1399            }
1400        } else {
1401            self.on_request(event, None);
1402        }
1403    }
1404
1405    /// The request was served from the cache.
1406    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1407        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1408            request.from_memory_cache = true;
1409        }
1410    }
1411
1412    /// On network response received.
1413    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1414        let mut request_failed = false;
1415
1416        // Track how many bytes we actually deducted from this target.
1417        let mut deducted: u64 = 0;
1418
1419        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1420            let before = *max_bytes;
1421
1422            // encoded_data_length -> saturating cast to u64
1423            let received_bytes: u64 = event.response.encoded_data_length as u64;
1424
1425            // Safe parse of Content-Length
1426            let content_length: Option<u64> = event
1427                .response
1428                .headers
1429                .inner()
1430                .get("content-length")
1431                .and_then(|v| v.as_str())
1432                .and_then(|s| s.trim().parse::<u64>().ok());
1433
1434            // Deduct what we actually received
1435            *max_bytes = max_bytes.saturating_sub(received_bytes);
1436
1437            // If the declared size can't fit, zero out now
1438            if let Some(cl) = content_length {
1439                if cl > *max_bytes {
1440                    *max_bytes = 0;
1441                }
1442            }
1443
1444            request_failed = *max_bytes == 0;
1445
1446            // Compute exact delta deducted on this event
1447            deducted = before.saturating_sub(*max_bytes);
1448        }
1449
1450        // Bubble up the deduction (even if request continues)
1451        if deducted > 0 {
1452            self.queued_events
1453                .push_back(NetworkEvent::BytesConsumed(deducted));
1454        }
1455
1456        // block all network request moving forward.
1457        if request_failed && self.max_bytes_allowed.is_some() {
1458            self.set_block_all(true);
1459        }
1460
1461        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1462            request.set_response(event.response.clone());
1463            self.queued_events.push_back(if request_failed {
1464                NetworkEvent::RequestFailed(request)
1465            } else {
1466                NetworkEvent::RequestFinished(request)
1467            });
1468        }
1469    }
1470
1471    /// On network loading finished.
1472    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1473        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1474            if let Some(interception_id) = request.interception_id.as_ref() {
1475                self.attempted_authentications
1476                    .remove(interception_id.as_ref());
1477            }
1478            self.queued_events
1479                .push_back(NetworkEvent::RequestFinished(request));
1480        }
1481    }
1482
1483    /// On network loading failed.
1484    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1485        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1486            request.failure_text = Some(event.error_text.clone());
1487            if let Some(interception_id) = request.interception_id.as_ref() {
1488                self.attempted_authentications
1489                    .remove(interception_id.as_ref());
1490            }
1491            self.queued_events
1492                .push_back(NetworkEvent::RequestFailed(request));
1493        }
1494    }
1495
1496    /// On request will be sent.
1497    fn on_request(
1498        &mut self,
1499        event: &EventRequestWillBeSent,
1500        interception_id: Option<InterceptionId>,
1501    ) {
1502        let mut redirect_chain = Vec::new();
1503        let mut redirect_location = None;
1504
1505        if let Some(redirect_resp) = &event.redirect_response {
1506            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1507                if is_redirect_status(redirect_resp.status) {
1508                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1509                        if redirect_resp.url != location {
1510                            let fixed_location = location.replace(&redirect_resp.url, "");
1511
1512                            if !fixed_location.is_empty() {
1513                                if let Some(resp) = request.response.as_mut() {
1514                                    resp.headers.0["Location"] =
1515                                        serde_json::Value::String(fixed_location.clone());
1516                                }
1517                            }
1518
1519                            redirect_location = Some(fixed_location);
1520                        }
1521                    }
1522                }
1523
1524                {
1525                    let mut redirect_resp = redirect_resp.clone();
1526
1527                    if let Some(redirect_location) = redirect_location {
1528                        if !redirect_location.is_empty() {
1529                            redirect_resp.headers.0["Location"] =
1530                                serde_json::Value::String(redirect_location);
1531                        }
1532                    }
1533
1534                    self.handle_request_redirect(&mut request, redirect_resp);
1535                }
1536
1537                redirect_chain = std::mem::take(&mut request.redirect_chain);
1538                redirect_chain.push(request);
1539            }
1540        }
1541
1542        // Redirect cap: applies only to Document-type hops and only when
1543        // `max_redirects` is set. Sub-resource chains are untouched.
1544        if let Some(cap) = self.max_redirects {
1545            let is_document = matches!(event.r#type, Some(ResourceType::Document));
1546            if is_document && redirect_chain.len() > cap {
1547                let mut failed = HttpRequest::new(
1548                    event.request_id.clone(),
1549                    event.frame_id.clone(),
1550                    interception_id,
1551                    self.user_request_interception_enabled,
1552                    redirect_chain,
1553                );
1554                failed.url = Some(event.request.url.clone());
1555                failed.method = Some(event.request.method.clone());
1556                failed.failure_text = Some("net::ERR_TOO_MANY_REDIRECTS".into());
1557                self.push_cdp_request(
1558                    chromiumoxide_cdp::cdp::browser_protocol::page::StopLoadingParams::default(),
1559                );
1560                self.queued_events
1561                    .push_back(NetworkEvent::RequestFailed(failed));
1562                return;
1563            }
1564        }
1565
1566        let request = HttpRequest::new(
1567            event.request_id.clone(),
1568            event.frame_id.clone(),
1569            interception_id,
1570            self.user_request_interception_enabled,
1571            redirect_chain,
1572        );
1573
1574        let rid = event.request_id.clone();
1575        self.queued_events
1576            .push_back(NetworkEvent::Request(rid.clone()));
1577        self.requests.insert(rid, request);
1578    }
1579
1580    /// Handle request redirect.
1581    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1582        request.set_response(response);
1583        if let Some(interception_id) = request.interception_id.as_ref() {
1584            self.attempted_authentications
1585                .remove(interception_id.as_ref());
1586        }
1587    }
1588}
1589
/// Events queued by the network manager for its owner to consume.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP command to send, given as its method id and JSON parameters
    /// (for example the `Page.stopLoading` queued when the redirect cap trips).
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A new request was registered under this id.
    Request(RequestId),
    /// Response
    // NOTE(review): presumably signals that a response arrived for this
    // request id; the emitting code is outside this section, so confirm.
    Response(RequestId),
    /// A request failed; carries the request with `failure_text` set.
    RequestFailed(HttpRequest),
    /// A request finished; carries the completed request.
    RequestFinished(HttpRequest),
    /// Bytes consumed.
    // NOTE(review): what the count measures is not visible in this section;
    // confirm against the emitter.
    BytesConsumed(u64),
}
1605
1606#[cfg(test)]
1607mod tests {
1608    use super::ALLOWED_MATCHER_3RD_PARTY;
1609    use crate::handler::network::NetworkManager;
1610    use std::time::Duration;
1611
1612    #[test]
1613    fn test_allowed_matcher_3rd_party() {
1614        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
1615        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
1616        assert!(
1617            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
1618            "expected Cloudflare challenge script to be allowed"
1619        );
1620
1621        // Should NOT be allowed (not in allow-list)
1622        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
1623        assert!(
1624            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
1625            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
1626        );
1627
1628        // A couple sanity checks for existing allow patterns
1629        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
1630        assert!(ALLOWED_MATCHER_3RD_PARTY
1631            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
1632        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
1633    }
1634
1635    #[test]
1636    fn test_script_allowed_by_default_when_not_blocklisted() {
1637        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1638        nm.set_page_url(
1639            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
1640        );
1641
1642        // A random script that should not match your block tries.
1643        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
1644        assert!(
1645            !nm.should_block_script_blocklist_only(ok),
1646            "expected non-blocklisted script to be allowed"
1647        );
1648    }
1649
1650    #[test]
1651    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
1652        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1653        nm.set_page_url(
1654            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
1655        );
1656
1657        // This should match URL_IGNORE_TRIE_PATHS fallback ("analytics.js") logic.
1658        let bad = "https://cdn.example.net/js/analytics.js";
1659        assert!(
1660            nm.should_block_script_blocklist_only(bad),
1661            "expected analytics.js to be blocklisted"
1662        );
1663    }
1664
1665    #[test]
1666    fn test_allowed_matcher_3rd_party_sanity() {
1667        // Should be allowed (matches "/cdn-cgi/challenge-platform/")
1668        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
1669        assert!(
1670            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
1671            "expected Cloudflare challenge script to be allowed"
1672        );
1673
1674        // Should NOT be allowed (not in allow-list)
1675        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
1676        assert!(
1677            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
1678            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
1679        );
1680
1681        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
1682        assert!(ALLOWED_MATCHER_3RD_PARTY
1683            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
1684        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
1685    }
1686    #[test]
1687    fn test_dynamic_blacklist_blocks_url() {
1688        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1689        nm.set_page_url("https://example.com/".to_string());
1690
1691        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
1692        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
1693        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));
1694
1695        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
1696    }
1697
1698    #[test]
1699    fn test_blacklist_strict_wins_over_whitelist() {
1700        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1701        nm.set_page_url("https://example.com/".to_string());
1702
1703        // Same URL in both lists.
1704        nm.set_blacklist_patterns(["beacon.min.js"]);
1705        nm.set_whitelist_patterns(["beacon.min.js"]);
1706
1707        nm.set_blacklist_strict(true);
1708
1709        let u = "https://static.cloudflareinsights.com/beacon.min.js";
1710        assert!(nm.is_whitelisted(u));
1711        assert!(nm.is_blacklisted(u));
1712
1713        // In strict mode, it should still be considered blocked at decision time.
1714        // (We can only directly assert the matchers here; the decision logic is exercised in integration.)
1715        assert!(nm.blacklist_strict);
1716    }
1717
1718    #[cfg(feature = "adblock")]
1719    fn make_request_paused(
1720        url: &str,
1721        resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType,
1722        is_same_site: bool,
1723    ) -> chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused {
1724        use chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused;
1725        use chromiumoxide_cdp::cdp::browser_protocol::network::{
1726            Headers, Request, RequestReferrerPolicy, ResourcePriority,
1727        };
1728
1729        EventRequestPaused {
1730            request_id: chromiumoxide_cdp::cdp::browser_protocol::network::RequestId::from(
1731                "test-req".to_string(),
1732            )
1733            .into(),
1734            request: Request {
1735                url: url.to_string(),
1736                method: "GET".to_string(),
1737                headers: Headers::new(serde_json::Value::Object(Default::default())),
1738                initial_priority: ResourcePriority::Medium,
1739                referrer_policy: RequestReferrerPolicy::NoReferrer,
1740                url_fragment: None,
1741                has_post_data: None,
1742                post_data_entries: None,
1743                mixed_content_type: None,
1744                is_link_preload: None,
1745                trust_token_params: None,
1746                is_same_site: Some(is_same_site),
1747                is_ad_related: None,
1748            },
1749            frame_id: chromiumoxide_cdp::cdp::browser_protocol::page::FrameId::from(
1750                "frame1".to_string(),
1751            ),
1752            resource_type,
1753            response_error_reason: None,
1754            response_status_code: None,
1755            response_status_text: None,
1756            response_headers: None,
1757            network_id: None,
1758            redirected_request_id: None,
1759        }
1760    }
1761
1762    #[cfg(feature = "adblock")]
1763    #[test]
1764    fn test_detect_ad_blocks_known_tracker_scripts() {
1765        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1766
1767        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1768        nm.set_page_url("https://www.wine-searcher.com/".to_string());
1769
1770        let event = make_request_paused(
1771            "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
1772            ResourceType::Script,
1773            false,
1774        );
1775
1776        assert!(
1777            nm.detect_ad(&event),
1778            "googletagmanager.com script should be detected as ad"
1779        );
1780    }
1781
1782    #[cfg(feature = "adblock")]
1783    #[test]
1784    fn test_detect_ad_allows_legitimate_scripts() {
1785        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1786
1787        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1788        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
1789
1790        let event = make_request_paused(
1791            "https://www.mylegitsite-test.com/static/js/app-bundle.js",
1792            ResourceType::Script,
1793            true,
1794        );
1795
1796        assert!(
1797            !nm.detect_ad(&event),
1798            "legitimate first-party app bundle should not be blocked"
1799        );
1800    }
1801
1802    #[cfg(feature = "adblock")]
1803    #[test]
1804    fn test_detect_ad_uses_source_domain() {
1805        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1806
1807        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1808        nm.set_page_url("https://www.wine-searcher.com/some-page".to_string());
1809
1810        assert!(
1811            !nm.document_target_domain.is_empty(),
1812            "document_target_domain should be set after set_page_url"
1813        );
1814
1815        let event = make_request_paused(
1816            "https://www.google-analytics.com/analytics.js",
1817            ResourceType::Script,
1818            false,
1819        );
1820
1821        assert!(
1822            nm.detect_ad(&event),
1823            "google-analytics.com should be blocked as tracker"
1824        );
1825    }
1826
1827    #[cfg(feature = "adblock")]
1828    #[test]
1829    fn test_custom_adblock_engine_takes_precedence() {
1830        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1831
1832        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1833        nm.set_page_url("https://example.com/".to_string());
1834
1835        // Build a custom engine with a specific rule.
1836        let mut filter_set = adblock::lists::FilterSet::new(false);
1837        let mut opts = adblock::lists::ParseOptions::default();
1838        opts.rule_types = adblock::lists::RuleTypes::All;
1839        filter_set.add_filters(["||custom-tracker.example.net^"], opts);
1840        let engine = adblock::Engine::from_filter_set(filter_set, true);
1841        nm.set_adblock_engine(std::sync::Arc::new(engine));
1842
1843        let event = make_request_paused(
1844            "https://custom-tracker.example.net/pixel.js",
1845            ResourceType::Script,
1846            false,
1847        );
1848
1849        assert!(
1850            nm.detect_ad(&event),
1851            "custom engine rule should block custom-tracker.example.net"
1852        );
1853    }
1854
1855    /// Helper: run a URL through the full `on_fetch_request_paused` pipeline
1856    /// and return whether it was blocked (true) or allowed (false).
1857    #[cfg(feature = "adblock")]
1858    fn run_full_interception(
1859        nm: &mut NetworkManager,
1860        url: &str,
1861        resource_type: chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType,
1862        is_same_site: bool,
1863    ) -> bool {
1864        use super::NetworkEvent;
1865
1866        // Drain any prior events.
1867        while nm.poll().is_some() {}
1868
1869        let event = make_request_paused(url, resource_type, is_same_site);
1870        nm.on_fetch_request_paused(&event);
1871
1872        // Check what was emitted: Fetch.fulfillRequest = blocked, Fetch.continueRequest = allowed.
1873        let mut blocked = false;
1874        while let Some(ev) = nm.poll() {
1875            if let NetworkEvent::SendCdpRequest((method, _)) = &ev {
1876                let m: &str = method.as_ref();
1877                if m == "Fetch.fulfillRequest" || m == "Fetch.failRequest" {
1878                    blocked = true;
1879                }
1880            }
1881        }
1882        blocked
1883    }
1884
1885    // ── End-to-end interception tests ───────────────────────────────────
1886
1887    #[cfg(feature = "adblock")]
1888    #[test]
1889    fn test_e2e_tracker_script_blocked() {
1890        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1891
1892        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1893        nm.set_page_url("https://www.wine-searcher.com/".to_string());
1894
1895        assert!(
1896            run_full_interception(
1897                &mut nm,
1898                "https://www.googletagmanager.com/gtm.js?id=GTM-XXXX",
1899                ResourceType::Script,
1900                false,
1901            ),
1902            "GTM script should be blocked through full pipeline"
1903        );
1904    }
1905
1906    #[cfg(feature = "adblock")]
1907    #[test]
1908    fn test_e2e_legitimate_script_allowed() {
1909        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1910
1911        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1912        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
1913
1914        assert!(
1915            !run_full_interception(
1916                &mut nm,
1917                "https://www.mylegitsite-test.com/static/js/app-bundle.js",
1918                ResourceType::Script,
1919                true,
1920            ),
1921            "legitimate first-party script should be allowed through full pipeline"
1922        );
1923    }
1924
1925    #[cfg(feature = "adblock")]
1926    #[test]
1927    fn test_e2e_analytics_xhr_blocked() {
1928        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1929
1930        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1931        nm.set_page_url("https://example.org/".to_string());
1932
1933        assert!(
1934            run_full_interception(
1935                &mut nm,
1936                "https://www.google-analytics.com/g/collect?v=2&tid=UA-123",
1937                ResourceType::Xhr,
1938                false,
1939            ),
1940            "Google Analytics XHR should be blocked through full pipeline"
1941        );
1942    }
1943
1944    #[cfg(feature = "adblock")]
1945    #[test]
1946    fn test_e2e_whitelisted_overrides_adblock() {
1947        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1948
1949        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1950        nm.set_page_url("https://example.org/".to_string());
1951        nm.set_whitelist_patterns(["googletagmanager.com"]);
1952
1953        // GTM would normally be blocked by adblock, but whitelist overrides.
1954        assert!(
1955            !run_full_interception(
1956                &mut nm,
1957                "https://www.googletagmanager.com/gtm.js?id=GTM-TEST",
1958                ResourceType::Script,
1959                false,
1960            ),
1961            "whitelisted tracker should be allowed even when adblock would block it"
1962        );
1963    }
1964
1965    #[cfg(feature = "adblock")]
1966    #[test]
1967    fn test_e2e_blacklist_strict_overrides_whitelist() {
1968        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1969
1970        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1971        nm.set_page_url("https://example.org/".to_string());
1972        nm.set_blacklist_patterns(["cdn.example.net/evil.js"]);
1973        nm.set_whitelist_patterns(["cdn.example.net/evil.js"]);
1974        nm.set_blacklist_strict(true);
1975
1976        assert!(
1977            run_full_interception(
1978                &mut nm,
1979                "https://cdn.example.net/evil.js",
1980                ResourceType::Script,
1981                false,
1982            ),
1983            "strict blacklist should win over whitelist"
1984        );
1985    }
1986
1987    #[cfg(feature = "adblock")]
1988    #[test]
1989    fn test_e2e_first_party_document_not_blocked() {
1990        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
1991
1992        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
1993        nm.set_page_url("https://www.nytimes.com/".to_string());
1994
1995        assert!(
1996            !run_full_interception(
1997                &mut nm,
1998                "https://www.nytimes.com/2024/article.html",
1999                ResourceType::Document,
2000                true,
2001            ),
2002            "first-party document navigation should never be blocked"
2003        );
2004    }
2005
2006    #[cfg(feature = "adblock")]
2007    #[test]
2008    fn test_e2e_custom_engine_blocks_through_pipeline() {
2009        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2010
2011        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2012        nm.set_page_url("https://mysite.com/".to_string());
2013
2014        let mut filter_set = adblock::lists::FilterSet::new(false);
2015        let mut opts = adblock::lists::ParseOptions::default();
2016        opts.rule_types = adblock::lists::RuleTypes::All;
2017        filter_set.add_filters(["||evil-cdn.example.net^$script"], opts);
2018        let engine = adblock::Engine::from_filter_set(filter_set, true);
2019        nm.set_adblock_engine(std::sync::Arc::new(engine));
2020
2021        assert!(
2022            run_full_interception(
2023                &mut nm,
2024                "https://evil-cdn.example.net/tracker.js",
2025                ResourceType::Script,
2026                false,
2027            ),
2028            "custom engine rule should block through full pipeline"
2029        );
2030
2031        // Legitimate script on the same site should still pass.
2032        assert!(
2033            !run_full_interception(
2034                &mut nm,
2035                "https://mysite.com/app.js",
2036                ResourceType::Script,
2037                true,
2038            ),
2039            "first-party script should still be allowed with custom engine"
2040        );
2041    }
2042
2043    #[cfg(feature = "adblock")]
2044    #[test]
2045    fn test_e2e_ad_image_blocked() {
2046        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2047
2048        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2049        nm.set_page_url("https://www.mylegitsite-test.com/".to_string());
2050
2051        // Ad tracking pixel should be blocked via adblock pattern or trie.
2052        assert!(
2053            run_full_interception(
2054                &mut nm,
2055                "https://googleads.g.doubleclick.net/pagead/viewthroughconversion/123/?random=456",
2056                ResourceType::Image,
2057                false,
2058            ),
2059            "doubleclick ad image/tracking pixel should be blocked"
2060        );
2061
2062        // Legitimate first-party image should pass.
2063        assert!(
2064            !run_full_interception(
2065                &mut nm,
2066                "https://www.mylegitsite-test.com/images/logo.png",
2067                ResourceType::Image,
2068                true,
2069            ),
2070            "legitimate first-party image should not be blocked"
2071        );
2072    }
2073
2074    #[cfg(feature = "adblock")]
2075    #[test]
2076    fn test_e2e_hostname_with_userinfo() {
2077        use chromiumoxide_cdp::cdp::browser_protocol::network::ResourceType;
2078
2079        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2080        nm.set_page_url("https://example.org/".to_string());
2081
2082        // URL with userinfo should still correctly identify googletagmanager.com.
2083        assert!(
2084            run_full_interception(
2085                &mut nm,
2086                "https://user:pass@www.googletagmanager.com/gtm.js?id=GTM-XXXX",
2087                ResourceType::Script,
2088                false,
2089            ),
2090            "tracker URL with userinfo should still be blocked"
2091        );
2092    }
2093
2094    #[test]
2095    fn test_blacklist_non_strict_allows_whitelist_override() {
2096        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2097        nm.set_page_url("https://example.com/".to_string());
2098
2099        nm.set_blacklist_patterns(["beacon.min.js"]);
2100        nm.set_whitelist_patterns(["beacon.min.js"]);
2101
2102        nm.set_blacklist_strict(false);
2103
2104        let u = "https://static.cloudflareinsights.com/beacon.min.js";
2105        assert!(nm.is_blacklisted(u));
2106        assert!(nm.is_whitelisted(u));
2107        assert!(!nm.blacklist_strict);
2108    }
2109
2110    // ── max_redirects enforcement ───────────────────────────────────────
2111    //
2112    // The redirect cap short-circuits in NetworkManager::on_request when a
2113    // Document-type chain exceeds the configured limit. We drive it via the
2114    // public on_request_will_be_sent entry point by deserializing synthetic
2115    // events — builder APIs exist but require every non-optional field, and
2116    // JSON is less fragile to cdp schema additions.
2117
2118    fn make_request_will_be_sent(
2119        request_id: &str,
2120        url: &str,
2121        resource_type: &str,
2122        redirect_from_url: Option<&str>,
2123    ) -> chromiumoxide_cdp::cdp::browser_protocol::network::EventRequestWillBeSent {
2124        let mut v = serde_json::json!({
2125            "requestId": request_id,
2126            "loaderId": "test-loader",
2127            "documentURL": url,
2128            "request": {
2129                "url": url,
2130                "method": "GET",
2131                "headers": {},
2132                "initialPriority": "Medium",
2133                "referrerPolicy": "no-referrer"
2134            },
2135            "timestamp": 0.0,
2136            "wallTime": 0.0,
2137            "initiator": { "type": "other" },
2138            "redirectHasExtraInfo": false,
2139            "type": resource_type,
2140            "frameId": "frame1"
2141        });
2142        if let Some(from) = redirect_from_url {
2143            v["redirectResponse"] = serde_json::json!({
2144                "url": from,
2145                "status": 302,
2146                "statusText": "Found",
2147                "headers": { "Location": url },
2148                "mimeType": "text/html",
2149                "charset": "",
2150                "connectionReused": false,
2151                "connectionId": 0.0,
2152                "encodedDataLength": 0.0,
2153                "securityState": "unknown"
2154            });
2155        }
2156        serde_json::from_value(v).expect("EventRequestWillBeSent should deserialize")
2157    }
2158
2159    fn drain_too_many_redirects(nm: &mut NetworkManager) -> Option<super::HttpRequest> {
2160        while let Some(ev) = nm.poll() {
2161            if let super::NetworkEvent::RequestFailed(req) = ev {
2162                if req.failure_text.as_deref() == Some("net::ERR_TOO_MANY_REDIRECTS") {
2163                    return Some(req);
2164                }
2165            }
2166        }
2167        None
2168    }
2169
2170    fn drain_stop_loading(nm: &mut NetworkManager) -> bool {
2171        while let Some(ev) = nm.poll() {
2172            if let super::NetworkEvent::SendCdpRequest((method, _)) = ev {
2173                let m: &str = method.as_ref();
2174                if m == "Page.stopLoading" {
2175                    return true;
2176                }
2177            }
2178        }
2179        false
2180    }
2181
2182    #[test]
2183    fn test_max_redirects_none_allows_unlimited_chain() {
2184        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2185        // max_redirects left at its default (None).
2186
2187        // 10 sequential Document hops sharing the same request_id.
2188        nm.on_request_will_be_sent(&make_request_will_be_sent(
2189            "r1",
2190            "https://example.com/0",
2191            "Document",
2192            None,
2193        ));
2194        for i in 1..10 {
2195            nm.on_request_will_be_sent(&make_request_will_be_sent(
2196                "r1",
2197                &format!("https://example.com/{i}"),
2198                "Document",
2199                Some(&format!("https://example.com/{}", i - 1)),
2200            ));
2201        }
2202
2203        assert!(
2204            drain_too_many_redirects(&mut nm).is_none(),
2205            "no cap set: chain of 10 hops must not emit ERR_TOO_MANY_REDIRECTS"
2206        );
2207    }
2208
2209    #[test]
2210    fn test_max_redirects_caps_document_chain() {
2211        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2212        nm.max_redirects = Some(3);
2213
2214        // Initial request + 4 redirect hops. The 4th redirect (chain length 4 > 3)
2215        // must trip the cap.
2216        nm.on_request_will_be_sent(&make_request_will_be_sent(
2217            "r1",
2218            "https://example.com/0",
2219            "Document",
2220            None,
2221        ));
2222        for i in 1..=4 {
2223            nm.on_request_will_be_sent(&make_request_will_be_sent(
2224                "r1",
2225                &format!("https://example.com/{i}"),
2226                "Document",
2227                Some(&format!("https://example.com/{}", i - 1)),
2228            ));
2229        }
2230
2231        let failed = drain_too_many_redirects(&mut nm)
2232            .expect("cap of 3 on a 4-hop chain must emit ERR_TOO_MANY_REDIRECTS");
2233        assert_eq!(
2234            failed.redirect_chain.len(),
2235            4,
2236            "failed request should preserve the full accumulated chain"
2237        );
2238        assert_eq!(
2239            failed.url.as_deref(),
2240            Some("https://example.com/4"),
2241            "failed request url should be the hop that tripped the cap"
2242        );
2243
2244        // Second navigation after the cap is tripped must also schedule
2245        // Page.stopLoading to actually abort the tab.
2246        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2247        nm.max_redirects = Some(3);
2248        nm.on_request_will_be_sent(&make_request_will_be_sent(
2249            "r2",
2250            "https://example.com/0",
2251            "Document",
2252            None,
2253        ));
2254        for i in 1..=4 {
2255            nm.on_request_will_be_sent(&make_request_will_be_sent(
2256                "r2",
2257                &format!("https://example.com/{i}"),
2258                "Document",
2259                Some(&format!("https://example.com/{}", i - 1)),
2260            ));
2261        }
2262        assert!(
2263            drain_stop_loading(&mut nm),
2264            "cap hit must dispatch Page.stopLoading to abort navigation"
2265        );
2266    }
2267
2268    #[test]
2269    fn test_max_redirects_ignores_subresources() {
2270        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
2271        nm.max_redirects = Some(2);
2272
2273        // A 5-hop script redirect chain — sub-resources are exempt by design.
2274        nm.on_request_will_be_sent(&make_request_will_be_sent(
2275            "s1",
2276            "https://cdn.example.com/0.js",
2277            "Script",
2278            None,
2279        ));
2280        for i in 1..=5 {
2281            nm.on_request_will_be_sent(&make_request_will_be_sent(
2282                "s1",
2283                &format!("https://cdn.example.com/{i}.js"),
2284                "Script",
2285                Some(&format!("https://cdn.example.com/{}.js", i - 1)),
2286            ));
2287        }
2288
2289        assert!(
2290            drain_too_many_redirects(&mut nm).is_none(),
2291            "sub-resource redirect chains must never be capped"
2292        );
2293    }
2294}
chromiumoxide/handler/network.rs

chromiumoxide/handler/
network.rs