chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{
11    base_domain_from_any, base_domain_from_host, first_label, host_and_rest,
12    host_contains_label_icase, host_is_subdomain_of,
13};
14use aho_corasick::AhoCorasick;
15use case_insensitive_string::CaseInsensitiveString;
16use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
17use chromiumoxide_cdp::cdp::browser_protocol::network::{
18    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
19    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
20    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
21    SetExtraHttpHeadersParams,
22};
23use chromiumoxide_cdp::cdp::browser_protocol::{
24    fetch::{
25        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
26        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
27    },
28    network::SetBypassServiceWorkerParams,
29};
30use chromiumoxide_cdp::cdp::browser_protocol::{
31    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
32};
33use chromiumoxide_types::{Command, Method, MethodId};
34use hashbrown::{HashMap, HashSet};
35use lazy_static::lazy_static;
36use reqwest::header::PROXY_AUTHORIZATION;
37use spider_network_blocker::intercept_manager::NetworkInterceptManager;
38pub use spider_network_blocker::scripts::{
39    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
40};
41use std::borrow::Cow;
42use std::collections::VecDeque;
43use std::time::Duration;
44
45lazy_static! {
46    /// General patterns for popular libraries and resources
47    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
48        "jquery",           // Covers jquery.min.js, jquery.js, etc.
49        "angular",
50        "react",            // Covers all React-related patterns
51        "vue",              // Covers all Vue-related patterns
52        "bootstrap",
53        "d3",
54        "lodash",
55        "ajax",
56        "application",
57        "app",              // Covers general app scripts like app.js
58        "main",
59        "index",
60        "bundle",
61        "vendor",
62        "runtime",
63        "polyfill",
64        "scripts",
65        "es2015.",
66        "es2020.",
67        "webpack",
68        "/cdn-cgi/challenge-platform/",
69        "/wp-content/js/",  // Covers Wordpress content
70        // Verified 3rd parties for request
71        "https://m.stripe.network/",
72        "https://challenges.cloudflare.com/",
73        "https://www.google.com/recaptcha/enterprise.js",
74        "https://www.google.com/recaptcha/api.js",
75        "https://google.com/recaptcha/api.js",
76        "https://captcha.px-cloud.net/",
77        "https://cdn.auth0.com/js/lock/",
78        "https://cdn.auth0.com/client",
79        "https://js.stripe.com/",
80        "https://cdn.prod.website-files.com/", // webflow cdn scripts
81        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
82        "https://code.jquery.com/jquery-"
83    ];
84
85    /// Determine if a script should be rendered in the browser by name.
86    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
87
88    /// General patterns for popular libraries and resources
89    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
90        // Verified 3rd parties for request
91        "https://m.stripe.network/",
92        "https://challenges.cloudflare.com/",
93        "https://www.google.com/recaptcha/api.js",
94        "https://google.com/recaptcha/api.js",
95        "https://www.google.com/recaptcha/enterprise.js",
96        "https://js.stripe.com/",
97        "https://cdn.prod.website-files.com/", // webflow cdn scripts
98        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
99        "https://code.jquery.com/jquery-",
100        "https://ct.captcha-delivery.com/",
101        "https://geo.captcha-delivery.com/captcha/",
102        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
103        "https://ct.captcha-delivery.com/",
104        "https://cdn.auth0.com/client",
105        "https://captcha.px-cloud.net/",
106        "https://www.gstatic.com/recaptcha/",
107        "https://www.google.com/recaptcha/api2/",
108        "https://www.recaptcha.net/recaptcha/",
109        "https://www.recaptcha.net/recaptcha/api2/",
110        "https://js.hcaptcha.com/1/api.js",
111        "https://hcaptcha.com/1/api.js",
112        "https://js.datadome.co/tags.js",
113        "https://api-js.datadome.co/",
114        "https://client.perimeterx.net/",
115        "https://captcha.px-cdn.net/",
116        "https://captcha.px-cloud.net/",
117        "https://s.perimeterx.net/",
118        "https://client-api.arkoselabs.com/v2/",
119        "https://static.geetest.com/v4/gt4.js",
120        "https://static.geetest.com/",
121        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
122        "https://cdn.perfdrive.com/aperture/",
123        "https://assets.queue-it.net/",
124        "/cdn-cgi/challenge-platform/",
125        "/_Incapsula_Resource",
126    ];
127
128    /// Determine if a script should be rendered in the browser by name.
129    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
130
131    /// path of a js framework
132    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
133        phf::phf_set! {
134            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
135            "_astro/", "_app/immutable"
136        }
137    };
138
139    /// Ignore the content types.
140    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
141        "application/pdf",
142        "application/zip",
143        "application/x-rar-compressed",
144        "application/x-tar",
145        "image/png",
146        "image/jpeg",
147        "image/gif",
148        "image/bmp",
149        "image/webp",
150        "image/svg+xml",
151        "video/mp4",
152        "video/x-msvideo",
153        "video/x-matroska",
154        "video/webm",
155        "audio/mpeg",
156        "audio/ogg",
157        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
158        "application/vnd.ms-excel",
159        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
160        "application/vnd.ms-powerpoint",
161        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
162        "application/x-7z-compressed",
163        "application/x-rpm",
164        "application/x-shockwave-flash",
165        "application/rtf",
166    };
167
168    /// Ignore the resources for visual content types.
169    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
170        "Image",
171        "Media",
172        "Font"
173    };
174
175    /// Ignore the resources for visual content types.
176    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
177        "CspViolationReport",
178        "Manifest",
179        "Other",
180        "Prefetch",
181        "Ping",
182    };
183
184    /// Case insenstive css matching
185    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
186
187    /// The command chain.
188    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
189        let enable = EnableParams::default();
190
191        if let Ok(c) = serde_json::to_value(&enable) {
192            vec![(enable.identifier(), c)]
193        } else {
194            vec![]
195        }
196    };
197
198    /// The command chain with https ignore.
199    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
200        let enable = EnableParams::default();
201        let mut v = vec![];
202        if let Ok(c) = serde_json::to_value(&enable) {
203            v.push((enable.identifier(), c));
204        }
205        let ignore = SetIgnoreCertificateErrorsParams::new(true);
206        if let Ok(ignored) = serde_json::to_value(&ignore) {
207            v.push((ignore.identifier(), ignored));
208        }
209
210        v
211    };
212
213    /// Enable the fetch intercept command
214    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
215        fetch::EnableParams::builder()
216        .handle_auth_requests(true)
217        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
218        .build()
219    };
220}
221
/// Whether `status` is one of the HTTP redirect codes handled here
/// (301, 302, 303, 307 or 308).
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];

    REDIRECT_CODES.contains(&status)
}
226
227#[derive(Debug)]
228/// The base network manager.
229pub struct NetworkManager {
230    queued_events: VecDeque<NetworkEvent>,
231    ignore_httpserrors: bool,
232    requests: HashMap<RequestId, HttpRequest>,
233    // TODO put event in an Arc?
234    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
235    extra_headers: std::collections::HashMap<String, String>,
236    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
237    user_cache_disabled: bool,
238    attempted_authentications: HashSet<RequestId>,
239    credentials: Option<Credentials>,
240    pub(crate) user_request_interception_enabled: bool,
241    block_all: bool,
242    pub(crate) protocol_request_interception_enabled: bool,
243    /// The network is offline.
244    offline: bool,
245    /// The page request timeout.
246    pub request_timeout: Duration,
247    // made_request: bool,
248    /// Ignore visuals (no pings, prefetching, and etc).
249    pub ignore_visuals: bool,
250    /// Block CSS stylesheets.
251    pub block_stylesheets: bool,
252    /// Block javascript that is not critical to rendering.
253    pub block_javascript: bool,
254    /// Block analytics from rendering
255    pub block_analytics: bool,
256    /// Only html from loading.
257    pub only_html: bool,
258    /// Is xml document?
259    pub xml_document: bool,
260    /// The custom intercept handle logic to run on the website.
261    pub intercept_manager: NetworkInterceptManager,
262    /// Track the amount of times the document reloaded.
263    pub document_reload_tracker: u8,
264    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
265    pub document_target_url: String,
266    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
267    pub document_target_domain: String,
268    /// The max bytes to receive.
269    pub max_bytes_allowed: Option<u64>,
270    #[cfg(feature = "_cache")]
271    /// The cache site_key to use.
272    pub cache_site_key: Option<String>,
273    /// The cache policy to use.
274    #[cfg(feature = "_cache")]
275    pub cache_policy: Option<BasicCachePolicy>,
276    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
277    whitelist_patterns: Vec<String>,
278    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
279    whitelist_matcher: Option<AhoCorasick>,
280}
281
282impl NetworkManager {
283    /// A new network manager.
284    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
285        Self {
286            queued_events: Default::default(),
287            ignore_httpserrors,
288            requests: Default::default(),
289            requests_will_be_sent: Default::default(),
290            extra_headers: Default::default(),
291            request_id_to_interception_id: Default::default(),
292            user_cache_disabled: false,
293            attempted_authentications: Default::default(),
294            credentials: None,
295            block_all: false,
296            user_request_interception_enabled: false,
297            protocol_request_interception_enabled: false,
298            offline: false,
299            request_timeout,
300            ignore_visuals: false,
301            block_javascript: false,
302            block_stylesheets: false,
303            block_analytics: true,
304            only_html: false,
305            xml_document: false,
306            intercept_manager: NetworkInterceptManager::Unknown,
307            document_reload_tracker: 0,
308            document_target_url: String::new(),
309            document_target_domain: String::new(),
310            whitelist_patterns: Vec::new(),
311            whitelist_matcher: None,
312            max_bytes_allowed: None,
313            #[cfg(feature = "_cache")]
314            cache_site_key: None,
315            #[cfg(feature = "_cache")]
316            cache_policy: None,
317        }
318    }
319
320    /// Replace the whitelist patterns (compiled once).
321    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
322    where
323        I: IntoIterator<Item = S>,
324        S: Into<String>,
325    {
326        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
327        self.rebuild_whitelist_matcher();
328    }
329
330    /// Add one pattern (cheap) and rebuild (call this sparingly).
331    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
332        self.whitelist_patterns.push(pattern.into());
333        self.rebuild_whitelist_matcher();
334    }
335
336    /// Add many patterns and rebuild once.
337    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
338    where
339        I: IntoIterator<Item = S>,
340        S: Into<String>,
341    {
342        self.whitelist_patterns
343            .extend(patterns.into_iter().map(Into::into));
344        self.rebuild_whitelist_matcher();
345    }
346
347    #[inline]
348    fn rebuild_whitelist_matcher(&mut self) {
349        if self.whitelist_patterns.is_empty() {
350            self.whitelist_matcher = None;
351            return;
352        }
353
354        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
355
356        // If building fails (shouldn’t for simple patterns), just disable matcher.
357        self.whitelist_matcher = AhoCorasick::new(refs).ok();
358    }
359
360    #[inline]
361    fn is_whitelisted(&self, url: &str) -> bool {
362        self.whitelist_matcher
363            .as_ref()
364            .map(|m| m.is_match(url))
365            .unwrap_or(false)
366    }
367
368    /// Commands to init the chain with.
369    pub fn init_commands(&self) -> CommandChain {
370        let cmds = if self.ignore_httpserrors {
371            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
372        } else {
373            INIT_CHAIN.clone()
374        };
375        CommandChain::new(cmds, self.request_timeout)
376    }
377
378    /// Push the CDP request.
379    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
380        let method = cmd.identifier();
381        if let Ok(params) = serde_json::to_value(cmd) {
382            self.queued_events
383                .push_back(NetworkEvent::SendCdpRequest((method, params)));
384        }
385    }
386
387    /// The next event to handle.
388    pub fn poll(&mut self) -> Option<NetworkEvent> {
389        self.queued_events.pop_front()
390    }
391
392    /// Get the extra headers.
393    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
394        &self.extra_headers
395    }
396
397    /// Set extra HTTP headers.
398    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
399        self.extra_headers = headers;
400        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
401        self.extra_headers.remove("Proxy-Authorization");
402        if !self.extra_headers.is_empty() {
403            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
404                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
405            }
406        }
407    }
408
409    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
410        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
411    }
412
413    pub fn set_block_all(&mut self, block_all: bool) {
414        self.block_all = block_all;
415    }
416
417    pub fn set_request_interception(&mut self, enabled: bool) {
418        self.user_request_interception_enabled = enabled;
419        self.update_protocol_request_interception();
420    }
421
422    pub fn set_cache_enabled(&mut self, enabled: bool) {
423        let run = self.user_cache_disabled != !enabled;
424        self.user_cache_disabled = !enabled;
425        if run {
426            self.update_protocol_cache_disabled();
427        }
428    }
429
430    /// Enable fetch interception.
431    pub fn enable_request_intercept(&mut self) {
432        self.protocol_request_interception_enabled = true;
433    }
434
435    /// Disable fetch interception.
436    pub fn disable_request_intercept(&mut self) {
437        self.protocol_request_interception_enabled = false;
438    }
439
/// Store the site key used for cache lookups; `None` clears it.
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    self.cache_site_key = cache_site_key;
}
445
/// Store the policy that decides whether a cached item may be used; `None` clears it.
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    self.cache_policy = cache_policy;
}
451
452    pub fn update_protocol_cache_disabled(&mut self) {
453        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
454    }
455
456    pub fn authenticate(&mut self, credentials: Credentials) {
457        self.credentials = Some(credentials);
458        self.update_protocol_request_interception();
459        self.protocol_request_interception_enabled = true;
460    }
461
462    fn update_protocol_request_interception(&mut self) {
463        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
464
465        if enabled == self.protocol_request_interception_enabled {
466            return;
467        }
468
469        if enabled {
470            self.push_cdp_request(ENABLE_FETCH.clone())
471        } else {
472            self.push_cdp_request(DisableParams::default())
473        }
474    }
475
476    #[inline]
477    fn rel_for_ignore_script<'a>(&self, url: &'a str) -> Cow<'a, str> {
478        if url.starts_with('/') {
479            return Cow::Borrowed(url);
480        }
481
482        let base_raw = self.document_target_domain.as_str();
483
484        if base_raw.is_empty() {
485            return Cow::Borrowed(url);
486        }
487
488        let base = base_domain_from_any(base_raw).trim_end_matches('.');
489        if base.is_empty() {
490            return Cow::Borrowed(url);
491        }
492
493        let brand = first_label(base);
494
495        if let Some((host, rest)) = host_and_rest(url) {
496            if host_is_subdomain_of(host, base) || host_contains_label_icase(host, brand) {
497                return if rest.starts_with('/') {
498                    Cow::Borrowed(rest)
499                } else {
500                    Cow::Borrowed("/")
501                };
502            }
503        }
504
505        Cow::Borrowed(url)
506    }
507
508    /// Url matches analytics that we want to ignore or trackers.
509    #[inline]
510    pub(crate) fn ignore_script(
511        &self,
512        url: &str,
513        block_analytics: bool,
514        intercept_manager: NetworkInterceptManager,
515    ) -> bool {
516        // allow relative domains.
517        let mut ignore_script = !url.starts_with("/");
518
519        if !ignore_script
520            && block_analytics
521            && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
522        {
523            ignore_script = true;
524        }
525
526        if !ignore_script {
527            if let Some(index) = url.find("//") {
528                let pos = index + 2;
529
530                // Ensure there is something after `//`
531                if pos < url.len() {
532                    // Find the first slash after the `//`
533                    if let Some(slash_index) = url[pos..].find('/') {
534                        let base_path_index = pos + slash_index + 1;
535
536                        if url.len() > base_path_index {
537                            let new_url: &str = &url[base_path_index..];
538
539                            // ignore assets we do not need for frameworks
540                            if !ignore_script
541                                && intercept_manager == NetworkInterceptManager::Unknown
542                            {
543                                let hydration_file =
544                                    JS_FRAMEWORK_PATH.iter().any(|p| new_url.starts_with(p));
545
546                                // ignore astro paths
547                                if hydration_file && new_url.ends_with(".js") {
548                                    ignore_script = true;
549                                }
550                            }
551
552                            if !ignore_script
553                                && URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(new_url)
554                            {
555                                ignore_script = true;
556                            }
557
558                            if !ignore_script
559                                && self.ignore_visuals
560                                && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(new_url)
561                            {
562                                ignore_script = true;
563                            }
564                        }
565                    }
566                }
567            }
568        }
569
570        // fallback for file ending in analytics.js
571        if !ignore_script && block_analytics {
572            ignore_script = URL_IGNORE_TRIE_PATHS.contains_prefix(url);
573        }
574
575        ignore_script
576    }
577
578    /// Determine if the request should be skipped.
579    #[inline]
580    fn skip_xhr(
581        &self,
582        skip_networking: bool,
583        event: &EventRequestPaused,
584        network_event: bool,
585    ) -> bool {
586        // XHR check
587        if !skip_networking && network_event {
588            let request_url = event.request.url.as_str();
589
590            // check if part of ignore scripts.
591            let skip_analytics =
592                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
593
594            if skip_analytics {
595                true
596            } else if self.block_stylesheets || self.ignore_visuals {
597                let block_css = self.block_stylesheets;
598                let block_media = self.ignore_visuals;
599
600                let mut block_request = false;
601
602                if let Some(position) = request_url.rfind('.') {
603                    let hlen = request_url.len();
604                    let has_asset = hlen - position;
605
606                    if has_asset >= 3 {
607                        let next_position = position + 1;
608
609                        if block_media
610                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
611                                &request_url[next_position..].into(),
612                            )
613                        {
614                            block_request = true;
615                        } else if block_css {
616                            block_request =
617                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
618                                    .contains(&**CSS_EXTENSION)
619                        }
620                    }
621                }
622
623                if !block_request {
624                    block_request = ignore_script_xhr_media(request_url);
625                }
626
627                block_request
628            } else {
629                skip_networking
630            }
631        } else {
632            skip_networking
633        }
634    }
635
/// Detect if ad enabled.
///
/// A request that is already skipped stays skipped without running ad
/// detection; otherwise the result of `detect_ad` is returned.
#[cfg(feature = "adblock")]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    skip_networking || self.detect_ad(event)
}
646
647    /// When adblock feature is disabled, this is a no-op.
648    #[cfg(not(feature = "adblock"))]
649    #[inline]
650    fn detect_ad_if_enabled(&mut self, _event: &EventRequestPaused, skip_networking: bool) -> bool {
651        skip_networking
652    }
653
654    #[inline]
655    /// Fail request
656    fn fail_request_blocked(
657        &mut self,
658        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
659    ) {
660        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
661            request_id.clone(),
662            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
663        );
664        self.push_cdp_request(params);
665    }
666
667    #[inline]
668    /// Fulfill request
669    fn fulfill_request_empty_200(
670        &mut self,
671        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
672    ) {
673        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
674            request_id.clone(),
675            200,
676        );
677        self.push_cdp_request(params);
678    }
679
/// Fulfill a paused Fetch request from cached bytes + header map.
///
/// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
/// `body` is base64-encoded before it is placed in the command, and `status` is
/// used as the response code.
#[cfg(feature = "_cache")]
#[inline]
fn fulfill_request_from_cache(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    body: &[u8],
    headers: &std::collections::HashMap<String, String>,
    status: i64,
) {
    use crate::cdp::browser_protocol::fetch::HeaderEntry;
    use crate::handler::network::fetch::FulfillRequestParams;
    use base64::Engine;

    let response_headers: Vec<HeaderEntry> = headers
        .iter()
        .map(|(name, value)| HeaderEntry {
            name: name.clone().into(),
            value: value.clone().into(),
        })
        .collect();

    // TODO: have this already encoded prior.
    let encoded_body = base64::engine::general_purpose::STANDARD.encode(body);

    let mut params = FulfillRequestParams::new(request_id.clone(), status);
    params.response_headers = Some(response_headers);
    params.body = Some(encoded_body.into());

    self.push_cdp_request(params);
}
718
719    #[inline]
720    /// Continue the request url.
721    fn continue_request_with_url(
722        &mut self,
723        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
724        url: Option<&str>,
725        intercept_response: bool,
726    ) {
727        let mut params = ContinueRequestParams::new(request_id.clone());
728        if let Some(url) = url {
729            params.url = Some(url.to_string());
730            params.intercept_response = Some(intercept_response);
731        }
732        self.push_cdp_request(params);
733    }
734
735    /// Handle a paused fetch request: decide whether to block, fulfill or continue it.
736    #[inline]
737    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
738        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
739            return;
740        }
741
742        let resource_type = &event.resource_type;
743
744        if self.block_all {
745            tracing::debug!(
746                "Blocked (block_all): {:?} - {}",
747                event.resource_type,
748                event.request.url
749            );
750            return self.fail_request_blocked(&event.request_id);
751        }
752
753        // // If both interceptions are enabled, do nothing.
754        // if !self.user_request_interception_enabled && self.protocol_request_interception_enabled {
755        //     self.push_cdp_request(ContinueRequestParams::new(event.request_id.clone()))
756        // }
757
758        if let Some(network_id) = event.network_id.as_ref() {
759            if let Some(request_will_be_sent) =
760                self.requests_will_be_sent.remove(network_id.as_ref())
761            {
762                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
763            } else {
764                self.request_id_to_interception_id
765                    .insert(network_id.clone(), event.request_id.clone().into());
766            }
767        }
768
769        // From here on, we handle the full decision tree.
770        let javascript_resource = *resource_type == ResourceType::Script;
771        let document_resource = *resource_type == ResourceType::Document;
772        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
773
774        // Start with static / cheap skip checks.
775        let mut skip_networking =
776            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
777
778        // Also short-circuit if we've reloaded this document too many times.
779        if !skip_networking {
780            skip_networking = self.document_reload_tracker >= 3;
781        }
782
783        // Handle document redirect / masking and track xml documents.
784        let (current_url_cow, had_replacer) =
785            self.handle_document_replacement_and_tracking(event, document_resource);
786
787        let current_url: &str = current_url_cow.as_ref();
788
789        // Main initial check (visuals, stylesheets, simple JS blocking).
790        if !skip_networking {
791            // Allow XSL for sitemap XML.
792            if self.xml_document && current_url.ends_with(".xsl") {
793                skip_networking = false;
794            } else {
795                skip_networking = self.should_skip_for_visuals_and_basic_js(
796                    resource_type,
797                    javascript_resource,
798                    current_url,
799                );
800            }
801        }
802
803        // Ad blocking (only active when feature = "adblock").
804        skip_networking = self.detect_ad_if_enabled(event, skip_networking);
805
806        // Ignore embedded scripts when only_html or ignore_visuals is set.
807        if !skip_networking
808            && (self.only_html || self.ignore_visuals)
809            && (javascript_resource || document_resource)
810        {
811            skip_networking = ignore_script_embedded(current_url);
812        }
813
814        // Analytics check for JS.
815        if skip_networking && javascript_resource {
816            let rel = self.rel_for_ignore_script(current_url);
817            skip_networking =
818                self.ignore_script(rel.as_ref(), self.block_analytics, self.intercept_manager);
819        }
820
821        // XHR / data resources.
822        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
823
824        // Custom interception layer.
825        if !skip_networking && (javascript_resource || network_resource || document_resource) {
826            skip_networking = self.intercept_manager.intercept_detection(
827                current_url,
828                self.ignore_visuals,
829                network_resource,
830            );
831        }
832
833        // Custom website block list.
834        if !skip_networking && (javascript_resource || network_resource) {
835            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
836        }
837
838        // whitelist 3rd party
839        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
840        {
841            skip_networking = false;
842        }
843
844        // check if the url is in the whitelist.
845        if skip_networking && self.is_whitelisted(current_url) {
846            skip_networking = false;
847        }
848
849        if skip_networking {
850            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
851            self.fulfill_request_empty_200(&event.request_id);
852        } else {
853            #[cfg(feature = "_cache")]
854            {
855                if let (Some(policy), Some(cache_site_key)) =
856                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
857                {
858                    let current_url = format!("{}:{}", event.request.method, &current_url);
859
860                    if let Some((res, cache_policy)) =
861                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
862                    {
863                        if policy.allows_cached(&cache_policy) {
864                            tracing::debug!(
865                                "Remote Cached: {:?} - {}",
866                                resource_type,
867                                &current_url
868                            );
869                            return self.fulfill_request_from_cache(
870                                &event.request_id,
871                                &res.body,
872                                &res.headers,
873                                res.status as i64,
874                            );
875                        }
876                    }
877                }
878            }
879
880            // check our frame cache for the run.
881            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
882            self.continue_request_with_url(
883                &event.request_id,
884                if had_replacer {
885                    Some(current_url)
886                } else {
887                    None
888                },
889                !had_replacer,
890            );
891        }
892    }
893
894    /// Does the network manager have a target domain?
895    pub fn has_target_domain(&self) -> bool {
896        !self.document_target_url.is_empty()
897    }
898
899    /// Set the target page url for tracking.
900    pub fn set_page_url(&mut self, page_target_url: String) {
901        let host_base = host_and_rest(&page_target_url)
902            .map(|(h, _)| base_domain_from_host(h))
903            .unwrap_or("");
904
905        self.document_target_domain = host_base.to_string();
906        self.document_target_url = page_target_url;
907    }
908
909    /// Clear the initial target domain on every navigation.
910    pub fn clear_target_domain(&mut self) {
911        self.document_reload_tracker = 0;
912        self.document_target_url = Default::default();
913        self.document_target_domain = Default::default();
914    }
915    /// Handles:
916    /// - document reload tracking (`document_reload_tracker`)
917    /// - redirect masking / replacement
918    /// - xml document detection (`xml_document`)
919    /// - `document_target_url` updates
920    ///
921    /// Returns (current_url, had_replacer).
922    #[inline]
923    fn handle_document_replacement_and_tracking<'a>(
924        &mut self,
925        event: &'a EventRequestPaused,
926        document_resource: bool,
927    ) -> (Cow<'a, str>, bool) {
928        let mut replacer: Option<String> = None;
929        let current_url = event.request.url.as_str();
930
931        if document_resource {
932            if self.document_target_url == current_url {
933                self.document_reload_tracker += 1;
934            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
935            {
936                let (http_document_replacement, mut https_document_replacement) =
937                    if self.document_target_url.starts_with("http://") {
938                        (
939                            self.document_target_url.replacen("http://", "http//", 1),
940                            self.document_target_url.replacen("http://", "https://", 1),
941                        )
942                    } else {
943                        (
944                            self.document_target_url.replacen("https://", "https//", 1),
945                            self.document_target_url.replacen("https://", "http://", 1),
946                        )
947                    };
948
949                // Track trailing slash to restore later.
950                let trailing = https_document_replacement.ends_with('/');
951                if trailing {
952                    https_document_replacement.pop();
953                }
954                if https_document_replacement.ends_with('/') {
955                    https_document_replacement.pop();
956                }
957
958                let redirect_mask = format!(
959                    "{}{}",
960                    https_document_replacement, http_document_replacement
961                );
962
963                if current_url == redirect_mask {
964                    replacer = Some(if trailing {
965                        format!("{}/", https_document_replacement)
966                    } else {
967                        https_document_replacement
968                    });
969                }
970            }
971
972            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
973                self.xml_document = true;
974            }
975
976            // Track last seen document URL.
977            self.document_target_url = event.request.url.clone();
978            self.document_target_domain = host_and_rest(&self.document_target_url)
979                .map(|(h, _)| base_domain_from_host(h).to_string())
980                .unwrap_or_default();
981        }
982
983        let current_url_cow = match replacer {
984            Some(r) => Cow::Owned(r),
985            None => Cow::Borrowed(event.request.url.as_str()),
986        };
987
988        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
989        (current_url_cow, had_replacer)
990    }
991
992    /// Shared "visuals + basic JS blocking" logic.
993    #[inline]
994    fn should_skip_for_visuals_and_basic_js(
995        &self,
996        resource_type: &ResourceType,
997        javascript_resource: bool,
998        current_url: &str,
999    ) -> bool {
1000        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1001            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1002            || (self.block_javascript
1003                && javascript_resource
1004                && self.intercept_manager == NetworkInterceptManager::Unknown
1005                && !ALLOWED_MATCHER.is_match(current_url))
1006    }
1007
1008    /// Perform a page intercept for chrome
1009    #[cfg(feature = "adblock")]
1010    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1011        use adblock::{
1012            lists::{FilterSet, ParseOptions, RuleTypes},
1013            Engine,
1014        };
1015
1016        lazy_static::lazy_static! {
1017            static ref AD_ENGINE: Engine = {
1018                let mut filter_set = FilterSet::new(false);
1019                let mut rules = ParseOptions::default();
1020                rules.rule_types = RuleTypes::All;
1021
1022                filter_set.add_filters(
1023                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1024                    rules,
1025                );
1026
1027                Engine::from_filter_set(filter_set, true)
1028            };
1029        };
1030
1031        let blockable = ResourceType::Image == event.resource_type
1032            || event.resource_type == ResourceType::Media
1033            || event.resource_type == ResourceType::Stylesheet
1034            || event.resource_type == ResourceType::Document
1035            || event.resource_type == ResourceType::Fetch
1036            || event.resource_type == ResourceType::Xhr;
1037
1038        let u = &event.request.url;
1039
1040        let block_request = blockable
1041            // set it to example.com for 3rd party handling is_same_site
1042        && {
1043            let request = adblock::request::Request::preparsed(
1044                 &u,
1045                 "example.com",
1046                 "example.com",
1047                 &event.resource_type.as_ref().to_lowercase(),
1048                 !event.request.is_same_site.unwrap_or_default());
1049
1050            AD_ENGINE.check_network_request(&request).matched
1051        };
1052
1053        block_request
1054    }
1055
1056    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1057        let response = if self
1058            .attempted_authentications
1059            .contains(event.request_id.as_ref())
1060        {
1061            AuthChallengeResponseResponse::CancelAuth
1062        } else if self.credentials.is_some() {
1063            self.attempted_authentications
1064                .insert(event.request_id.clone().into());
1065            AuthChallengeResponseResponse::ProvideCredentials
1066        } else {
1067            AuthChallengeResponseResponse::Default
1068        };
1069
1070        let mut auth = AuthChallengeResponse::new(response);
1071        if let Some(creds) = self.credentials.clone() {
1072            auth.username = Some(creds.username);
1073            auth.password = Some(creds.password);
1074        }
1075        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1076    }
1077
1078    /// Set the page offline network emulation condition.
1079    pub fn set_offline_mode(&mut self, value: bool) {
1080        if self.offline == value {
1081            return;
1082        }
1083        self.offline = value;
1084        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1085            .offline(self.offline)
1086            .latency(0)
1087            .download_throughput(-1.)
1088            .upload_throughput(-1.)
1089            .build()
1090        {
1091            self.push_cdp_request(network);
1092        }
1093    }
1094
1095    /// Request interception doesn't happen for data URLs with Network Service.
1096    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1097        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1098            if let Some(interception_id) = self
1099                .request_id_to_interception_id
1100                .remove(event.request_id.as_ref())
1101            {
1102                self.on_request(event, Some(interception_id));
1103            } else {
1104                // TODO remove the clone for event
1105                self.requests_will_be_sent
1106                    .insert(event.request_id.clone(), event.clone());
1107            }
1108        } else {
1109            self.on_request(event, None);
1110        }
1111    }
1112
1113    /// The request was served from the cache.
1114    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1115        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1116            request.from_memory_cache = true;
1117        }
1118    }
1119
1120    /// On network response received.
1121    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1122        let mut request_failed = false;
1123
1124        // Track how many bytes we actually deducted from this target.
1125        let mut deducted: u64 = 0;
1126
1127        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1128            let before = *max_bytes;
1129
1130            // encoded_data_length -> saturating cast to u64
1131            let received_bytes: u64 = event.response.encoded_data_length as u64;
1132
1133            // Safe parse of Content-Length
1134            let content_length: Option<u64> = event
1135                .response
1136                .headers
1137                .inner()
1138                .get("content-length")
1139                .and_then(|v| v.as_str())
1140                .and_then(|s| s.trim().parse::<u64>().ok());
1141
1142            // Deduct what we actually received
1143            *max_bytes = max_bytes.saturating_sub(received_bytes);
1144
1145            // If the declared size can't fit, zero out now
1146            if let Some(cl) = content_length {
1147                if cl > *max_bytes {
1148                    *max_bytes = 0;
1149                }
1150            }
1151
1152            request_failed = *max_bytes == 0;
1153
1154            // Compute exact delta deducted on this event
1155            deducted = before.saturating_sub(*max_bytes);
1156        }
1157
1158        // Bubble up the deduction (even if request continues)
1159        if deducted > 0 {
1160            self.queued_events
1161                .push_back(NetworkEvent::BytesConsumed(deducted));
1162        }
1163
1164        // block all network request moving forward.
1165        if request_failed && self.max_bytes_allowed.is_some() {
1166            self.set_block_all(true);
1167        }
1168
1169        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1170            request.set_response(event.response.clone());
1171            self.queued_events.push_back(if request_failed {
1172                NetworkEvent::RequestFailed(request)
1173            } else {
1174                NetworkEvent::RequestFinished(request)
1175            });
1176        }
1177    }
1178
1179    /// On network loading finished.
1180    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1181        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1182            if let Some(interception_id) = request.interception_id.as_ref() {
1183                self.attempted_authentications
1184                    .remove(interception_id.as_ref());
1185            }
1186            self.queued_events
1187                .push_back(NetworkEvent::RequestFinished(request));
1188        }
1189    }
1190
1191    /// On network loading failed.
1192    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1193        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1194            request.failure_text = Some(event.error_text.clone());
1195            if let Some(interception_id) = request.interception_id.as_ref() {
1196                self.attempted_authentications
1197                    .remove(interception_id.as_ref());
1198            }
1199            self.queued_events
1200                .push_back(NetworkEvent::RequestFailed(request));
1201        }
1202    }
1203
1204    /// On request will be sent.
1205    fn on_request(
1206        &mut self,
1207        event: &EventRequestWillBeSent,
1208        interception_id: Option<InterceptionId>,
1209    ) {
1210        let mut redirect_chain = Vec::new();
1211        let mut redirect_location = None;
1212
1213        if let Some(redirect_resp) = &event.redirect_response {
1214            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1215                if is_redirect_status(redirect_resp.status) {
1216                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1217                        if redirect_resp.url != location {
1218                            let fixed_location = location.replace(&redirect_resp.url, "");
1219
1220                            if !fixed_location.is_empty() {
1221                                request.response.as_mut().map(|resp| {
1222                                    resp.headers.0["Location"] =
1223                                        serde_json::Value::String(fixed_location.clone());
1224                                });
1225                            }
1226
1227                            redirect_location = Some(fixed_location);
1228                        }
1229                    }
1230                }
1231
1232                self.handle_request_redirect(
1233                    &mut request,
1234                    if let Some(redirect_location) = redirect_location {
1235                        let mut redirect_resp = redirect_resp.clone();
1236
1237                        if !redirect_location.is_empty() {
1238                            redirect_resp.headers.0["Location"] =
1239                                serde_json::Value::String(redirect_location);
1240                        }
1241
1242                        redirect_resp
1243                    } else {
1244                        redirect_resp.clone()
1245                    },
1246                );
1247
1248                redirect_chain = std::mem::take(&mut request.redirect_chain);
1249                redirect_chain.push(request);
1250            }
1251        }
1252
1253        let request = HttpRequest::new(
1254            event.request_id.clone(),
1255            event.frame_id.clone(),
1256            interception_id,
1257            self.user_request_interception_enabled,
1258            redirect_chain,
1259        );
1260
1261        self.requests.insert(event.request_id.clone(), request);
1262        self.queued_events
1263            .push_back(NetworkEvent::Request(event.request_id.clone()));
1264    }
1265
1266    /// Handle request redirect.
1267    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1268        request.set_response(response);
1269        if let Some(interception_id) = request.interception_id.as_ref() {
1270            self.attempted_authentications
1271                .remove(interception_id.as_ref());
1272        }
1273    }
1274}
1275
1276#[derive(Debug)]
1277pub enum NetworkEvent {
1278    /// Send a CDP request.
1279    SendCdpRequest((MethodId, serde_json::Value)),
1280    /// Request.
1281    Request(RequestId),
1282    /// Response
1283    Response(RequestId),
1284    /// Request failed.
1285    RequestFailed(HttpRequest),
1286    /// Request finished.
1287    RequestFinished(HttpRequest),
1288    /// Bytes consumed.
1289    BytesConsumed(u64),
1290}
1291
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;

    /// Exercises the third-party allow-list matcher with URLs that must and
    /// must not be allowed.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        let is_allowed = |url: &str| ALLOWED_MATCHER_3RD_PARTY.is_match(url);

        // Matches the "/cdn-cgi/challenge-platform/" allow pattern.
        assert!(
            is_allowed("https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e"),
            "expected Cloudflare challenge script to be allowed"
        );

        // Not present in the allow-list, so it must stay blocked.
        assert!(
            !is_allowed("https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015"),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // Sanity checks for existing allow patterns.
        let known_allowed = [
            "https://js.stripe.com/v3/",
            "https://www.google.com/recaptcha/api.js?render=explicit",
            "https://code.jquery.com/jquery-3.7.1.min.js",
        ];
        for url in known_allowed.iter() {
            assert!(is_allowed(url));
        }
    }
}