chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{
11    base_domain_from_any, base_domain_from_host, first_label, host_and_rest,
12    host_contains_label_icase, host_is_subdomain_of,
13};
14use aho_corasick::AhoCorasick;
15use case_insensitive_string::CaseInsensitiveString;
16use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
17use chromiumoxide_cdp::cdp::browser_protocol::network::{
18    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
19    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
20    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
21    SetExtraHttpHeadersParams,
22};
23use chromiumoxide_cdp::cdp::browser_protocol::{
24    fetch::{
25        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
26        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
27    },
28    network::SetBypassServiceWorkerParams,
29};
30use chromiumoxide_cdp::cdp::browser_protocol::{
31    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
32};
33use chromiumoxide_types::{Command, Method, MethodId};
34use hashbrown::{HashMap, HashSet};
35use lazy_static::lazy_static;
36use reqwest::header::PROXY_AUTHORIZATION;
37use spider_network_blocker::intercept_manager::NetworkInterceptManager;
38pub use spider_network_blocker::scripts::{
39    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
40};
41use std::borrow::Cow;
42use std::collections::VecDeque;
43use std::time::Duration;
44
45lazy_static! {
46    /// General patterns for popular libraries and resources
47    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
48        "jquery",           // Covers jquery.min.js, jquery.js, etc.
49        "angular",
50        "react",            // Covers all React-related patterns
51        "vue",              // Covers all Vue-related patterns
52        "bootstrap",
53        "d3",
54        "lodash",
55        "ajax",
56        "application",
57        "app",              // Covers general app scripts like app.js
58        "main",
59        "index",
60        "bundle",
61        "vendor",
62        "runtime",
63        "polyfill",
64        "scripts",
65        "es2015.",
66        "es2020.",
67        "webpack",
68        "/cdn-cgi/challenge-platform/",
69        "/wp-content/js/",  // Covers Wordpress content
70        // Verified 3rd parties for request
71        "https://m.stripe.network/",
72        "https://challenges.cloudflare.com/",
73        "https://www.google.com/recaptcha/enterprise.js",
74        "https://www.google.com/recaptcha/api.js",
75        "https://google.com/recaptcha/api.js",
76        "https://captcha.px-cloud.net/",
77        "https://cdn.auth0.com/js/lock/",
78        "https://cdn.auth0.com/client",
79        "https://js.stripe.com/",
80        "https://cdn.prod.website-files.com/", // webflow cdn scripts
81        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
82        "https://code.jquery.com/jquery-"
83    ];
84
85    /// Determine if a script should be rendered in the browser by name.
86    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
87
88    /// General patterns for popular libraries and resources
89    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
90        // Verified 3rd parties for request
91        "https://m.stripe.network/",
92        "https://challenges.cloudflare.com/",
93        "https://www.google.com/recaptcha/api.js",
94        "https://google.com/recaptcha/api.js",
95        "https://www.google.com/recaptcha/enterprise.js",
96        "https://js.stripe.com/",
97        "https://cdn.prod.website-files.com/", // webflow cdn scripts
98        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
99        "https://code.jquery.com/jquery-",
100        "https://ct.captcha-delivery.com/",
101        "https://geo.captcha-delivery.com/captcha/",
102        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
103        "https://ct.captcha-delivery.com/",
104        "https://cdn.auth0.com/client",
105        "https://captcha.px-cloud.net/",
106        "https://static.intercomassets.com/", // help pages
107        "/cdn-cgi/challenge-platform/"
108    ];
109
110    /// Determine if a script should be rendered in the browser by name.
111    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
112
113    /// path of a js framework
114    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
115        phf::phf_set! {
116            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
117            "_astro/", "_app/immutable"
118        }
119    };
120
121    /// Ignore the content types.
122    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
123        "application/pdf",
124        "application/zip",
125        "application/x-rar-compressed",
126        "application/x-tar",
127        "image/png",
128        "image/jpeg",
129        "image/gif",
130        "image/bmp",
131        "image/webp",
132        "image/svg+xml",
133        "video/mp4",
134        "video/x-msvideo",
135        "video/x-matroska",
136        "video/webm",
137        "audio/mpeg",
138        "audio/ogg",
139        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
140        "application/vnd.ms-excel",
141        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
142        "application/vnd.ms-powerpoint",
143        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
144        "application/x-7z-compressed",
145        "application/x-rpm",
146        "application/x-shockwave-flash",
147        "application/rtf",
148    };
149
150    /// Ignore the resources for visual content types.
151    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
152        "Image",
153        "Media",
154        "Font"
155    };
156
157    /// Ignore the resources for visual content types.
158    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
159        "CspViolationReport",
160        "Manifest",
161        "Other",
162        "Prefetch",
163        "Ping",
164    };
165
166    /// Case insenstive css matching
167    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
168
169    /// The command chain.
170    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
171        let enable = EnableParams::default();
172
173        if let Ok(c) = serde_json::to_value(&enable) {
174            vec![(enable.identifier(), c)]
175        } else {
176            vec![]
177        }
178    };
179
180    /// The command chain with https ignore.
181    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
182        let enable = EnableParams::default();
183        let mut v = vec![];
184        if let Ok(c) = serde_json::to_value(&enable) {
185            v.push((enable.identifier(), c));
186        }
187        let ignore = SetIgnoreCertificateErrorsParams::new(true);
188        if let Ok(ignored) = serde_json::to_value(&ignore) {
189            v.push((ignore.identifier(), ignored));
190        }
191
192        v
193    };
194
195    /// Enable the fetch intercept command
196    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
197        fetch::EnableParams::builder()
198        .handle_auth_requests(true)
199        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
200        .build()
201    };
202}
203
/// Report whether `status` is one of the HTTP redirect codes handled here:
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];

    REDIRECT_CODES.contains(&status)
}
208
209#[derive(Debug)]
210/// The base network manager.
211pub struct NetworkManager {
212    queued_events: VecDeque<NetworkEvent>,
213    ignore_httpserrors: bool,
214    requests: HashMap<RequestId, HttpRequest>,
215    // TODO put event in an Arc?
216    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
217    extra_headers: std::collections::HashMap<String, String>,
218    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
219    user_cache_disabled: bool,
220    attempted_authentications: HashSet<RequestId>,
221    credentials: Option<Credentials>,
222    pub(crate) user_request_interception_enabled: bool,
223    block_all: bool,
224    pub(crate) protocol_request_interception_enabled: bool,
225    /// The network is offline.
226    offline: bool,
227    /// The page request timeout.
228    pub request_timeout: Duration,
229    // made_request: bool,
230    /// Ignore visuals (no pings, prefetching, and etc).
231    pub ignore_visuals: bool,
232    /// Block CSS stylesheets.
233    pub block_stylesheets: bool,
234    /// Block javascript that is not critical to rendering.
235    pub block_javascript: bool,
236    /// Block analytics from rendering
237    pub block_analytics: bool,
238    /// Only html from loading.
239    pub only_html: bool,
240    /// Is xml document?
241    pub xml_document: bool,
242    /// The custom intercept handle logic to run on the website.
243    pub intercept_manager: NetworkInterceptManager,
244    /// Track the amount of times the document reloaded.
245    pub document_reload_tracker: u8,
246    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
247    pub document_target_url: String,
248    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
249    pub document_target_domain: String,
250    /// The max bytes to receive.
251    pub max_bytes_allowed: Option<u64>,
252    #[cfg(feature = "_cache")]
253    /// The cache site_key to use.
254    pub cache_site_key: Option<String>,
255    /// The cache policy to use.
256    #[cfg(feature = "_cache")]
257    pub cache_policy: Option<BasicCachePolicy>,
258    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
259    whitelist_patterns: Vec<String>,
260    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
261    whitelist_matcher: Option<AhoCorasick>,
262}
263
264impl NetworkManager {
265    /// A new network manager.
266    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
267        Self {
268            queued_events: Default::default(),
269            ignore_httpserrors,
270            requests: Default::default(),
271            requests_will_be_sent: Default::default(),
272            extra_headers: Default::default(),
273            request_id_to_interception_id: Default::default(),
274            user_cache_disabled: false,
275            attempted_authentications: Default::default(),
276            credentials: None,
277            block_all: false,
278            user_request_interception_enabled: false,
279            protocol_request_interception_enabled: false,
280            offline: false,
281            request_timeout,
282            ignore_visuals: false,
283            block_javascript: false,
284            block_stylesheets: false,
285            block_analytics: true,
286            only_html: false,
287            xml_document: false,
288            intercept_manager: NetworkInterceptManager::Unknown,
289            document_reload_tracker: 0,
290            document_target_url: String::new(),
291            document_target_domain: String::new(),
292            whitelist_patterns: Vec::new(),
293            whitelist_matcher: None,
294            max_bytes_allowed: None,
295            #[cfg(feature = "_cache")]
296            cache_site_key: None,
297            #[cfg(feature = "_cache")]
298            cache_policy: None,
299        }
300    }
301
302    /// Replace the whitelist patterns (compiled once).
303    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
304    where
305        I: IntoIterator<Item = S>,
306        S: Into<String>,
307    {
308        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
309        self.rebuild_whitelist_matcher();
310    }
311
312    /// Add one pattern (cheap) and rebuild (call this sparingly).
313    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
314        self.whitelist_patterns.push(pattern.into());
315        self.rebuild_whitelist_matcher();
316    }
317
318    /// Add many patterns and rebuild once.
319    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
320    where
321        I: IntoIterator<Item = S>,
322        S: Into<String>,
323    {
324        self.whitelist_patterns
325            .extend(patterns.into_iter().map(Into::into));
326        self.rebuild_whitelist_matcher();
327    }
328
329    #[inline]
330    fn rebuild_whitelist_matcher(&mut self) {
331        if self.whitelist_patterns.is_empty() {
332            self.whitelist_matcher = None;
333            return;
334        }
335
336        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
337
338        // If building fails (shouldn’t for simple patterns), just disable matcher.
339        self.whitelist_matcher = AhoCorasick::new(refs).ok();
340    }
341
342    #[inline]
343    fn is_whitelisted(&self, url: &str) -> bool {
344        self.whitelist_matcher
345            .as_ref()
346            .map(|m| m.is_match(url))
347            .unwrap_or(false)
348    }
349
350    /// Commands to init the chain with.
351    pub fn init_commands(&self) -> CommandChain {
352        let cmds = if self.ignore_httpserrors {
353            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
354        } else {
355            INIT_CHAIN.clone()
356        };
357        CommandChain::new(cmds, self.request_timeout)
358    }
359
360    /// Push the CDP request.
361    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
362        let method = cmd.identifier();
363        if let Ok(params) = serde_json::to_value(cmd) {
364            self.queued_events
365                .push_back(NetworkEvent::SendCdpRequest((method, params)));
366        }
367    }
368
369    /// The next event to handle.
370    pub fn poll(&mut self) -> Option<NetworkEvent> {
371        self.queued_events.pop_front()
372    }
373
374    /// Get the extra headers.
375    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
376        &self.extra_headers
377    }
378
379    /// Set extra HTTP headers.
380    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
381        self.extra_headers = headers;
382        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
383        self.extra_headers.remove("Proxy-Authorization");
384        if !self.extra_headers.is_empty() {
385            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
386                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
387            }
388        }
389    }
390
391    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
392        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
393    }
394
395    pub fn set_block_all(&mut self, block_all: bool) {
396        self.block_all = block_all;
397    }
398
399    pub fn set_request_interception(&mut self, enabled: bool) {
400        self.user_request_interception_enabled = enabled;
401        self.update_protocol_request_interception();
402    }
403
404    pub fn set_cache_enabled(&mut self, enabled: bool) {
405        let run = self.user_cache_disabled != !enabled;
406        self.user_cache_disabled = !enabled;
407        if run {
408            self.update_protocol_cache_disabled();
409        }
410    }
411
412    /// Enable fetch interception.
413    pub fn enable_request_intercept(&mut self) {
414        self.protocol_request_interception_enabled = true;
415    }
416
417    /// Disable fetch interception.
418    pub fn disable_request_intercept(&mut self) {
419        self.protocol_request_interception_enabled = false;
420    }
421
422    /// Set the cache site key.
423    #[cfg(feature = "_cache")]
424    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
425        self.cache_site_key = cache_site_key;
426    }
427
428    /// Set the cache policy.
429    #[cfg(feature = "_cache")]
430    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
431        self.cache_policy = cache_policy;
432    }
433
434    pub fn update_protocol_cache_disabled(&mut self) {
435        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
436    }
437
438    pub fn authenticate(&mut self, credentials: Credentials) {
439        self.credentials = Some(credentials);
440        self.update_protocol_request_interception();
441        self.protocol_request_interception_enabled = true;
442    }
443
444    fn update_protocol_request_interception(&mut self) {
445        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
446
447        if enabled == self.protocol_request_interception_enabled {
448            return;
449        }
450
451        if enabled {
452            self.push_cdp_request(ENABLE_FETCH.clone())
453        } else {
454            self.push_cdp_request(DisableParams::default())
455        }
456    }
457
458    #[inline]
459    fn rel_for_ignore_script<'a>(&self, url: &'a str) -> Cow<'a, str> {
460        if url.starts_with('/') {
461            return Cow::Borrowed(url);
462        }
463
464        let base_raw = self.document_target_domain.as_str();
465
466        if base_raw.is_empty() {
467            return Cow::Borrowed(url);
468        }
469
470        let base = base_domain_from_any(base_raw).trim_end_matches('.');
471        if base.is_empty() {
472            return Cow::Borrowed(url);
473        }
474
475        let brand = first_label(base);
476
477        if let Some((host, rest)) = host_and_rest(url) {
478            if host_is_subdomain_of(host, base) || host_contains_label_icase(host, brand) {
479                return if rest.starts_with('/') {
480                    Cow::Borrowed(rest)
481                } else {
482                    Cow::Borrowed("/")
483                };
484            }
485        }
486
487        Cow::Borrowed(url)
488    }
489
490    /// Url matches analytics that we want to ignore or trackers.
491    #[inline]
492    pub(crate) fn ignore_script(
493        &self,
494        url: &str,
495        block_analytics: bool,
496        intercept_manager: NetworkInterceptManager,
497    ) -> bool {
498        // allow relative domains.
499        let mut ignore_script = !url.starts_with("/");
500
501        if !ignore_script
502            && block_analytics
503            && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
504        {
505            ignore_script = true;
506        }
507
508        if !ignore_script {
509            if let Some(index) = url.find("//") {
510                let pos = index + 2;
511
512                // Ensure there is something after `//`
513                if pos < url.len() {
514                    // Find the first slash after the `//`
515                    if let Some(slash_index) = url[pos..].find('/') {
516                        let base_path_index = pos + slash_index + 1;
517
518                        if url.len() > base_path_index {
519                            let new_url: &str = &url[base_path_index..];
520
521                            // ignore assets we do not need for frameworks
522                            if !ignore_script
523                                && intercept_manager == NetworkInterceptManager::Unknown
524                            {
525                                let hydration_file =
526                                    JS_FRAMEWORK_PATH.iter().any(|p| new_url.starts_with(p));
527
528                                // ignore astro paths
529                                if hydration_file && new_url.ends_with(".js") {
530                                    ignore_script = true;
531                                }
532                            }
533
534                            if !ignore_script
535                                && URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(new_url)
536                            {
537                                ignore_script = true;
538                            }
539
540                            if !ignore_script
541                                && self.ignore_visuals
542                                && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(new_url)
543                            {
544                                ignore_script = true;
545                            }
546                        }
547                    }
548                }
549            }
550        }
551
552        // fallback for file ending in analytics.js
553        if !ignore_script && block_analytics {
554            ignore_script = URL_IGNORE_TRIE_PATHS.contains_prefix(url);
555        }
556
557        ignore_script
558    }
559
560    /// Determine if the request should be skipped.
561    #[inline]
562    fn skip_xhr(
563        &self,
564        skip_networking: bool,
565        event: &EventRequestPaused,
566        network_event: bool,
567    ) -> bool {
568        // XHR check
569        if !skip_networking && network_event {
570            let request_url = event.request.url.as_str();
571
572            // check if part of ignore scripts.
573            let skip_analytics =
574                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
575
576            if skip_analytics {
577                true
578            } else if self.block_stylesheets || self.ignore_visuals {
579                let block_css = self.block_stylesheets;
580                let block_media = self.ignore_visuals;
581
582                let mut block_request = false;
583
584                if let Some(position) = request_url.rfind('.') {
585                    let hlen = request_url.len();
586                    let has_asset = hlen - position;
587
588                    if has_asset >= 3 {
589                        let next_position = position + 1;
590
591                        if block_media
592                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
593                                &request_url[next_position..].into(),
594                            )
595                        {
596                            block_request = true;
597                        } else if block_css {
598                            block_request =
599                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
600                                    .contains(&**CSS_EXTENSION)
601                        }
602                    }
603                }
604
605                if !block_request {
606                    block_request = ignore_script_xhr_media(request_url);
607                }
608
609                block_request
610            } else {
611                skip_networking
612            }
613        } else {
614            skip_networking
615        }
616    }
617
618    #[cfg(feature = "adblock")]
619    #[inline]
620    /// Detect if ad enabled.
621    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
622        if skip_networking {
623            true
624        } else {
625            self.detect_ad(event)
626        }
627    }
628
629    /// When adblock feature is disabled, this is a no-op.
630    #[cfg(not(feature = "adblock"))]
631    #[inline]
632    fn detect_ad_if_enabled(&mut self, _event: &EventRequestPaused, skip_networking: bool) -> bool {
633        skip_networking
634    }
635
636    #[inline]
637    /// Fail request
638    fn fail_request_blocked(
639        &mut self,
640        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
641    ) {
642        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
643            request_id.clone(),
644            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
645        );
646        self.push_cdp_request(params);
647    }
648
649    #[inline]
650    /// Fulfill request
651    fn fulfill_request_empty_200(
652        &mut self,
653        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
654    ) {
655        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
656            request_id.clone(),
657            200,
658        );
659        self.push_cdp_request(params);
660    }
661
662    #[cfg(feature = "_cache")]
663    #[inline]
664    /// Fulfill a paused Fetch request from cached bytes + header map.
665    ///
666    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
667    fn fulfill_request_from_cache(
668        &mut self,
669        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
670        body: &[u8],
671        headers: &std::collections::HashMap<String, String>,
672        status: i64,
673    ) {
674        use crate::cdp::browser_protocol::fetch::HeaderEntry;
675        use crate::handler::network::fetch::FulfillRequestParams;
676        use base64::Engine;
677
678        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
679
680        for (k, v) in headers.iter() {
681            resp_headers.push(HeaderEntry {
682                name: k.clone().into(),
683                value: v.clone().into(),
684            });
685        }
686
687        let mut params = FulfillRequestParams::new(request_id.clone(), status);
688
689        // TODO: have this already encoded prior.
690        params.body = Some(
691            base64::engine::general_purpose::STANDARD
692                .encode(body)
693                .into(),
694        );
695
696        params.response_headers = Some(resp_headers);
697
698        self.push_cdp_request(params);
699    }
700
701    #[inline]
702    /// Continue the request url.
703    fn continue_request_with_url(
704        &mut self,
705        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
706        url: Option<&str>,
707        intercept_response: bool,
708    ) {
709        let mut params = ContinueRequestParams::new(request_id.clone());
710        if let Some(url) = url {
711            params.url = Some(url.to_string());
712            params.intercept_response = Some(intercept_response);
713        }
714        self.push_cdp_request(params);
715    }
716
    /// On fetch request paused interception.
718    #[inline]
719    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
720        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
721            return;
722        }
723
724        let resource_type = &event.resource_type;
725
726        if self.block_all {
727            tracing::debug!(
728                "Blocked (block_all): {:?} - {}",
729                event.resource_type,
730                event.request.url
731            );
732            return self.fail_request_blocked(&event.request_id);
733        }
734
735        // // If both interceptions are enabled, do nothing.
736        // if !self.user_request_interception_enabled && self.protocol_request_interception_enabled {
737        //     self.push_cdp_request(ContinueRequestParams::new(event.request_id.clone()))
738        // }
739
740        if let Some(network_id) = event.network_id.as_ref() {
741            if let Some(request_will_be_sent) =
742                self.requests_will_be_sent.remove(network_id.as_ref())
743            {
744                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
745            } else {
746                self.request_id_to_interception_id
747                    .insert(network_id.clone(), event.request_id.clone().into());
748            }
749        }
750
751        // From here on, we handle the full decision tree.
752        let javascript_resource = *resource_type == ResourceType::Script;
753        let document_resource = *resource_type == ResourceType::Document;
754        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
755
756        // Start with static / cheap skip checks.
757        let mut skip_networking =
758            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
759
760        // Also short-circuit if we've reloaded this document too many times.
761        if !skip_networking {
762            skip_networking = self.document_reload_tracker >= 3;
763        }
764
765        // Handle document redirect / masking and track xml documents.
766        let (current_url_cow, had_replacer) =
767            self.handle_document_replacement_and_tracking(event, document_resource);
768
769        let current_url: &str = current_url_cow.as_ref();
770
771        // Main initial check (visuals, stylesheets, simple JS blocking).
772        if !skip_networking {
773            // Allow XSL for sitemap XML.
774            if self.xml_document && current_url.ends_with(".xsl") {
775                skip_networking = false;
776            } else {
777                skip_networking = self.should_skip_for_visuals_and_basic_js(
778                    resource_type,
779                    javascript_resource,
780                    current_url,
781                );
782            }
783        }
784
785        // Ad blocking (only active when feature = "adblock").
786        skip_networking = self.detect_ad_if_enabled(event, skip_networking);
787
788        // Ignore embedded scripts when only_html or ignore_visuals is set.
789        if !skip_networking
790            && (self.only_html || self.ignore_visuals)
791            && (javascript_resource || document_resource)
792        {
793            skip_networking = ignore_script_embedded(current_url);
794        }
795
796        // Analytics check for JS.
797        if skip_networking && javascript_resource {
798            let rel = self.rel_for_ignore_script(current_url);
799            skip_networking =
800                self.ignore_script(rel.as_ref(), self.block_analytics, self.intercept_manager);
801        }
802
803        // XHR / data resources.
804        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
805
806        // Custom interception layer.
807        if !skip_networking && (javascript_resource || network_resource || document_resource) {
808            skip_networking = self.intercept_manager.intercept_detection(
809                current_url,
810                self.ignore_visuals,
811                network_resource,
812            );
813        }
814
815        // Custom website block list.
816        if !skip_networking && (javascript_resource || network_resource) {
817            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
818        }
819
820        // whitelist 3rd party
821        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
822        {
823            skip_networking = false;
824        }
825
826        // check if the url is in the whitelist.
827        if skip_networking && self.is_whitelisted(current_url) {
828            skip_networking = false;
829        }
830
831        if skip_networking {
832            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
833            self.fulfill_request_empty_200(&event.request_id);
834        } else {
835            #[cfg(feature = "_cache")]
836            {
837                if let (Some(policy), Some(cache_site_key)) =
838                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
839                {
840                    let current_url = format!("{}:{}", event.request.method, &current_url);
841
842                    if let Some((res, cache_policy)) =
843                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
844                    {
845                        if policy.allows_cached(&cache_policy) {
846                            tracing::debug!(
847                                "Remote Cached: {:?} - {}",
848                                resource_type,
849                                &current_url
850                            );
851                            return self.fulfill_request_from_cache(
852                                &event.request_id,
853                                &res.body,
854                                &res.headers,
855                                res.status as i64,
856                            );
857                        }
858                    }
859                }
860            }
861
862            // check our frame cache for the run.
863            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
864            self.continue_request_with_url(
865                &event.request_id,
866                if had_replacer {
867                    Some(current_url)
868                } else {
869                    None
870                },
871                !had_replacer,
872            );
873        }
874    }
875
876    /// Does the network manager have a target domain?
877    pub fn has_target_domain(&self) -> bool {
878        !self.document_target_url.is_empty()
879    }
880
881    /// Set the target page url for tracking.
882    pub fn set_page_url(&mut self, page_target_url: String) {
883        let host_base = host_and_rest(&page_target_url)
884            .map(|(h, _)| base_domain_from_host(h))
885            .unwrap_or("");
886
887        self.document_target_domain = host_base.to_string();
888        self.document_target_url = page_target_url;
889    }
890
891    /// Clear the initial target domain on every navigation.
892    pub fn clear_target_domain(&mut self) {
893        self.document_reload_tracker = 0;
894        self.document_target_url = Default::default();
895        self.document_target_domain = Default::default();
896    }
897    /// Handles:
898    /// - document reload tracking (`document_reload_tracker`)
899    /// - redirect masking / replacement
900    /// - xml document detection (`xml_document`)
901    /// - `document_target_url` updates
902    ///
903    /// Returns (current_url, had_replacer).
904    #[inline]
905    fn handle_document_replacement_and_tracking<'a>(
906        &mut self,
907        event: &'a EventRequestPaused,
908        document_resource: bool,
909    ) -> (Cow<'a, str>, bool) {
910        let mut replacer: Option<String> = None;
911        let current_url = event.request.url.as_str();
912
913        if document_resource {
914            if self.document_target_url == current_url {
915                self.document_reload_tracker += 1;
916            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
917            {
918                let (http_document_replacement, mut https_document_replacement) =
919                    if self.document_target_url.starts_with("http://") {
920                        (
921                            self.document_target_url.replacen("http://", "http//", 1),
922                            self.document_target_url.replacen("http://", "https://", 1),
923                        )
924                    } else {
925                        (
926                            self.document_target_url.replacen("https://", "https//", 1),
927                            self.document_target_url.replacen("https://", "http://", 1),
928                        )
929                    };
930
931                // Track trailing slash to restore later.
932                let trailing = https_document_replacement.ends_with('/');
933                if trailing {
934                    https_document_replacement.pop();
935                }
936                if https_document_replacement.ends_with('/') {
937                    https_document_replacement.pop();
938                }
939
940                let redirect_mask = format!(
941                    "{}{}",
942                    https_document_replacement, http_document_replacement
943                );
944
945                if current_url == redirect_mask {
946                    replacer = Some(if trailing {
947                        format!("{}/", https_document_replacement)
948                    } else {
949                        https_document_replacement
950                    });
951                }
952            }
953
954            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
955                self.xml_document = true;
956            }
957
958            // Track last seen document URL.
959            self.document_target_url = event.request.url.clone();
960            self.document_target_domain = host_and_rest(&self.document_target_url)
961                .map(|(h, _)| base_domain_from_host(h).to_string())
962                .unwrap_or_default();
963        }
964
965        let current_url_cow = match replacer {
966            Some(r) => Cow::Owned(r),
967            None => Cow::Borrowed(event.request.url.as_str()),
968        };
969
970        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
971        (current_url_cow, had_replacer)
972    }
973
974    /// Shared "visuals + basic JS blocking" logic.
975    #[inline]
976    fn should_skip_for_visuals_and_basic_js(
977        &self,
978        resource_type: &ResourceType,
979        javascript_resource: bool,
980        current_url: &str,
981    ) -> bool {
982        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
983            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
984            || (self.block_javascript
985                && javascript_resource
986                && self.intercept_manager == NetworkInterceptManager::Unknown
987                && !ALLOWED_MATCHER.is_match(current_url))
988    }
989
990    /// Perform a page intercept for chrome
991    #[cfg(feature = "adblock")]
992    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
993        use adblock::{
994            lists::{FilterSet, ParseOptions, RuleTypes},
995            Engine,
996        };
997
998        lazy_static::lazy_static! {
999            static ref AD_ENGINE: Engine = {
1000                let mut filter_set = FilterSet::new(false);
1001                let mut rules = ParseOptions::default();
1002                rules.rule_types = RuleTypes::All;
1003
1004                filter_set.add_filters(
1005                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1006                    rules,
1007                );
1008
1009                Engine::from_filter_set(filter_set, true)
1010            };
1011        };
1012
1013        let blockable = ResourceType::Image == event.resource_type
1014            || event.resource_type == ResourceType::Media
1015            || event.resource_type == ResourceType::Stylesheet
1016            || event.resource_type == ResourceType::Document
1017            || event.resource_type == ResourceType::Fetch
1018            || event.resource_type == ResourceType::Xhr;
1019
1020        let u = &event.request.url;
1021
1022        let block_request = blockable
1023            // set it to example.com for 3rd party handling is_same_site
1024        && {
1025            let request = adblock::request::Request::preparsed(
1026                 &u,
1027                 "example.com",
1028                 "example.com",
1029                 &event.resource_type.as_ref().to_lowercase(),
1030                 !event.request.is_same_site.unwrap_or_default());
1031
1032            AD_ENGINE.check_network_request(&request).matched
1033        };
1034
1035        block_request
1036    }
1037
1038    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1039        let response = if self
1040            .attempted_authentications
1041            .contains(event.request_id.as_ref())
1042        {
1043            AuthChallengeResponseResponse::CancelAuth
1044        } else if self.credentials.is_some() {
1045            self.attempted_authentications
1046                .insert(event.request_id.clone().into());
1047            AuthChallengeResponseResponse::ProvideCredentials
1048        } else {
1049            AuthChallengeResponseResponse::Default
1050        };
1051
1052        let mut auth = AuthChallengeResponse::new(response);
1053        if let Some(creds) = self.credentials.clone() {
1054            auth.username = Some(creds.username);
1055            auth.password = Some(creds.password);
1056        }
1057        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1058    }
1059
1060    /// Set the page offline network emulation condition.
1061    pub fn set_offline_mode(&mut self, value: bool) {
1062        if self.offline == value {
1063            return;
1064        }
1065        self.offline = value;
1066        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1067            .offline(self.offline)
1068            .latency(0)
1069            .download_throughput(-1.)
1070            .upload_throughput(-1.)
1071            .build()
1072        {
1073            self.push_cdp_request(network);
1074        }
1075    }
1076
1077    /// Request interception doesn't happen for data URLs with Network Service.
1078    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1079        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1080            if let Some(interception_id) = self
1081                .request_id_to_interception_id
1082                .remove(event.request_id.as_ref())
1083            {
1084                self.on_request(event, Some(interception_id));
1085            } else {
1086                // TODO remove the clone for event
1087                self.requests_will_be_sent
1088                    .insert(event.request_id.clone(), event.clone());
1089            }
1090        } else {
1091            self.on_request(event, None);
1092        }
1093    }
1094
1095    /// The request was served from the cache.
1096    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1097        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1098            request.from_memory_cache = true;
1099        }
1100    }
1101
1102    /// On network response received.
1103    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1104        let mut request_failed = false;
1105
1106        // Track how many bytes we actually deducted from this target.
1107        let mut deducted: u64 = 0;
1108
1109        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1110            let before = *max_bytes;
1111
1112            // encoded_data_length -> saturating cast to u64
1113            let received_bytes: u64 = event.response.encoded_data_length as u64;
1114
1115            // Safe parse of Content-Length
1116            let content_length: Option<u64> = event
1117                .response
1118                .headers
1119                .inner()
1120                .get("content-length")
1121                .and_then(|v| v.as_str())
1122                .and_then(|s| s.trim().parse::<u64>().ok());
1123
1124            // Deduct what we actually received
1125            *max_bytes = max_bytes.saturating_sub(received_bytes);
1126
1127            // If the declared size can't fit, zero out now
1128            if let Some(cl) = content_length {
1129                if cl > *max_bytes {
1130                    *max_bytes = 0;
1131                }
1132            }
1133
1134            request_failed = *max_bytes == 0;
1135
1136            // Compute exact delta deducted on this event
1137            deducted = before.saturating_sub(*max_bytes);
1138        }
1139
1140        // Bubble up the deduction (even if request continues)
1141        if deducted > 0 {
1142            self.queued_events
1143                .push_back(NetworkEvent::BytesConsumed(deducted));
1144        }
1145
1146        // block all network request moving forward.
1147        if request_failed && self.max_bytes_allowed.is_some() {
1148            self.set_block_all(true);
1149        }
1150
1151        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1152            request.set_response(event.response.clone());
1153            self.queued_events.push_back(if request_failed {
1154                NetworkEvent::RequestFailed(request)
1155            } else {
1156                NetworkEvent::RequestFinished(request)
1157            });
1158        }
1159    }
1160
1161    /// On network loading finished.
1162    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1163        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1164            if let Some(interception_id) = request.interception_id.as_ref() {
1165                self.attempted_authentications
1166                    .remove(interception_id.as_ref());
1167            }
1168            self.queued_events
1169                .push_back(NetworkEvent::RequestFinished(request));
1170        }
1171    }
1172
1173    /// On network loading failed.
1174    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1175        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1176            request.failure_text = Some(event.error_text.clone());
1177            if let Some(interception_id) = request.interception_id.as_ref() {
1178                self.attempted_authentications
1179                    .remove(interception_id.as_ref());
1180            }
1181            self.queued_events
1182                .push_back(NetworkEvent::RequestFailed(request));
1183        }
1184    }
1185
1186    /// On request will be sent.
1187    fn on_request(
1188        &mut self,
1189        event: &EventRequestWillBeSent,
1190        interception_id: Option<InterceptionId>,
1191    ) {
1192        let mut redirect_chain = Vec::new();
1193        let mut redirect_location = None;
1194
1195        if let Some(redirect_resp) = &event.redirect_response {
1196            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1197                if is_redirect_status(redirect_resp.status) {
1198                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1199                        if redirect_resp.url != location {
1200                            let fixed_location = location.replace(&redirect_resp.url, "");
1201
1202                            if !fixed_location.is_empty() {
1203                                request.response.as_mut().map(|resp| {
1204                                    resp.headers.0["Location"] =
1205                                        serde_json::Value::String(fixed_location.clone());
1206                                });
1207                            }
1208
1209                            redirect_location = Some(fixed_location);
1210                        }
1211                    }
1212                }
1213
1214                self.handle_request_redirect(
1215                    &mut request,
1216                    if let Some(redirect_location) = redirect_location {
1217                        let mut redirect_resp = redirect_resp.clone();
1218
1219                        if !redirect_location.is_empty() {
1220                            redirect_resp.headers.0["Location"] =
1221                                serde_json::Value::String(redirect_location);
1222                        }
1223
1224                        redirect_resp
1225                    } else {
1226                        redirect_resp.clone()
1227                    },
1228                );
1229
1230                redirect_chain = std::mem::take(&mut request.redirect_chain);
1231                redirect_chain.push(request);
1232            }
1233        }
1234
1235        let request = HttpRequest::new(
1236            event.request_id.clone(),
1237            event.frame_id.clone(),
1238            interception_id,
1239            self.user_request_interception_enabled,
1240            redirect_chain,
1241        );
1242
1243        self.requests.insert(event.request_id.clone(), request);
1244        self.queued_events
1245            .push_back(NetworkEvent::Request(event.request_id.clone()));
1246    }
1247
1248    /// Handle request redirect.
1249    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1250        request.set_response(response);
1251        if let Some(interception_id) = request.interception_id.as_ref() {
1252            self.attempted_authentications
1253                .remove(interception_id.as_ref());
1254        }
1255    }
1256}
1257
1258#[derive(Debug)]
1259pub enum NetworkEvent {
1260    /// Send a CDP request.
1261    SendCdpRequest((MethodId, serde_json::Value)),
1262    /// Request.
1263    Request(RequestId),
1264    /// Response
1265    Response(RequestId),
1266    /// Request failed.
1267    RequestFailed(HttpRequest),
1268    /// Request finished.
1269    RequestFinished(HttpRequest),
1270    /// Bytes consumed.
1271    BytesConsumed(u64),
1272}
1273
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;

    /// The third-party allow-list must let the Cloudflare challenge script
    /// through, keep the Cloudflare Insights beacon out, and continue to
    /// match a few long-standing entries.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Should be allowed (matches "/cdn-cgi/challenge-platform/").
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        let challenge_allowed = ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge);
        assert!(
            challenge_allowed,
            "expected Cloudflare challenge script to be allowed"
        );

        // Should NOT be allowed (not in allow-list).
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        let insights_allowed = ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights);
        assert!(
            !insights_allowed,
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A couple sanity checks for existing allow patterns.
        let known_allowed = [
            "https://js.stripe.com/v3/",
            "https://www.google.com/recaptcha/api.js?render=explicit",
            "https://code.jquery.com/jquery-3.7.1.min.js",
        ];
        for url in known_allowed.iter().copied() {
            assert!(ALLOWED_MATCHER_3RD_PARTY.is_match(url));
        }
    }
}