chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{
11    base_domain_from_any, base_domain_from_host, first_label, host_and_rest,
12    host_contains_label_icase, host_is_subdomain_of,
13};
14use aho_corasick::AhoCorasick;
15use case_insensitive_string::CaseInsensitiveString;
16use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
17use chromiumoxide_cdp::cdp::browser_protocol::network::{
18    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
19    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
20    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
21    SetExtraHttpHeadersParams,
22};
23use chromiumoxide_cdp::cdp::browser_protocol::{
24    fetch::{
25        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
26        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
27    },
28    network::SetBypassServiceWorkerParams,
29};
30use chromiumoxide_cdp::cdp::browser_protocol::{
31    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
32};
33use chromiumoxide_types::{Command, Method, MethodId};
34use hashbrown::{HashMap, HashSet};
35use lazy_static::lazy_static;
36use reqwest::header::PROXY_AUTHORIZATION;
37use spider_network_blocker::intercept_manager::NetworkInterceptManager;
38pub use spider_network_blocker::scripts::{
39    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
40};
41use std::borrow::Cow;
42use std::collections::VecDeque;
43use std::time::Duration;
44
45lazy_static! {
46    /// General patterns for popular libraries and resources
47    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
48        "jquery",           // Covers jquery.min.js, jquery.js, etc.
49        "angular",
50        "react",            // Covers all React-related patterns
51        "vue",              // Covers all Vue-related patterns
52        "bootstrap",
53        "d3",
54        "lodash",
55        "ajax",
56        "application",
57        "app",              // Covers general app scripts like app.js
58        "main",
59        "index",
60        "bundle",
61        "vendor",
62        "runtime",
63        "polyfill",
64        "scripts",
65        "es2015.",
66        "es2020.",
67        "webpack",
68        "/cdn-cgi/challenge-platform/",
69        "/wp-content/js/",  // Covers Wordpress content
70        // Verified 3rd parties for request
71        "https://m.stripe.network/",
72        "https://challenges.cloudflare.com/",
73        "https://www.google.com/recaptcha/enterprise.js",
74        "https://www.google.com/recaptcha/api.js",
75        "https://google.com/recaptcha/api.js",
76        "https://captcha.px-cloud.net/",
77        "https://cdn.auth0.com/js/lock/",
78        "https://cdn.auth0.com/client",
79        "https://js.stripe.com/",
80        "https://cdn.prod.website-files.com/", // webflow cdn scripts
81        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
82        "https://code.jquery.com/jquery-"
83    ];
84
85    /// Determine if a script should be rendered in the browser by name.
86    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
87
88    /// General patterns for popular libraries and resources
89    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
90        // Verified 3rd parties for request
91        "https://m.stripe.network/",
92        "https://challenges.cloudflare.com/",
93        "https://www.google.com/recaptcha/api.js",
94        "https://google.com/recaptcha/api.js",
95        "https://www.google.com/recaptcha/enterprise.js",
96        "https://js.stripe.com/",
97        "https://cdn.prod.website-files.com/", // webflow cdn scripts
98        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
99        "https://code.jquery.com/jquery-",
100        "https://ct.captcha-delivery.com/",
101        "https://geo.captcha-delivery.com/captcha/",
102        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", // parking landing page iframe
103        "https://ct.captcha-delivery.com/",
104        "https://cdn.auth0.com/client",
105        "https://captcha.px-cloud.net/",
106        "/cdn-cgi/challenge-platform/"
107    ];
108
109    /// Determine if a script should be rendered in the browser by name.
110    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
111
112    /// path of a js framework
113    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
114        phf::phf_set! {
115            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
116            "_astro/", "_app/immutable"
117        }
118    };
119
120    /// Ignore the content types.
121    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
122        "application/pdf",
123        "application/zip",
124        "application/x-rar-compressed",
125        "application/x-tar",
126        "image/png",
127        "image/jpeg",
128        "image/gif",
129        "image/bmp",
130        "image/webp",
131        "image/svg+xml",
132        "video/mp4",
133        "video/x-msvideo",
134        "video/x-matroska",
135        "video/webm",
136        "audio/mpeg",
137        "audio/ogg",
138        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
139        "application/vnd.ms-excel",
140        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
141        "application/vnd.ms-powerpoint",
142        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
143        "application/x-7z-compressed",
144        "application/x-rpm",
145        "application/x-shockwave-flash",
146        "application/rtf",
147    };
148
149    /// Ignore the resources for visual content types.
150    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
151        "Image",
152        "Media",
153        "Font"
154    };
155
156    /// Ignore the resources for visual content types.
157    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
158        "CspViolationReport",
159        "Manifest",
160        "Other",
161        "Prefetch",
162        "Ping",
163    };
164
165    /// Case insenstive css matching
166    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
167
168    /// The command chain.
169    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
170        let enable = EnableParams::default();
171
172        if let Ok(c) = serde_json::to_value(&enable) {
173            vec![(enable.identifier(), c)]
174        } else {
175            vec![]
176        }
177    };
178
179    /// The command chain with https ignore.
180    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
181        let enable = EnableParams::default();
182        let mut v = vec![];
183        if let Ok(c) = serde_json::to_value(&enable) {
184            v.push((enable.identifier(), c));
185        }
186        let ignore = SetIgnoreCertificateErrorsParams::new(true);
187        if let Ok(ignored) = serde_json::to_value(&ignore) {
188            v.push((ignore.identifier(), ignored));
189        }
190
191        v
192    };
193
194    /// Enable the fetch intercept command
195    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
196        fetch::EnableParams::builder()
197        .handle_auth_requests(true)
198        .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
199        .build()
200    };
201}
202
/// Whether `status` is one of the redirect codes 301, 302, 303, 307 or 308.
///
/// Other 3xx codes (for example 300 or 304) are not treated as redirects.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
207
208#[derive(Debug)]
209/// The base network manager.
210pub struct NetworkManager {
211    queued_events: VecDeque<NetworkEvent>,
212    ignore_httpserrors: bool,
213    requests: HashMap<RequestId, HttpRequest>,
214    // TODO put event in an Arc?
215    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
216    extra_headers: std::collections::HashMap<String, String>,
217    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
218    user_cache_disabled: bool,
219    attempted_authentications: HashSet<RequestId>,
220    credentials: Option<Credentials>,
221    pub(crate) user_request_interception_enabled: bool,
222    block_all: bool,
223    pub(crate) protocol_request_interception_enabled: bool,
224    /// The network is offline.
225    offline: bool,
226    /// The page request timeout.
227    pub request_timeout: Duration,
228    // made_request: bool,
229    /// Ignore visuals (no pings, prefetching, and etc).
230    pub ignore_visuals: bool,
231    /// Block CSS stylesheets.
232    pub block_stylesheets: bool,
233    /// Block javascript that is not critical to rendering.
234    pub block_javascript: bool,
235    /// Block analytics from rendering
236    pub block_analytics: bool,
237    /// Only html from loading.
238    pub only_html: bool,
239    /// Is xml document?
240    pub xml_document: bool,
241    /// The custom intercept handle logic to run on the website.
242    pub intercept_manager: NetworkInterceptManager,
243    /// Track the amount of times the document reloaded.
244    pub document_reload_tracker: u8,
245    /// The initial target url. We want to use a new page on every navigation to prevent re-using the old domain.
246    pub document_target_url: String,
247    /// The initial target domain. We want to use a new page on every navigation to prevent re-using the old domain.
248    pub document_target_domain: String,
249    /// The max bytes to receive.
250    pub max_bytes_allowed: Option<u64>,
251    #[cfg(feature = "_cache")]
252    /// The cache site_key to use.
253    pub cache_site_key: Option<String>,
254    /// The cache policy to use.
255    #[cfg(feature = "_cache")]
256    pub cache_policy: Option<BasicCachePolicy>,
257    /// Optional per-run/per-site whitelist of URL substrings (scripts/resources).
258    whitelist_patterns: Vec<String>,
259    /// Compiled matcher for whitelist_patterns (rebuilt when patterns change).
260    whitelist_matcher: Option<AhoCorasick>,
261}
262
263impl NetworkManager {
264    /// A new network manager.
265    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
266        Self {
267            queued_events: Default::default(),
268            ignore_httpserrors,
269            requests: Default::default(),
270            requests_will_be_sent: Default::default(),
271            extra_headers: Default::default(),
272            request_id_to_interception_id: Default::default(),
273            user_cache_disabled: false,
274            attempted_authentications: Default::default(),
275            credentials: None,
276            block_all: false,
277            user_request_interception_enabled: false,
278            protocol_request_interception_enabled: false,
279            offline: false,
280            request_timeout,
281            ignore_visuals: false,
282            block_javascript: false,
283            block_stylesheets: false,
284            block_analytics: true,
285            only_html: false,
286            xml_document: false,
287            intercept_manager: NetworkInterceptManager::Unknown,
288            document_reload_tracker: 0,
289            document_target_url: String::new(),
290            document_target_domain: String::new(),
291            whitelist_patterns: Vec::new(),
292            whitelist_matcher: None,
293            max_bytes_allowed: None,
294            #[cfg(feature = "_cache")]
295            cache_site_key: None,
296            #[cfg(feature = "_cache")]
297            cache_policy: None,
298        }
299    }
300
301    /// Replace the whitelist patterns (compiled once).
302    pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
303    where
304        I: IntoIterator<Item = S>,
305        S: Into<String>,
306    {
307        self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
308        self.rebuild_whitelist_matcher();
309    }
310
311    /// Add one pattern (cheap) and rebuild (call this sparingly).
312    pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
313        self.whitelist_patterns.push(pattern.into());
314        self.rebuild_whitelist_matcher();
315    }
316
317    /// Add many patterns and rebuild once.
318    pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
319    where
320        I: IntoIterator<Item = S>,
321        S: Into<String>,
322    {
323        self.whitelist_patterns
324            .extend(patterns.into_iter().map(Into::into));
325        self.rebuild_whitelist_matcher();
326    }
327
328    #[inline]
329    fn rebuild_whitelist_matcher(&mut self) {
330        if self.whitelist_patterns.is_empty() {
331            self.whitelist_matcher = None;
332            return;
333        }
334
335        let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
336
337        // If building fails (shouldn’t for simple patterns), just disable matcher.
338        self.whitelist_matcher = AhoCorasick::new(refs).ok();
339    }
340
341    #[inline]
342    fn is_whitelisted(&self, url: &str) -> bool {
343        self.whitelist_matcher
344            .as_ref()
345            .map(|m| m.is_match(url))
346            .unwrap_or(false)
347    }
348
349    /// Commands to init the chain with.
350    pub fn init_commands(&self) -> CommandChain {
351        let cmds = if self.ignore_httpserrors {
352            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
353        } else {
354            INIT_CHAIN.clone()
355        };
356        CommandChain::new(cmds, self.request_timeout)
357    }
358
359    /// Push the CDP request.
360    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
361        let method = cmd.identifier();
362        if let Ok(params) = serde_json::to_value(cmd) {
363            self.queued_events
364                .push_back(NetworkEvent::SendCdpRequest((method, params)));
365        }
366    }
367
368    /// The next event to handle.
369    pub fn poll(&mut self) -> Option<NetworkEvent> {
370        self.queued_events.pop_front()
371    }
372
373    /// Get the extra headers.
374    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
375        &self.extra_headers
376    }
377
378    /// Set extra HTTP headers.
379    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
380        self.extra_headers = headers;
381        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
382        self.extra_headers.remove("Proxy-Authorization");
383        if !self.extra_headers.is_empty() {
384            if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
385                self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
386            }
387        }
388    }
389
390    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
391        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
392    }
393
394    pub fn set_block_all(&mut self, block_all: bool) {
395        self.block_all = block_all;
396    }
397
398    pub fn set_request_interception(&mut self, enabled: bool) {
399        self.user_request_interception_enabled = enabled;
400        self.update_protocol_request_interception();
401    }
402
403    pub fn set_cache_enabled(&mut self, enabled: bool) {
404        let run = self.user_cache_disabled != !enabled;
405        self.user_cache_disabled = !enabled;
406        if run {
407            self.update_protocol_cache_disabled();
408        }
409    }
410
411    /// Enable fetch interception.
412    pub fn enable_request_intercept(&mut self) {
413        self.protocol_request_interception_enabled = true;
414    }
415
416    /// Disable fetch interception.
417    pub fn disable_request_intercept(&mut self) {
418        self.protocol_request_interception_enabled = false;
419    }
420
421    /// Set the cache site key.
422    #[cfg(feature = "_cache")]
423    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
424        self.cache_site_key = cache_site_key;
425    }
426
427    /// Set the cache policy.
428    #[cfg(feature = "_cache")]
429    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
430        self.cache_policy = cache_policy;
431    }
432
433    pub fn update_protocol_cache_disabled(&mut self) {
434        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
435    }
436
437    pub fn authenticate(&mut self, credentials: Credentials) {
438        self.credentials = Some(credentials);
439        self.update_protocol_request_interception();
440        self.protocol_request_interception_enabled = true;
441    }
442
443    fn update_protocol_request_interception(&mut self) {
444        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
445
446        if enabled == self.protocol_request_interception_enabled {
447            return;
448        }
449
450        if enabled {
451            self.push_cdp_request(ENABLE_FETCH.clone())
452        } else {
453            self.push_cdp_request(DisableParams::default())
454        }
455    }
456
457    #[inline]
458    fn rel_for_ignore_script<'a>(&self, url: &'a str) -> Cow<'a, str> {
459        if url.starts_with('/') {
460            return Cow::Borrowed(url);
461        }
462
463        let base_raw = self.document_target_domain.as_str();
464
465        if base_raw.is_empty() {
466            return Cow::Borrowed(url);
467        }
468
469        let base = base_domain_from_any(base_raw).trim_end_matches('.');
470        if base.is_empty() {
471            return Cow::Borrowed(url);
472        }
473
474        let brand = first_label(base);
475
476        if let Some((host, rest)) = host_and_rest(url) {
477            if host_is_subdomain_of(host, base) || host_contains_label_icase(host, brand) {
478                return if rest.starts_with('/') {
479                    Cow::Borrowed(rest)
480                } else {
481                    Cow::Borrowed("/")
482                };
483            }
484        }
485
486        Cow::Borrowed(url)
487    }
488
489    /// Url matches analytics that we want to ignore or trackers.
490    #[inline]
491    pub(crate) fn ignore_script(
492        &self,
493        url: &str,
494        block_analytics: bool,
495        intercept_manager: NetworkInterceptManager,
496    ) -> bool {
497        // allow relative domains.
498        let mut ignore_script = !url.starts_with("/");
499
500        if !ignore_script
501            && block_analytics
502            && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
503        {
504            ignore_script = true;
505        }
506
507        if !ignore_script {
508            if let Some(index) = url.find("//") {
509                let pos = index + 2;
510
511                // Ensure there is something after `//`
512                if pos < url.len() {
513                    // Find the first slash after the `//`
514                    if let Some(slash_index) = url[pos..].find('/') {
515                        let base_path_index = pos + slash_index + 1;
516
517                        if url.len() > base_path_index {
518                            let new_url: &str = &url[base_path_index..];
519
520                            // ignore assets we do not need for frameworks
521                            if !ignore_script
522                                && intercept_manager == NetworkInterceptManager::Unknown
523                            {
524                                let hydration_file =
525                                    JS_FRAMEWORK_PATH.iter().any(|p| new_url.starts_with(p));
526
527                                // ignore astro paths
528                                if hydration_file && new_url.ends_with(".js") {
529                                    ignore_script = true;
530                                }
531                            }
532
533                            if !ignore_script
534                                && URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(new_url)
535                            {
536                                ignore_script = true;
537                            }
538
539                            if !ignore_script
540                                && self.ignore_visuals
541                                && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(new_url)
542                            {
543                                ignore_script = true;
544                            }
545                        }
546                    }
547                }
548            }
549        }
550
551        // fallback for file ending in analytics.js
552        if !ignore_script && block_analytics {
553            ignore_script = URL_IGNORE_TRIE_PATHS.contains_prefix(url);
554        }
555
556        ignore_script
557    }
558
559    /// Determine if the request should be skipped.
560    #[inline]
561    fn skip_xhr(
562        &self,
563        skip_networking: bool,
564        event: &EventRequestPaused,
565        network_event: bool,
566    ) -> bool {
567        // XHR check
568        if !skip_networking && network_event {
569            let request_url = event.request.url.as_str();
570
571            // check if part of ignore scripts.
572            let skip_analytics =
573                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
574
575            if skip_analytics {
576                true
577            } else if self.block_stylesheets || self.ignore_visuals {
578                let block_css = self.block_stylesheets;
579                let block_media = self.ignore_visuals;
580
581                let mut block_request = false;
582
583                if let Some(position) = request_url.rfind('.') {
584                    let hlen = request_url.len();
585                    let has_asset = hlen - position;
586
587                    if has_asset >= 3 {
588                        let next_position = position + 1;
589
590                        if block_media
591                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
592                                &request_url[next_position..].into(),
593                            )
594                        {
595                            block_request = true;
596                        } else if block_css {
597                            block_request =
598                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
599                                    .contains(&**CSS_EXTENSION)
600                        }
601                    }
602                }
603
604                if !block_request {
605                    block_request = ignore_script_xhr_media(request_url);
606                }
607
608                block_request
609            } else {
610                skip_networking
611            }
612        } else {
613            skip_networking
614        }
615    }
616
617    #[cfg(feature = "adblock")]
618    #[inline]
619    /// Detect if ad enabled.
620    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
621        if skip_networking {
622            true
623        } else {
624            self.detect_ad(event)
625        }
626    }
627
628    /// When adblock feature is disabled, this is a no-op.
629    #[cfg(not(feature = "adblock"))]
630    #[inline]
631    fn detect_ad_if_enabled(&mut self, _event: &EventRequestPaused, skip_networking: bool) -> bool {
632        skip_networking
633    }
634
635    #[inline]
636    /// Fail request
637    fn fail_request_blocked(
638        &mut self,
639        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
640    ) {
641        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
642            request_id.clone(),
643            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
644        );
645        self.push_cdp_request(params);
646    }
647
648    #[inline]
649    /// Fulfill request
650    fn fulfill_request_empty_200(
651        &mut self,
652        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
653    ) {
654        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
655            request_id.clone(),
656            200,
657        );
658        self.push_cdp_request(params);
659    }
660
661    #[cfg(feature = "_cache")]
662    #[inline]
663    /// Fulfill a paused Fetch request from cached bytes + header map.
664    ///
665    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
666    fn fulfill_request_from_cache(
667        &mut self,
668        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
669        body: &[u8],
670        headers: &std::collections::HashMap<String, String>,
671        status: i64,
672    ) {
673        use crate::cdp::browser_protocol::fetch::HeaderEntry;
674        use crate::handler::network::fetch::FulfillRequestParams;
675        use base64::Engine;
676
677        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
678
679        for (k, v) in headers.iter() {
680            resp_headers.push(HeaderEntry {
681                name: k.clone().into(),
682                value: v.clone().into(),
683            });
684        }
685
686        let mut params = FulfillRequestParams::new(request_id.clone(), status);
687
688        // TODO: have this already encoded prior.
689        params.body = Some(
690            base64::engine::general_purpose::STANDARD
691                .encode(body)
692                .into(),
693        );
694
695        params.response_headers = Some(resp_headers);
696
697        self.push_cdp_request(params);
698    }
699
700    #[inline]
701    /// Continue the request url.
702    fn continue_request_with_url(
703        &mut self,
704        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
705        url: Option<&str>,
706        intercept_response: bool,
707    ) {
708        let mut params = ContinueRequestParams::new(request_id.clone());
709        if let Some(url) = url {
710            params.url = Some(url.to_string());
711            params.intercept_response = Some(intercept_response);
712        }
713        self.push_cdp_request(params);
714    }
715
716    /// On fetch requesdt paused interception.
717    #[inline]
718    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
719        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
720            return;
721        }
722
723        let resource_type = &event.resource_type;
724
725        if self.block_all {
726            tracing::debug!(
727                "Blocked (block_all): {:?} - {}",
728                event.resource_type,
729                event.request.url
730            );
731            return self.fail_request_blocked(&event.request_id);
732        }
733
734        // // If both interceptions are enabled, do nothing.
735        // if !self.user_request_interception_enabled && self.protocol_request_interception_enabled {
736        //     self.push_cdp_request(ContinueRequestParams::new(event.request_id.clone()))
737        // }
738
739        if let Some(network_id) = event.network_id.as_ref() {
740            if let Some(request_will_be_sent) =
741                self.requests_will_be_sent.remove(network_id.as_ref())
742            {
743                self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
744            } else {
745                self.request_id_to_interception_id
746                    .insert(network_id.clone(), event.request_id.clone().into());
747            }
748        }
749
750        // From here on, we handle the full decision tree.
751        let javascript_resource = *resource_type == ResourceType::Script;
752        let document_resource = *resource_type == ResourceType::Document;
753        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
754
755        // Start with static / cheap skip checks.
756        let mut skip_networking =
757            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
758
759        // Also short-circuit if we've reloaded this document too many times.
760        if !skip_networking {
761            skip_networking = self.document_reload_tracker >= 3;
762        }
763
764        // Handle document redirect / masking and track xml documents.
765        let (current_url_cow, had_replacer) =
766            self.handle_document_replacement_and_tracking(event, document_resource);
767
768        let current_url: &str = current_url_cow.as_ref();
769
770        // Main initial check (visuals, stylesheets, simple JS blocking).
771        if !skip_networking {
772            // Allow XSL for sitemap XML.
773            if self.xml_document && current_url.ends_with(".xsl") {
774                skip_networking = false;
775            } else {
776                skip_networking = self.should_skip_for_visuals_and_basic_js(
777                    resource_type,
778                    javascript_resource,
779                    current_url,
780                );
781            }
782        }
783
784        // Ad blocking (only active when feature = "adblock").
785        skip_networking = self.detect_ad_if_enabled(event, skip_networking);
786
787        // Ignore embedded scripts when only_html or ignore_visuals is set.
788        if !skip_networking
789            && (self.only_html || self.ignore_visuals)
790            && (javascript_resource || document_resource)
791        {
792            skip_networking = ignore_script_embedded(current_url);
793        }
794
795        // Analytics check for JS.
796        if skip_networking && javascript_resource {
797            let rel = self.rel_for_ignore_script(current_url);
798            skip_networking =
799                self.ignore_script(rel.as_ref(), self.block_analytics, self.intercept_manager);
800        }
801
802        // XHR / data resources.
803        skip_networking = self.skip_xhr(skip_networking, event, network_resource);
804
805        // Custom interception layer.
806        if !skip_networking && (javascript_resource || network_resource || document_resource) {
807            skip_networking = self.intercept_manager.intercept_detection(
808                current_url,
809                self.ignore_visuals,
810                network_resource,
811            );
812        }
813
814        // Custom website block list.
815        if !skip_networking && (javascript_resource || network_resource) {
816            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
817        }
818
819        // whitelist 3rd party
820        if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
821        {
822            skip_networking = false;
823        }
824
825        // check if the url is in the whitelist.
826        if skip_networking && self.is_whitelisted(current_url) {
827            skip_networking = false;
828        }
829
830        if skip_networking {
831            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
832            self.fulfill_request_empty_200(&event.request_id);
833        } else {
834            #[cfg(feature = "_cache")]
835            {
836                if let (Some(policy), Some(cache_site_key)) =
837                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
838                {
839                    let current_url = format!("{}:{}", event.request.method, &current_url);
840
841                    if let Some((res, cache_policy)) =
842                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
843                    {
844                        if policy.allows_cached(&cache_policy) {
845                            tracing::debug!(
846                                "Remote Cached: {:?} - {}",
847                                resource_type,
848                                &current_url
849                            );
850                            return self.fulfill_request_from_cache(
851                                &event.request_id,
852                                &res.body,
853                                &res.headers,
854                                res.status as i64,
855                            );
856                        }
857                    }
858                }
859            }
860
861            // check our frame cache for the run.
862            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
863            self.continue_request_with_url(
864                &event.request_id,
865                if had_replacer {
866                    Some(current_url)
867                } else {
868                    None
869                },
870                !had_replacer,
871            );
872        }
873    }
874
875    /// Does the network manager have a target domain?
876    pub fn has_target_domain(&self) -> bool {
877        !self.document_target_url.is_empty()
878    }
879
880    /// Set the target page url for tracking.
881    pub fn set_page_url(&mut self, page_target_url: String) {
882        let host_base = host_and_rest(&page_target_url)
883            .map(|(h, _)| base_domain_from_host(h))
884            .unwrap_or("");
885
886        self.document_target_domain = host_base.to_string();
887        self.document_target_url = page_target_url;
888    }
889
890    /// Clear the initial target domain on every navigation.
891    pub fn clear_target_domain(&mut self) {
892        self.document_reload_tracker = 0;
893        self.document_target_url = Default::default();
894        self.document_target_domain = Default::default();
895    }
896    /// Handles:
897    /// - document reload tracking (`document_reload_tracker`)
898    /// - redirect masking / replacement
899    /// - xml document detection (`xml_document`)
900    /// - `document_target_url` updates
901    ///
902    /// Returns (current_url, had_replacer).
903    #[inline]
904    fn handle_document_replacement_and_tracking<'a>(
905        &mut self,
906        event: &'a EventRequestPaused,
907        document_resource: bool,
908    ) -> (Cow<'a, str>, bool) {
909        let mut replacer: Option<String> = None;
910        let current_url = event.request.url.as_str();
911
912        if document_resource {
913            if self.document_target_url == current_url {
914                self.document_reload_tracker += 1;
915            } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
916            {
917                let (http_document_replacement, mut https_document_replacement) =
918                    if self.document_target_url.starts_with("http://") {
919                        (
920                            self.document_target_url.replacen("http://", "http//", 1),
921                            self.document_target_url.replacen("http://", "https://", 1),
922                        )
923                    } else {
924                        (
925                            self.document_target_url.replacen("https://", "https//", 1),
926                            self.document_target_url.replacen("https://", "http://", 1),
927                        )
928                    };
929
930                // Track trailing slash to restore later.
931                let trailing = https_document_replacement.ends_with('/');
932                if trailing {
933                    https_document_replacement.pop();
934                }
935                if https_document_replacement.ends_with('/') {
936                    https_document_replacement.pop();
937                }
938
939                let redirect_mask = format!(
940                    "{}{}",
941                    https_document_replacement, http_document_replacement
942                );
943
944                if current_url == redirect_mask {
945                    replacer = Some(if trailing {
946                        format!("{}/", https_document_replacement)
947                    } else {
948                        https_document_replacement
949                    });
950                }
951            }
952
953            if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
954                self.xml_document = true;
955            }
956
957            // Track last seen document URL.
958            self.document_target_url = event.request.url.clone();
959            self.document_target_domain = host_and_rest(&self.document_target_url)
960                .map(|(h, _)| base_domain_from_host(h).to_string())
961                .unwrap_or_default();
962        }
963
964        let current_url_cow = match replacer {
965            Some(r) => Cow::Owned(r),
966            None => Cow::Borrowed(event.request.url.as_str()),
967        };
968
969        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
970        (current_url_cow, had_replacer)
971    }
972
973    /// Shared "visuals + basic JS blocking" logic.
974    #[inline]
975    fn should_skip_for_visuals_and_basic_js(
976        &self,
977        resource_type: &ResourceType,
978        javascript_resource: bool,
979        current_url: &str,
980    ) -> bool {
981        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
982            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
983            || (self.block_javascript
984                && javascript_resource
985                && self.intercept_manager == NetworkInterceptManager::Unknown
986                && !ALLOWED_MATCHER.is_match(current_url))
987    }
988
989    /// Perform a page intercept for chrome
990    #[cfg(feature = "adblock")]
991    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
992        use adblock::{
993            lists::{FilterSet, ParseOptions, RuleTypes},
994            Engine,
995        };
996
997        lazy_static::lazy_static! {
998            static ref AD_ENGINE: Engine = {
999                let mut filter_set = FilterSet::new(false);
1000                let mut rules = ParseOptions::default();
1001                rules.rule_types = RuleTypes::All;
1002
1003                filter_set.add_filters(
1004                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1005                    rules,
1006                );
1007
1008                Engine::from_filter_set(filter_set, true)
1009            };
1010        };
1011
1012        let blockable = ResourceType::Image == event.resource_type
1013            || event.resource_type == ResourceType::Media
1014            || event.resource_type == ResourceType::Stylesheet
1015            || event.resource_type == ResourceType::Document
1016            || event.resource_type == ResourceType::Fetch
1017            || event.resource_type == ResourceType::Xhr;
1018
1019        let u = &event.request.url;
1020
1021        let block_request = blockable
1022            // set it to example.com for 3rd party handling is_same_site
1023        && {
1024            let request = adblock::request::Request::preparsed(
1025                 &u,
1026                 "example.com",
1027                 "example.com",
1028                 &event.resource_type.as_ref().to_lowercase(),
1029                 !event.request.is_same_site.unwrap_or_default());
1030
1031            AD_ENGINE.check_network_request(&request).matched
1032        };
1033
1034        block_request
1035    }
1036
1037    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1038        let response = if self
1039            .attempted_authentications
1040            .contains(event.request_id.as_ref())
1041        {
1042            AuthChallengeResponseResponse::CancelAuth
1043        } else if self.credentials.is_some() {
1044            self.attempted_authentications
1045                .insert(event.request_id.clone().into());
1046            AuthChallengeResponseResponse::ProvideCredentials
1047        } else {
1048            AuthChallengeResponseResponse::Default
1049        };
1050
1051        let mut auth = AuthChallengeResponse::new(response);
1052        if let Some(creds) = self.credentials.clone() {
1053            auth.username = Some(creds.username);
1054            auth.password = Some(creds.password);
1055        }
1056        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1057    }
1058
1059    /// Set the page offline network emulation condition.
1060    pub fn set_offline_mode(&mut self, value: bool) {
1061        if self.offline == value {
1062            return;
1063        }
1064        self.offline = value;
1065        if let Ok(network) = EmulateNetworkConditionsParams::builder()
1066            .offline(self.offline)
1067            .latency(0)
1068            .download_throughput(-1.)
1069            .upload_throughput(-1.)
1070            .build()
1071        {
1072            self.push_cdp_request(network);
1073        }
1074    }
1075
1076    /// Request interception doesn't happen for data URLs with Network Service.
1077    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1078        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1079            if let Some(interception_id) = self
1080                .request_id_to_interception_id
1081                .remove(event.request_id.as_ref())
1082            {
1083                self.on_request(event, Some(interception_id));
1084            } else {
1085                // TODO remove the clone for event
1086                self.requests_will_be_sent
1087                    .insert(event.request_id.clone(), event.clone());
1088            }
1089        } else {
1090            self.on_request(event, None);
1091        }
1092    }
1093
1094    /// The request was served from the cache.
1095    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1096        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1097            request.from_memory_cache = true;
1098        }
1099    }
1100
1101    /// On network response received.
1102    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1103        let mut request_failed = false;
1104
1105        // Track how many bytes we actually deducted from this target.
1106        let mut deducted: u64 = 0;
1107
1108        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1109            let before = *max_bytes;
1110
1111            // encoded_data_length -> saturating cast to u64
1112            let received_bytes: u64 = event.response.encoded_data_length as u64;
1113
1114            // Safe parse of Content-Length
1115            let content_length: Option<u64> = event
1116                .response
1117                .headers
1118                .inner()
1119                .get("content-length")
1120                .and_then(|v| v.as_str())
1121                .and_then(|s| s.trim().parse::<u64>().ok());
1122
1123            // Deduct what we actually received
1124            *max_bytes = max_bytes.saturating_sub(received_bytes);
1125
1126            // If the declared size can't fit, zero out now
1127            if let Some(cl) = content_length {
1128                if cl > *max_bytes {
1129                    *max_bytes = 0;
1130                }
1131            }
1132
1133            request_failed = *max_bytes == 0;
1134
1135            // Compute exact delta deducted on this event
1136            deducted = before.saturating_sub(*max_bytes);
1137        }
1138
1139        // Bubble up the deduction (even if request continues)
1140        if deducted > 0 {
1141            self.queued_events
1142                .push_back(NetworkEvent::BytesConsumed(deducted));
1143        }
1144
1145        // block all network request moving forward.
1146        if request_failed && self.max_bytes_allowed.is_some() {
1147            self.set_block_all(true);
1148        }
1149
1150        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1151            request.set_response(event.response.clone());
1152            self.queued_events.push_back(if request_failed {
1153                NetworkEvent::RequestFailed(request)
1154            } else {
1155                NetworkEvent::RequestFinished(request)
1156            });
1157        }
1158    }
1159
1160    /// On network loading finished.
1161    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1162        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1163            if let Some(interception_id) = request.interception_id.as_ref() {
1164                self.attempted_authentications
1165                    .remove(interception_id.as_ref());
1166            }
1167            self.queued_events
1168                .push_back(NetworkEvent::RequestFinished(request));
1169        }
1170    }
1171
1172    /// On network loading failed.
1173    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1174        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1175            request.failure_text = Some(event.error_text.clone());
1176            if let Some(interception_id) = request.interception_id.as_ref() {
1177                self.attempted_authentications
1178                    .remove(interception_id.as_ref());
1179            }
1180            self.queued_events
1181                .push_back(NetworkEvent::RequestFailed(request));
1182        }
1183    }
1184
1185    /// On request will be sent.
1186    fn on_request(
1187        &mut self,
1188        event: &EventRequestWillBeSent,
1189        interception_id: Option<InterceptionId>,
1190    ) {
1191        let mut redirect_chain = Vec::new();
1192        let mut redirect_location = None;
1193
1194        if let Some(redirect_resp) = &event.redirect_response {
1195            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1196                if is_redirect_status(redirect_resp.status) {
1197                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1198                        if redirect_resp.url != location {
1199                            let fixed_location = location.replace(&redirect_resp.url, "");
1200
1201                            if !fixed_location.is_empty() {
1202                                request.response.as_mut().map(|resp| {
1203                                    resp.headers.0["Location"] =
1204                                        serde_json::Value::String(fixed_location.clone());
1205                                });
1206                            }
1207
1208                            redirect_location = Some(fixed_location);
1209                        }
1210                    }
1211                }
1212
1213                self.handle_request_redirect(
1214                    &mut request,
1215                    if let Some(redirect_location) = redirect_location {
1216                        let mut redirect_resp = redirect_resp.clone();
1217
1218                        if !redirect_location.is_empty() {
1219                            redirect_resp.headers.0["Location"] =
1220                                serde_json::Value::String(redirect_location);
1221                        }
1222
1223                        redirect_resp
1224                    } else {
1225                        redirect_resp.clone()
1226                    },
1227                );
1228
1229                redirect_chain = std::mem::take(&mut request.redirect_chain);
1230                redirect_chain.push(request);
1231            }
1232        }
1233
1234        let request = HttpRequest::new(
1235            event.request_id.clone(),
1236            event.frame_id.clone(),
1237            interception_id,
1238            self.user_request_interception_enabled,
1239            redirect_chain,
1240        );
1241
1242        self.requests.insert(event.request_id.clone(), request);
1243        self.queued_events
1244            .push_back(NetworkEvent::Request(event.request_id.clone()));
1245    }
1246
1247    /// Handle request redirect.
1248    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1249        request.set_response(response);
1250        if let Some(interception_id) = request.interception_id.as_ref() {
1251            self.attempted_authentications
1252                .remove(interception_id.as_ref());
1253        }
1254    }
1255}
1256
1257#[derive(Debug)]
1258pub enum NetworkEvent {
1259    /// Send a CDP request.
1260    SendCdpRequest((MethodId, serde_json::Value)),
1261    /// Request.
1262    Request(RequestId),
1263    /// Response
1264    Response(RequestId),
1265    /// Request failed.
1266    RequestFailed(HttpRequest),
1267    /// Request finished.
1268    RequestFinished(HttpRequest),
1269    /// Bytes consumed.
1270    BytesConsumed(u64),
1271}
1272
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;

    /// The third-party allow-list must let challenge and well-known library
    /// scripts through while leaving analytics beacons blocked.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Matches the "/cdn-cgi/challenge-platform/" allow pattern.
        let challenge_url = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        let challenge_allowed = ALLOWED_MATCHER_3RD_PARTY.is_match(challenge_url);
        assert!(
            challenge_allowed,
            "expected Cloudflare challenge script to be allowed"
        );

        // Not part of the allow-list, so it must not match.
        let insights_url = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        let insights_allowed = ALLOWED_MATCHER_3RD_PARTY.is_match(insights_url);
        assert!(
            !insights_allowed,
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // Sanity checks for existing allow patterns.
        let known_allowed = [
            "https://js.stripe.com/v3/",
            "https://www.google.com/recaptcha/api.js?render=explicit",
            "https://code.jquery.com/jquery-3.7.1.min.js",
        ];
        for &url in &known_allowed {
            assert!(ALLOWED_MATCHER_3RD_PARTY.is_match(url));
        }
    }
}