chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use aho_corasick::AhoCorasick;
11use case_insensitive_string::CaseInsensitiveString;
12use chromiumoxide_cdp::cdp::browser_protocol::network::{
13    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
14    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
15    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
16    SetExtraHttpHeadersParams,
17};
18use chromiumoxide_cdp::cdp::browser_protocol::{
19    fetch::{
20        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
21        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
22        RequestPattern,
23    },
24    network::SetBypassServiceWorkerParams,
25};
26use chromiumoxide_cdp::cdp::browser_protocol::{
27    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
28};
29use chromiumoxide_types::{Command, Method, MethodId};
30use hashbrown::{HashMap, HashSet};
31use lazy_static::lazy_static;
32use reqwest::header::PROXY_AUTHORIZATION;
33use spider_network_blocker::intercept_manager::NetworkInterceptManager;
34pub use spider_network_blocker::scripts::{
35    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
36};
37use std::borrow::Cow;
38use std::collections::VecDeque;
39use std::time::Duration;
40
lazy_static! {
    /// Substrings that mark a script URL as a popular library, framework
    /// bundle, or verified third party. They are compiled into
    /// `ALLOWED_MATCHER`; when JavaScript blocking is on, scripts whose URL
    /// matches none of these are skipped.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "/wp-content/js/",  // Covers Wordpress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];

    /// Multi-pattern matcher over `JS_FRAMEWORK_ALLOW`, used to decide by URL
    /// whether a script may load in the browser. Building the matcher panics
    /// on first use if the pattern set cannot be compiled.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// Path prefixes (relative to the host) of JS framework hydration assets.
    /// `.js` files under these prefixes are ignored by `ignore_script` when no
    /// custom intercept manager is configured.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
            "_astro/", "_app/immutable"
        }
    };

    /// Content types (MIME) to ignore: archives, office documents, images,
    /// audio and video.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names that are skipped when `ignore_visuals` is enabled.
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names for auxiliary networking (reports, manifests,
    /// prefetches, pings) that the fetch interception always skips.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Manifest",
        "Other",
        "Prefetch",
        "Ping",
    };

    /// Case insensitive `css` extension used when matching stylesheet URLs.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// The default init command chain: the serialized network
    /// `EnableParams` command. Empty if serialization fails.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// The init command chain that additionally tells the browser to ignore
    /// certificate errors. Commands that fail to serialize are left out.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// The command that enables fetch interception for every URL (`*`) and
    /// asks the browser to surface auth challenges.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").build())
        .build()
    };
}
168
/// Report whether `status` is one of the HTTP redirect codes recognized
/// here: 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];

    REDIRECT_CODES.contains(&status)
}
173
#[derive(Debug)]
/// The base network manager.
///
/// Queues CDP commands as `NetworkEvent`s (drained through `poll`) and decides
/// for every paused fetch request whether it is blocked, served from cache, or
/// allowed to continue.
pub struct NetworkManager {
    /// Pending events, consumed front-to-back by `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// When set, `init_commands` uses the chain that also ignores certificate errors.
    ignore_httpserrors: bool,
    /// Presumably the in-flight requests keyed by request id — confirm against the request handlers.
    requests: HashMap<RequestId, HttpRequest>,
    // TODO put event in an Arc?
    /// Request-will-be-sent events keyed by request id; an entry is removed and
    /// handed to `on_request` when the matching paused fetch request arrives.
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers last passed to `set_extra_headers` (proxy authorization removed).
    extra_headers: std::collections::HashMap<String, String>,
    /// Presumably maps request ids to their fetch interception ids — confirm against the request handlers.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// The cache state requested through `set_cache_enabled` (`true` = cache disabled).
    user_cache_disabled: bool,
    /// Presumably the requests for which credentials were already supplied — confirm against the auth handler.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials set through `authenticate`.
    credentials: Option<Credentials>,
    /// Whether the user asked for request interception via `set_request_interception`.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed as blocked by the client.
    block_all: bool,
    /// Protocol-level interception flag. When this and
    /// `user_request_interception_enabled` are both set,
    /// `on_fetch_request_paused` leaves paused requests untouched.
    pub(crate) protocol_request_interception_enabled: bool,
    /// The network is offline.
    offline: bool,
    /// The page request timeout.
    pub request_timeout: Duration,
    // made_request: bool,
    /// Skip visual resources (images, media, fonts) and style-related script paths.
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not critical to rendering.
    pub block_javascript: bool,
    /// Block analytics from rendering
    pub block_analytics: bool,
    /// Only html from loading.
    pub only_html: bool,
    /// Is xml document?
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Track the amount of times the document reloaded.
    pub document_reload_tracker: u8,
    /// The URL of the last document request seen (empty until the first one).
    /// We want to use a new page on every navigation to prevent re-using the old domain.
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
    #[cfg(feature = "_cache")]
    /// The cache site_key to use.
    pub cache_site_key: Option<String>,
    /// The cache policy to use.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
}
222
223impl NetworkManager {
224    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
225        Self {
226            queued_events: Default::default(),
227            ignore_httpserrors,
228            requests: Default::default(),
229            requests_will_be_sent: Default::default(),
230            extra_headers: Default::default(),
231            request_id_to_interception_id: Default::default(),
232            user_cache_disabled: false,
233            attempted_authentications: Default::default(),
234            credentials: None,
235            block_all: false,
236            user_request_interception_enabled: false,
237            protocol_request_interception_enabled: false,
238            offline: false,
239            request_timeout,
240            ignore_visuals: false,
241            block_javascript: false,
242            block_stylesheets: false,
243            block_analytics: true,
244            only_html: false,
245            xml_document: false,
246            intercept_manager: NetworkInterceptManager::Unknown,
247            document_reload_tracker: 0,
248            document_target_domain: String::new(),
249            max_bytes_allowed: None,
250            #[cfg(feature = "_cache")]
251            cache_site_key: None,
252            #[cfg(feature = "_cache")]
253            cache_policy: None,
254        }
255    }
256
257    pub fn init_commands(&self) -> CommandChain {
258        let cmds = if self.ignore_httpserrors {
259            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
260        } else {
261            INIT_CHAIN.clone()
262        };
263        CommandChain::new(cmds, self.request_timeout)
264    }
265
266    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
267        let method = cmd.identifier();
268        if let Ok(params) = serde_json::to_value(cmd) {
269            self.queued_events
270                .push_back(NetworkEvent::SendCdpRequest((method, params)));
271        }
272    }
273
    /// Take the next queued event to handle, oldest first.
    ///
    /// Returns `None` when the queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
278
    /// The extra HTTP headers currently stored on the manager, as last set
    /// through `set_extra_headers`.
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
282
283    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
284        self.extra_headers = headers;
285        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
286        self.extra_headers.remove("Proxy-Authorization");
287        if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
288            self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
289        }
290    }
291
    /// Queue a `SetBypassServiceWorkerParams` command carrying `bypass`.
    ///
    /// NOTE(review): despite the method name, the argument is forwarded as the
    /// *bypass* flag, so `true` presumably bypasses (disables) service
    /// workers — confirm callers pass the intended value.
    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
    }
295
    /// Enable or disable blocking of every request. While enabled,
    /// `on_fetch_request_paused` fails each paused request as blocked by the client.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
299
    /// Record whether the user wants request interception and sync the
    /// browser's fetch interception state accordingly.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        // Queues the fetch enable/disable command if the desired state changed.
        self.update_protocol_request_interception();
    }
304
305    pub fn set_cache_enabled(&mut self, enabled: bool) {
306        let run = self.user_cache_disabled != !enabled;
307        self.user_cache_disabled = !enabled;
308        if run {
309            self.update_protocol_cache_disabled();
310        }
311    }
312
    /// Mark protocol request interception as enabled.
    ///
    /// NOTE(review): the name says "disable" but the flag is set to `true`.
    /// `on_fetch_request_paused` returns without acting when both this flag
    /// and `user_request_interception_enabled` are set, so this presumably
    /// disables the built-in handling of paused requests — confirm the intent.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
316
    /// Set the cache site key.
    ///
    /// Together with the cache policy, the key is used by
    /// `on_fetch_request_paused` to look up session cache items; `None`
    /// turns that lookup off.
    #[cfg(feature = "_cache")]
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
322
    /// Set the cache policy.
    ///
    /// The policy decides whether a cached item found by
    /// `on_fetch_request_paused` may be served; `None` turns the cache
    /// lookup off.
    #[cfg(feature = "_cache")]
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
328
    /// Queue the command that applies the stored `user_cache_disabled`
    /// state to the browser.
    pub fn update_protocol_cache_disabled(&mut self) {
        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
    }
332
    /// Store `credentials` for authentication and make sure fetch
    /// interception is enabled.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        // Must run before the flag below is set: it compares the desired state
        // against the current flag to decide whether to queue the enable command.
        self.update_protocol_request_interception();
        self.protocol_request_interception_enabled = true;
    }
338
339    fn update_protocol_request_interception(&mut self) {
340        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
341
342        if enabled == self.protocol_request_interception_enabled {
343            return;
344        }
345
346        if enabled {
347            self.push_cdp_request(ENABLE_FETCH.clone())
348        } else {
349            self.push_cdp_request(DisableParams::default())
350        }
351    }
352
353    /// Url matches analytics that we want to ignore or trackers.
354    pub(crate) fn ignore_script(
355        &self,
356        url: &str,
357        block_analytics: bool,
358        intercept_manager: NetworkInterceptManager,
359    ) -> bool {
360        let mut ignore_script = block_analytics
361            && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url);
362
363        if !ignore_script {
364            if let Some(index) = url.find("//") {
365                let pos = index + 2;
366
367                // Ensure there is something after `//`
368                if pos < url.len() {
369                    // Find the first slash after the `//`
370                    if let Some(slash_index) = url[pos..].find('/') {
371                        let base_path_index = pos + slash_index + 1;
372
373                        if url.len() > base_path_index {
374                            let new_url: &str = &url[base_path_index..];
375
376                            // ignore assets we do not need for frameworks
377                            if !ignore_script
378                                && intercept_manager == NetworkInterceptManager::Unknown
379                            {
380                                let hydration_file =
381                                    JS_FRAMEWORK_PATH.iter().any(|p| new_url.starts_with(p));
382
383                                // ignore astro paths
384                                if hydration_file && new_url.ends_with(".js") {
385                                    ignore_script = true;
386                                }
387                            }
388
389                            if !ignore_script
390                                && URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(new_url)
391                            {
392                                ignore_script = true;
393                            }
394
395                            if !ignore_script
396                                && self.ignore_visuals
397                                && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(new_url)
398                            {
399                                ignore_script = true;
400                            }
401                        }
402                    }
403                }
404            }
405        }
406
407        // fallback for file ending in analytics.js
408        if !ignore_script && block_analytics {
409            ignore_script = URL_IGNORE_TRIE_PATHS.contains_prefix(url);
410        }
411
412        ignore_script
413    }
414
415    /// Determine if the request should be skipped.
416    fn skip_xhr(
417        &self,
418        skip_networking: bool,
419        event: &EventRequestPaused,
420        network_event: bool,
421    ) -> bool {
422        // XHR check
423        if !skip_networking && network_event {
424            let request_url = event.request.url.as_str();
425
426            // check if part of ignore scripts.
427            let skip_analytics =
428                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
429
430            if skip_analytics {
431                true
432            } else if self.block_stylesheets || self.ignore_visuals {
433                let block_css = self.block_stylesheets;
434                let block_media = self.ignore_visuals;
435
436                let mut block_request = false;
437
438                if let Some(position) = request_url.rfind('.') {
439                    let hlen = request_url.len();
440                    let has_asset = hlen - position;
441
442                    if has_asset >= 3 {
443                        let next_position = position + 1;
444
445                        if block_media
446                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
447                                &request_url[next_position..].into(),
448                            )
449                        {
450                            block_request = true;
451                        } else if block_css {
452                            block_request =
453                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
454                                    .contains(&**CSS_EXTENSION)
455                        }
456                    }
457                }
458
459                if !block_request {
460                    block_request = ignore_script_xhr_media(request_url);
461                }
462
463                block_request
464            } else {
465                skip_networking
466            }
467        } else {
468            skip_networking
469        }
470    }
471
472    #[cfg(feature = "adblock")]
473    #[inline]
474    /// Detect if ad enabled.
475    fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
476        if skip_networking {
477            true
478        } else {
479            self.detect_ad(event)
480        }
481    }
482
    /// Ad detection stand-in used when the `adblock` feature is disabled:
    /// the event is ignored and `skip_networking` is returned unchanged.
    #[cfg(not(feature = "adblock"))]
    #[inline]
    fn detect_ad_if_enabled(&mut self, _event: &EventRequestPaused, skip_networking: bool) -> bool {
        skip_networking
    }
489
490    #[inline]
491    /// Fail request
492    fn fail_request_blocked(
493        &mut self,
494        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
495    ) {
496        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
497            request_id.clone(),
498            chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
499        );
500        self.push_cdp_request(params);
501    }
502
503    #[inline]
504    /// Fulfill request
505    fn fulfill_request_empty_200(
506        &mut self,
507        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
508    ) {
509        let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
510            request_id.clone(),
511            200,
512        );
513        self.push_cdp_request(params);
514    }
515
516    #[cfg(feature = "_cache")]
517    #[inline]
518    /// Fulfill a paused Fetch request from cached bytes + header map.
519    ///
520    /// `headers` should be response headers (e.g. Content-Type, Cache-Control, etc).
521    fn fulfill_request_from_cache(
522        &mut self,
523        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
524        body: &[u8],
525        headers: &std::collections::HashMap<String, String>,
526        status: i64,
527    ) {
528        use crate::cdp::browser_protocol::fetch::HeaderEntry;
529        use crate::handler::network::fetch::FulfillRequestParams;
530        use base64::Engine;
531
532        let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
533
534        for (k, v) in headers.iter() {
535            resp_headers.push(HeaderEntry {
536                name: k.clone().into(),
537                value: v.clone().into(),
538            });
539        }
540
541        let mut params = FulfillRequestParams::new(request_id.clone(), status);
542
543        // TODO: have this already encoded prior.
544        params.body = Some(
545            base64::engine::general_purpose::STANDARD
546                .encode(body)
547                .into(),
548        );
549
550        params.response_headers = Some(resp_headers);
551
552        self.push_cdp_request(params);
553    }
554
555    #[inline]
556    /// Continue the request url.
557    fn continue_request_with_url(
558        &mut self,
559        request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
560        url: Option<&str>,
561        intercept_response: bool,
562    ) {
563        let mut params = ContinueRequestParams::new(request_id.clone());
564        if let Some(url) = url {
565            params.url = Some(url.to_string());
566            params.intercept_response = Some(intercept_response);
567        }
568        self.push_cdp_request(params);
569    }
570
    /// Handle a paused fetch request (fetch interception).
    ///
    /// Decides whether the request is left alone, failed, handed off to
    /// `on_request`, fulfilled with an empty 200 (blocked), fulfilled from the
    /// remote cache (feature `_cache`), or allowed to continue.
    #[inline]
    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
        // If both interceptions are enabled, do nothing.
        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
            return;
        }

        // Blocking everything: fail the request outright.
        if self.block_all {
            tracing::debug!(
                "Blocked (block_all): {:?} - {}",
                event.resource_type,
                event.request.url
            );
            return self.fail_request_blocked(&event.request_id);
        }

        // If this paused request corresponds to a "request_will_be_sent", hand it off and exit.
        if let Some(network_id) = event.network_id.as_ref() {
            if let Some(request_will_be_sent) =
                self.requests_will_be_sent.remove(network_id.as_ref())
            {
                return self
                    .on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
            }
        } else {
            // No network_id, just continue.
            return self.push_cdp_request(ContinueRequestParams::new(event.request_id.clone()));
        }

        // From here on, we handle the full decision tree.
        let resource_type = &event.resource_type;
        let javascript_resource = *resource_type == ResourceType::Script;
        let document_resource = *resource_type == ResourceType::Document;
        let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);

        // Start with static / cheap skip checks.
        let mut skip_networking =
            self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());

        // Also short-circuit if we've reloaded this document too many times.
        if !skip_networking {
            skip_networking = self.document_reload_tracker >= 3;
        }

        // Handle document redirect / masking and track xml documents.
        // This mutates the reload tracker and the tracked document URL.
        let (current_url_cow, had_replacer) =
            self.handle_document_replacement_and_tracking(event, document_resource);

        let current_url: &str = current_url_cow.as_ref();

        // Main initial check (visuals, stylesheets, simple JS blocking).
        if !skip_networking {
            // Allow XSL for sitemap XML.
            if self.xml_document && current_url.ends_with(".xsl") {
                skip_networking = false;
            } else {
                skip_networking = self.should_skip_for_visuals_and_basic_js(
                    resource_type,
                    javascript_resource,
                    current_url,
                );
            }
        }

        // Ad blocking (only active when feature = "adblock").
        skip_networking = self.detect_ad_if_enabled(event, skip_networking);

        // Ignore embedded scripts when only_html or ignore_visuals is set.
        if !skip_networking
            && (self.only_html || self.ignore_visuals)
            && (javascript_resource || document_resource)
        {
            skip_networking = ignore_script_embedded(current_url);
        }

        // Analytics check for JS.
        if !skip_networking && javascript_resource {
            skip_networking =
                self.ignore_script(current_url, self.block_analytics, self.intercept_manager);
        }

        // XHR / data resources.
        skip_networking = self.skip_xhr(skip_networking, event, network_resource);

        // Custom interception layer.
        if !skip_networking && (javascript_resource || network_resource || document_resource) {
            skip_networking = self.intercept_manager.intercept_detection(
                current_url,
                self.ignore_visuals,
                network_resource,
            );
        }

        // Custom website block list.
        if !skip_networking && (javascript_resource || network_resource) {
            skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
        }

        if skip_networking {
            // Blocked requests are answered with an empty 200 instead of being failed.
            tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
            self.fulfill_request_empty_200(&event.request_id);
        } else {
            #[cfg(feature = "_cache")]
            {
                // Serve from the remote session cache when both a policy and a site key are set.
                if let (Some(policy), Some(cache_site_key)) =
                    (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
                {
                    // Cache entries are keyed by "<method>:<url>".
                    let current_url = format!("{}:{}", event.request.method, &current_url);

                    if let Some((res, cache_policy)) =
                        crate::cache::remote::get_session_cache_item(cache_site_key, &current_url)
                    {
                        if policy.allows_cached(&cache_policy) {
                            tracing::debug!(
                                "Remote Cached: {:?} - {}",
                                resource_type,
                                &current_url
                            );
                            return self.fulfill_request_from_cache(
                                &event.request_id,
                                &res.body,
                                &res.headers,
                                res.status as i64,
                            );
                        }
                    }
                }
            }

            // Nothing blocked or cached: let the request continue, rewriting
            // the URL only when a document replacement was computed.
            tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
            self.continue_request_with_url(
                &event.request_id,
                if had_replacer {
                    Some(current_url)
                } else {
                    None
                },
                !had_replacer,
            );
        }
    }
714
    /// Does the network manager have a target domain?
    ///
    /// Returns `true` when `document_target_domain` is non-empty.
    pub fn has_target_domain(&self) -> bool {
        !self.document_target_domain.is_empty()
    }
719
    /// Set the target page url for tracking.
    ///
    /// The value is stored in `document_target_domain`, which document
    /// requests are later compared against.
    pub fn set_page_url(&mut self, page_target_url: String) {
        self.document_target_domain = page_target_url;
    }
724
725    /// Clear the initial target domain on every navigation.
726    pub fn clear_target_domain(&mut self) {
727        self.document_reload_tracker = 0;
728        self.document_target_domain = Default::default();
729    }
730
731    /// Handles:
732    /// - document reload tracking (`document_reload_tracker`)
733    /// - redirect masking / replacement
734    /// - xml document detection (`xml_document`)
735    /// - `document_target_domain` updates
736    ///
737    /// Returns (current_url, had_replacer).
738    #[inline]
739    fn handle_document_replacement_and_tracking<'a>(
740        &mut self,
741        event: &'a EventRequestPaused,
742        document_resource: bool,
743    ) -> (Cow<'a, str>, bool) {
744        let mut replacer: Option<String> = None;
745        let current_url = event.request.url.as_str();
746
747        if document_resource {
748            if self.document_target_domain == current_url {
749                // Prevent redirect loop (3 attempts are considered enough).
750                self.document_reload_tracker += 1;
751            } else if !self.document_target_domain.is_empty()
752                && event.redirected_request_id.is_some()
753            {
754                // Build http/https mask pair for redirect masking.
755                let (http_document_replacement, mut https_document_replacement) =
756                    if self.document_target_domain.starts_with("http://") {
757                        (
758                            self.document_target_domain.replace("http://", "http//"),
759                            self.document_target_domain.replace("http://", "https://"),
760                        )
761                    } else {
762                        (
763                            self.document_target_domain.replace("https://", "https//"),
764                            self.document_target_domain.replace("https://", "http://"),
765                        )
766                    };
767
768                // Track trailing slash to restore later.
769                let trailing = https_document_replacement.ends_with('/');
770                if trailing {
771                    https_document_replacement.pop();
772                }
773                if https_document_replacement.ends_with('/') {
774                    https_document_replacement.pop();
775                }
776
777                let redirect_mask = format!(
778                    "{}{}",
779                    https_document_replacement, http_document_replacement
780                );
781
782                if current_url == redirect_mask {
783                    replacer = Some(if trailing {
784                        format!("{}/", https_document_replacement)
785                    } else {
786                        https_document_replacement
787                    });
788                }
789            }
790
791            if self.document_target_domain.is_empty() && current_url.ends_with(".xml") {
792                self.xml_document = true;
793            }
794
795            // Track last seen document URL.
796            self.document_target_domain = event.request.url.clone();
797        }
798
799        let current_url_cow = match replacer {
800            Some(r) => Cow::Owned(r),
801            None => Cow::Borrowed(event.request.url.as_str()),
802        };
803
804        let had_replacer = matches!(current_url_cow, Cow::Owned(_));
805        (current_url_cow, had_replacer)
806    }
807
808    /// Shared "visuals + basic JS blocking" logic.
809    #[inline]
810    fn should_skip_for_visuals_and_basic_js(
811        &self,
812        resource_type: &ResourceType,
813        javascript_resource: bool,
814        current_url: &str,
815    ) -> bool {
816        (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
817            || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
818            || (self.block_javascript
819                && javascript_resource
820                && self.intercept_manager == NetworkInterceptManager::Unknown
821                && !ALLOWED_MATCHER.is_match(current_url))
822    }
823
/// Checks a paused request against the bundled adblock filter rules.
///
/// Only `Image`, `Media`, `Stylesheet`, `Document`, `Fetch` and `Xhr` resources
/// are run through the engine; any other resource type returns `false` without
/// a lookup. The engine is built lazily, once, from
/// `spider_network_blocker::adblock::ADBLOCK_PATTERNS`.
///
/// Returns `true` when the engine reports a match for the request.
#[cfg(feature = "adblock")]
pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
    use adblock::{
        lists::{FilterSet, ParseOptions, RuleTypes},
        Engine,
    };

    lazy_static::lazy_static! {
        static ref AD_ENGINE: Engine = {
            let mut options = ParseOptions::default();
            options.rule_types = RuleTypes::All;

            let mut filters = FilterSet::new(false);
            filters.add_filters(
                &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
                options,
            );

            Engine::from_filter_set(filters, true)
        };
    };

    let blockable = matches!(
        event.resource_type,
        ResourceType::Image
            | ResourceType::Media
            | ResourceType::Stylesheet
            | ResourceType::Document
            | ResourceType::Fetch
            | ResourceType::Xhr
    );
    if !blockable {
        return false;
    }

    let resource_kind = event.resource_type.as_ref().to_lowercase();
    // A missing `is_same_site` counts as "not same site".
    let not_same_site = !event.request.is_same_site.unwrap_or_default();

    // Both hostnames are fixed to example.com; the same-site decision is passed
    // in explicitly from `is_same_site` instead.
    let request = adblock::request::Request::preparsed(
        &event.request.url,
        "example.com",
        "example.com",
        &resource_kind,
        not_same_site,
    );

    AD_ENGINE.check_network_request(&request).matched
}
871
872    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
873        let response = if self
874            .attempted_authentications
875            .contains(event.request_id.as_ref())
876        {
877            AuthChallengeResponseResponse::CancelAuth
878        } else if self.credentials.is_some() {
879            self.attempted_authentications
880                .insert(event.request_id.clone().into());
881            AuthChallengeResponseResponse::ProvideCredentials
882        } else {
883            AuthChallengeResponseResponse::Default
884        };
885
886        let mut auth = AuthChallengeResponse::new(response);
887        if let Some(creds) = self.credentials.clone() {
888            auth.username = Some(creds.username);
889            auth.password = Some(creds.password);
890        }
891        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
892    }
893
894    pub fn set_offline_mode(&mut self, value: bool) {
895        if self.offline == value {
896            return;
897        }
898        self.offline = value;
899        if let Ok(network) = EmulateNetworkConditionsParams::builder()
900            .offline(self.offline)
901            .latency(0)
902            .download_throughput(-1.)
903            .upload_throughput(-1.)
904            .build()
905        {
906            self.push_cdp_request(network);
907        }
908    }
909
910    /// Request interception doesn't happen for data URLs with Network Service.
911    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
912        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
913            if let Some(interception_id) = self
914                .request_id_to_interception_id
915                .remove(event.request_id.as_ref())
916            {
917                self.on_request(event, Some(interception_id));
918            } else {
919                // TODO remove the clone for event
920                self.requests_will_be_sent
921                    .insert(event.request_id.clone(), event.clone());
922            }
923        } else {
924            self.on_request(event, None);
925        }
926    }
927
928    /// The request was served from the cache.
929    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
930        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
931            request.from_memory_cache = true;
932        }
933    }
934
935    /// On network response received.
936    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
937        let mut request_failed = false;
938
939        // Track how many bytes we actually deducted from this target.
940        let mut deducted: u64 = 0;
941
942        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
943            let before = *max_bytes;
944
945            // encoded_data_length -> saturating cast to u64
946            let received_bytes: u64 = event.response.encoded_data_length as u64;
947
948            // Safe parse of Content-Length
949            let content_length: Option<u64> = event
950                .response
951                .headers
952                .inner()
953                .get("content-length")
954                .and_then(|v| v.as_str())
955                .and_then(|s| s.trim().parse::<u64>().ok());
956
957            // Deduct what we actually received
958            *max_bytes = max_bytes.saturating_sub(received_bytes);
959
960            // If the declared size can't fit, zero out now
961            if let Some(cl) = content_length {
962                if cl > *max_bytes {
963                    *max_bytes = 0;
964                }
965            }
966
967            request_failed = *max_bytes == 0;
968
969            // Compute exact delta deducted on this event
970            deducted = before.saturating_sub(*max_bytes);
971        }
972
973        // Bubble up the deduction (even if request continues)
974        if deducted > 0 {
975            self.queued_events
976                .push_back(NetworkEvent::BytesConsumed(deducted));
977        }
978
979        // block all network request moving forward.
980        if request_failed && self.max_bytes_allowed.is_some() {
981            self.set_block_all(true);
982        }
983
984        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
985            request.set_response(event.response.clone());
986            self.queued_events.push_back(if request_failed {
987                NetworkEvent::RequestFailed(request)
988            } else {
989                NetworkEvent::RequestFinished(request)
990            });
991        }
992    }
993
994    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
995        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
996            if let Some(interception_id) = request.interception_id.as_ref() {
997                self.attempted_authentications
998                    .remove(interception_id.as_ref());
999            }
1000            self.queued_events
1001                .push_back(NetworkEvent::RequestFinished(request));
1002        }
1003    }
1004
1005    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1006        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1007            request.failure_text = Some(event.error_text.clone());
1008            if let Some(interception_id) = request.interception_id.as_ref() {
1009                self.attempted_authentications
1010                    .remove(interception_id.as_ref());
1011            }
1012            self.queued_events
1013                .push_back(NetworkEvent::RequestFailed(request));
1014        }
1015    }
1016
1017    fn on_request(
1018        &mut self,
1019        event: &EventRequestWillBeSent,
1020        interception_id: Option<InterceptionId>,
1021    ) {
1022        let mut redirect_chain = Vec::new();
1023        let mut redirect_location = None;
1024
1025        if let Some(redirect_resp) = &event.redirect_response {
1026            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1027                if is_redirect_status(redirect_resp.status) {
1028                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1029                        if redirect_resp.url != location {
1030                            let fixed_location = location.replace(&redirect_resp.url, "");
1031
1032                            request.response.as_mut().map(|resp| {
1033                                resp.headers.0["Location"] =
1034                                    serde_json::Value::String(fixed_location.clone());
1035                            });
1036
1037                            redirect_location = Some(fixed_location);
1038                        }
1039                    }
1040                }
1041
1042                self.handle_request_redirect(
1043                    &mut request,
1044                    if let Some(redirect_location) = redirect_location {
1045                        let mut redirect_resp = redirect_resp.clone();
1046
1047                        redirect_resp.headers.0["Location"] =
1048                            serde_json::Value::String(redirect_location);
1049
1050                        redirect_resp
1051                    } else {
1052                        redirect_resp.clone()
1053                    },
1054                );
1055
1056                redirect_chain = std::mem::take(&mut request.redirect_chain);
1057                redirect_chain.push(request);
1058            }
1059        }
1060
1061        let request = HttpRequest::new(
1062            event.request_id.clone(),
1063            event.frame_id.clone(),
1064            interception_id,
1065            self.user_request_interception_enabled,
1066            redirect_chain,
1067        );
1068
1069        self.requests.insert(event.request_id.clone(), request);
1070        self.queued_events
1071            .push_back(NetworkEvent::Request(event.request_id.clone()));
1072    }
1073
1074    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1075        request.set_response(response);
1076        if let Some(interception_id) = request.interception_id.as_ref() {
1077            self.attempted_authentications
1078                .remove(interception_id.as_ref());
1079        }
1080    }
1081}
1082
1083#[derive(Debug)]
1084pub enum NetworkEvent {
1085    SendCdpRequest((MethodId, serde_json::Value)),
1086    Request(RequestId),
1087    Response(RequestId),
1088    RequestFailed(HttpRequest),
1089    RequestFinished(HttpRequest),
1090    BytesConsumed(u64),
1091}