chromiumoxide/handler/
network.rs

1use super::blockers::{
2    block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3    xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6use crate::cmd::CommandChain;
7use crate::handler::http::HttpRequest;
8use aho_corasick::AhoCorasick;
9use case_insensitive_string::CaseInsensitiveString;
10use chromiumoxide_cdp::cdp::browser_protocol::network::{
11    EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
12    EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
13    InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
14    SetExtraHttpHeadersParams,
15};
16use chromiumoxide_cdp::cdp::browser_protocol::{
17    fetch::{
18        self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
19        ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
20        RequestPattern,
21    },
22    network::SetBypassServiceWorkerParams,
23};
24use chromiumoxide_cdp::cdp::browser_protocol::{
25    network::EnableParams, security::SetIgnoreCertificateErrorsParams,
26};
27use chromiumoxide_types::{Command, Method, MethodId};
28use hashbrown::{HashMap, HashSet};
29use lazy_static::lazy_static;
30use reqwest::header::PROXY_AUTHORIZATION;
31use spider_network_blocker::intercept_manager::NetworkInterceptManager;
32pub use spider_network_blocker::scripts::{
33    URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
34};
35use std::collections::VecDeque;
36use std::time::Duration;
37
lazy_static! {
    /// Substrings that mark a script URL as allowed: common framework and
    /// bundle names plus a handful of verified third-party origins.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",           // Covers jquery.min.js, jquery.js, etc.
        "angular",
        "react",            // Covers all React-related patterns
        "vue",              // Covers all Vue-related patterns
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",              // Covers general app scripts like app.js
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "/wp-content/js/",  // Covers Wordpress content
        // Verified 3rd parties for request
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", // webflow cdn scripts
        "https://cdnjs.cloudflare.com/",        // cloudflare cdn scripts
        "https://code.jquery.com/jquery-"
    ];

    /// Matcher built from `JS_FRAMEWORK_ALLOW`. A script URL that matches any
    /// of the patterns is exempt from the `block_javascript` filter.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// Path prefixes of framework-generated bundles. `ignore_script` blocks
    /// `.js` files under these prefixes when no site-specific intercept
    /// manager is configured.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            // Add allowed assets from JS_FRAMEWORK_ASSETS except the excluded ones
            "_astro/", "_app/immutable"
        }
    };

    /// Content (MIME) types to ignore: archives, images, video, audio and
    /// office-style documents.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource types that are blocked when `ignore_visuals` is enabled.
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource types the request-paused handler blocks unconditionally,
    /// independent of the visual/stylesheet/javascript settings.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Manifest",
        "Other",
        "Prefetch",
        "Ping",
    };

    /// Case-insensitive `css` extension used when matching stylesheet requests.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// The initial command chain: only the network `EnableParams` command.
    /// Empty if the params cannot be serialized.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// The initial command chain used when certificate errors are ignored:
    /// the network `EnableParams` command followed by
    /// `SetIgnoreCertificateErrorsParams::new(true)`. A command that fails to
    /// serialize is left out.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)>  = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// Params that enable fetch interception for every URL (`*` pattern) with
    /// auth request handling turned on.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
        .handle_auth_requests(true)
        .pattern(RequestPattern::builder().url_pattern("*").build())
        .build()
    };
}
165
/// Returns `true` when `status` is one of the HTTP redirect status codes
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
170
#[derive(Debug)]
/// The base network manager.
///
/// Holds the interception and blocking configuration for a page and queues
/// the CDP commands that result from it; queued events are drained with
/// `poll`.
pub struct NetworkManager {
    /// Pending events, handed out by `poll` in insertion order.
    queued_events: VecDeque<NetworkEvent>,
    /// When set, `init_commands` uses the chain that also ignores
    /// certificate errors.
    ignore_httpserrors: bool,
    /// Requests keyed by their request id.
    requests: HashMap<RequestId, HttpRequest>,
    // TODO put event in an Arc?
    /// `RequestWillBeSent` events keyed by request id; the request-paused
    /// handler removes the matching entry and forwards it to `on_request`.
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers, as last set through `set_extra_headers`.
    extra_headers: std::collections::HashMap<String, String>,
    /// Maps request ids to interception ids.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the cache is disabled; toggled by `set_cache_enabled`.
    user_cache_disabled: bool,
    /// Request ids for which authentication was already attempted.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials supplied through `authenticate`, if any.
    credentials: Option<Credentials>,
    /// Request interception was requested through `set_request_interception`.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed with `BlockedByClient`.
    block_all: bool,
    /// Interception state at the protocol level. When this and
    /// `user_request_interception_enabled` are both set, the request-paused
    /// handler returns without responding.
    pub(crate) protocol_request_interception_enabled: bool,
    /// The network is offline.
    offline: bool,
    /// The page request timeout; also used as the timeout of the init
    /// command chain.
    pub request_timeout: Duration,
    // made_request: bool,
    /// Ignore visual resources (the types in `IGNORE_VISUAL_RESOURCE_MAP`);
    /// also enables the style-path, embedded-script and media XHR filters.
    pub ignore_visuals: bool,
    /// Block CSS stylesheets.
    pub block_stylesheets: bool,
    /// Block javascript that is not matched by `ALLOWED_MATCHER`.
    pub block_javascript: bool,
    /// Block analytics and tracker scripts and requests.
    pub block_analytics: bool,
    /// Only load html.
    /// NOTE(review): in the visible handler this flag only enables the
    /// embedded-script filter; confirm whether it is enforced elsewhere.
    pub only_html: bool,
    /// Set when the first document URL ends with `.xml`; `.xsl` requests are
    /// then let through the main resource-type filter.
    pub xml_document: bool,
    /// The custom intercept handle logic to run on the website.
    pub intercept_manager: NetworkInterceptManager,
    /// Number of document requests that repeated the current target URL.
    /// Once it reaches 3 every paused request is blocked.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request seen by the request-paused
    /// handler (empty until the first one).
    pub document_target_domain: String,
    /// The max bytes to receive.
    pub max_bytes_allowed: Option<u64>,
}
213
214impl NetworkManager {
215    pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
216        Self {
217            queued_events: Default::default(),
218            ignore_httpserrors,
219            requests: Default::default(),
220            requests_will_be_sent: Default::default(),
221            extra_headers: Default::default(),
222            request_id_to_interception_id: Default::default(),
223            user_cache_disabled: false,
224            attempted_authentications: Default::default(),
225            credentials: None,
226            block_all: false,
227            user_request_interception_enabled: false,
228            protocol_request_interception_enabled: false,
229            offline: false,
230            request_timeout,
231            ignore_visuals: false,
232            block_javascript: false,
233            block_stylesheets: false,
234            block_analytics: true,
235            only_html: false,
236            xml_document: false,
237            intercept_manager: NetworkInterceptManager::Unknown,
238            document_reload_tracker: 0,
239            document_target_domain: String::new(),
240            max_bytes_allowed: None,
241        }
242    }
243
244    pub fn init_commands(&self) -> CommandChain {
245        let cmds = if self.ignore_httpserrors {
246            INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
247        } else {
248            INIT_CHAIN.clone()
249        };
250        CommandChain::new(cmds, self.request_timeout)
251    }
252
253    pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
254        let method = cmd.identifier();
255        if let Ok(params) = serde_json::to_value(cmd) {
256            self.queued_events
257                .push_back(NetworkEvent::SendCdpRequest((method, params)));
258        }
259    }
260
    /// Take the next queued event, oldest first. Returns `None` when the
    /// queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
265
    /// The extra HTTP headers currently stored on the manager.
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
269
270    pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
271        self.extra_headers = headers;
272        self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
273        self.extra_headers.remove("Proxy-Authorization");
274        if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
275            self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
276        }
277    }
278
    /// Queue a `SetBypassServiceWorkerParams` command with `bypass`.
    ///
    /// NOTE(review): despite the method name, `bypass` is forwarded
    /// unchanged, so `true` presumably makes requests bypass service workers
    /// rather than enabling them; confirm that callers pass the inverted
    /// flag.
    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
    }
282
    /// Set whether every paused request is failed with `BlockedByClient`.
    /// Only the flag is stored; no command is queued here.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
286
    /// Record whether request interception is wanted and queue the fetch
    /// enable/disable command if the protocol state needs to change.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        self.update_protocol_request_interception();
    }
291
292    pub fn set_cache_enabled(&mut self, enabled: bool) {
293        let run = self.user_cache_disabled != !enabled;
294        self.user_cache_disabled = !enabled;
295        if run {
296            self.update_protocol_cache_disabled();
297        }
298    }
299
    /// Mark protocol-level interception as already enabled.
    ///
    /// NOTE(review): despite the name this sets the flag to `true` and sends
    /// no command. With the flag set, `update_protocol_request_interception`
    /// returns early whenever interception is wanted, so the fetch enable
    /// command is never queued; confirm this is the intended way to opt out.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
303
    /// Queue a `SetCacheDisabledParams` command reflecting the stored
    /// `user_cache_disabled` flag.
    pub fn update_protocol_cache_disabled(&mut self) {
        self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
    }
307
    /// Store `credentials` and make sure fetch interception is enabled.
    ///
    /// The interception update runs before the flag is set, so the fetch
    /// enable command is queued only if the flag was not already `true`.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        self.update_protocol_request_interception();
        // Record the new state only after the update above has compared
        // against the previous one.
        self.protocol_request_interception_enabled = true;
    }
313
314    fn update_protocol_request_interception(&mut self) {
315        let enabled = self.user_request_interception_enabled || self.credentials.is_some();
316
317        if enabled == self.protocol_request_interception_enabled {
318            return;
319        }
320
321        if enabled {
322            self.push_cdp_request(ENABLE_FETCH.clone())
323        } else {
324            self.push_cdp_request(DisableParams::default())
325        }
326    }
327
328    /// Url matches analytics that we want to ignore or trackers.
329    pub(crate) fn ignore_script(
330        &self,
331        url: &str,
332        block_analytics: bool,
333        intercept_manager: NetworkInterceptManager,
334    ) -> bool {
335        let mut ignore_script = block_analytics
336            && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url);
337
338        if !ignore_script {
339            if let Some(index) = url.find("//") {
340                let pos = index + 2;
341
342                // Ensure there is something after `//`
343                if pos < url.len() {
344                    // Find the first slash after the `//`
345                    if let Some(slash_index) = url[pos..].find('/') {
346                        let base_path_index = pos + slash_index + 1;
347
348                        if url.len() > base_path_index {
349                            let new_url: &str = &url[base_path_index..];
350
351                            // ignore assets we do not need for frameworks
352                            if !ignore_script
353                                && intercept_manager == NetworkInterceptManager::Unknown
354                            {
355                                let hydration_file =
356                                    JS_FRAMEWORK_PATH.iter().any(|p| new_url.starts_with(p));
357
358                                // ignore astro paths
359                                if hydration_file && new_url.ends_with(".js") {
360                                    ignore_script = true;
361                                }
362                            }
363
364                            if !ignore_script
365                                && URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(new_url)
366                            {
367                                ignore_script = true;
368                            }
369
370                            if !ignore_script
371                                && self.ignore_visuals
372                                && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(new_url)
373                            {
374                                ignore_script = true;
375                            }
376                        }
377                    }
378                }
379            }
380        }
381
382        // fallback for file ending in analytics.js
383        if !ignore_script && block_analytics {
384            ignore_script = URL_IGNORE_TRIE_PATHS.contains_prefix(url);
385        }
386
387        ignore_script
388    }
389
390    /// Determine if the request should be skipped.
391    fn skip_xhr(
392        &self,
393        skip_networking: bool,
394        event: &EventRequestPaused,
395        network_event: bool,
396    ) -> bool {
397        // XHR check
398        if !skip_networking && network_event {
399            let request_url = event.request.url.as_str();
400
401            // check if part of ignore scripts.
402            let skip_analytics =
403                self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
404
405            if skip_analytics {
406                true
407            } else if self.block_stylesheets || self.ignore_visuals {
408                let block_css = self.block_stylesheets;
409                let block_media = self.ignore_visuals;
410
411                let mut block_request = false;
412
413                if let Some(position) = request_url.rfind('.') {
414                    let hlen = request_url.len();
415                    let has_asset = hlen - position;
416
417                    if has_asset >= 3 {
418                        let next_position = position + 1;
419
420                        if block_media
421                            && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
422                                &request_url[next_position..].into(),
423                            )
424                        {
425                            block_request = true;
426                        } else if block_css {
427                            block_request =
428                                CaseInsensitiveString::from(request_url[next_position..].as_bytes())
429                                    .contains(&**CSS_EXTENSION)
430                        }
431                    }
432                }
433
434                if !block_request {
435                    block_request = ignore_script_xhr_media(request_url);
436                }
437
438                block_request
439            } else {
440                skip_networking
441            }
442        } else {
443            skip_networking
444        }
445    }
446
447    #[cfg(not(feature = "adblock"))]
448    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
449        use super::blockers::block_websites::block_website;
450
451        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
452            return;
453        }
454
455        if self.block_all {
456            use chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason;
457            tracing::debug!("Blocked: {:?} - {}", event.resource_type, event.request.url);
458            let fullfill_params = crate::handler::network::fetch::FailRequestParams::new(
459                event.request_id.clone(),
460                ErrorReason::BlockedByClient,
461            );
462            self.push_cdp_request(fullfill_params);
463        } else {
464            if let Some(network_id) = event.network_id.as_ref() {
465                if let Some(request_will_be_sent) =
466                    self.requests_will_be_sent.remove(network_id.as_ref())
467                {
468                    self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
469                } else {
470                    let current_url = event.request.url.as_str();
471                    let javascript_resource = event.resource_type == ResourceType::Script;
472                    let document_resource = event.resource_type == ResourceType::Document;
473                    let network_resource =
474                        !document_resource && crate::utils::is_data_resource(&event.resource_type);
475
476                    let skip_networking = self.block_all
477                        || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
478
479                    let skip_networking = skip_networking || self.document_reload_tracker >= 3;
480                    let mut replacer = None;
481
482                    if document_resource {
483                        if self.document_target_domain == current_url {
484                            // this will prevent the domain from looping (3 times is enough).
485                            self.document_reload_tracker += 1;
486                        } else if !self.document_target_domain.is_empty()
487                            && event.redirected_request_id.is_some()
488                        {
489                            let (http_document_replacement, mut https_document_replacement) =
490                                if self.document_target_domain.starts_with("http://") {
491                                    (
492                                        self.document_target_domain.replace("http://", "http//"),
493                                        self.document_target_domain.replace("http://", "https://"),
494                                    )
495                                } else {
496                                    (
497                                        self.document_target_domain.replace("https://", "https//"),
498                                        self.document_target_domain.replace("https://", "http://"),
499                                    )
500                                };
501
502                            let trailing = https_document_replacement.ends_with('/');
503
504                            if trailing {
505                                https_document_replacement.pop();
506                            }
507
508                            if https_document_replacement.ends_with('/') {
509                                https_document_replacement.pop();
510                            }
511
512                            let redirect_mask = format!(
513                                "{}{}",
514                                https_document_replacement, http_document_replacement
515                            );
516
517                            // handle redirect masking
518                            if current_url == redirect_mask {
519                                replacer = Some(if trailing {
520                                    format!("{}/", https_document_replacement)
521                                } else {
522                                    https_document_replacement
523                                });
524                            }
525                        }
526
527                        if self.document_target_domain.is_empty() && current_url.ends_with(".xml") {
528                            self.xml_document = true;
529                        }
530
531                        self.document_target_domain = event.request.url.clone();
532                    }
533
534                    let current_url = match &replacer {
535                        Some(r) => r,
536                        _ => &event.request.url,
537                    }
538                    .as_str();
539
540                    // main initial check
541                    let skip_networking = if !skip_networking {
542                        // allow sitemap xml building xsl
543                        if self.xml_document && current_url.ends_with(".xsl") {
544                            false
545                        } else {
546                            self.ignore_visuals
547                                && (IGNORE_VISUAL_RESOURCE_MAP
548                                    .contains(event.resource_type.as_ref()))
549                                || self.block_stylesheets
550                                    && ResourceType::Stylesheet == event.resource_type
551                                || self.block_javascript
552                                    && javascript_resource
553                                    && self.intercept_manager == NetworkInterceptManager::Unknown
554                                    && !ALLOWED_MATCHER.is_match(current_url)
555                        }
556                    } else {
557                        skip_networking
558                    };
559
560                    let skip_networking = if !skip_networking
561                        && (self.only_html || self.ignore_visuals)
562                        && (javascript_resource || document_resource)
563                    {
564                        ignore_script_embedded(current_url)
565                    } else {
566                        skip_networking
567                    };
568
569                    // analytics check
570                    let skip_networking = if !skip_networking && javascript_resource {
571                        self.ignore_script(
572                            current_url,
573                            self.block_analytics,
574                            self.intercept_manager,
575                        )
576                    } else {
577                        skip_networking
578                    };
579
580                    // XHR check
581                    let skip_networking = self.skip_xhr(skip_networking, &event, network_resource);
582
583                    // custom interception layer.
584                    let skip_networking = if !skip_networking
585                        && (javascript_resource || network_resource || document_resource)
586                    {
587                        self.intercept_manager.intercept_detection(
588                            &current_url,
589                            self.ignore_visuals,
590                            network_resource,
591                        )
592                    } else {
593                        skip_networking
594                    };
595
596                    let skip_networking =
597                        if !skip_networking && (javascript_resource || network_resource) {
598                            block_website(&current_url)
599                        } else {
600                            skip_networking
601                        };
602
603                    if skip_networking {
604                        tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
605                        let fullfill_params =
606                            crate::handler::network::fetch::FulfillRequestParams::new(
607                                event.request_id.clone(),
608                                200,
609                            );
610                        self.push_cdp_request(fullfill_params);
611                    } else {
612                        tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
613                        let mut continue_params =
614                            ContinueRequestParams::new(event.request_id.clone());
615
616                        if replacer.is_some() {
617                            continue_params.url = Some(current_url.into());
618                            continue_params.intercept_response = Some(false);
619                        }
620
621                        self.push_cdp_request(continue_params)
622                    }
623                }
624            } else {
625                self.push_cdp_request(ContinueRequestParams::new(event.request_id.clone()))
626            }
627        }
628    }
629
630    #[cfg(feature = "adblock")]
631    pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
632        if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
633            return;
634        }
635
636        if self.block_all {
637            use chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason;
638            tracing::debug!("Blocked: {:?} - {}", event.resource_type, event.request.url);
639            let fullfill_params = crate::handler::network::fetch::FailRequestParams::new(
640                event.request_id.clone(),
641                ErrorReason::BlockedByClient,
642            );
643            self.push_cdp_request(fullfill_params);
644        } else {
645            if let Some(network_id) = event.network_id.as_ref() {
646                if let Some(request_will_be_sent) =
647                    self.requests_will_be_sent.remove(network_id.as_ref())
648                {
649                    self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
650                } else {
651                    let current_url = event.request.url.as_str();
652                    let javascript_resource = event.resource_type == ResourceType::Script;
653                    let document_resource = event.resource_type == ResourceType::Document;
654                    let network_resource =
655                        !document_resource && crate::utils::is_data_resource(&event.resource_type);
656                    let mut replacer = None;
657
658                    // block all of these events.
659                    let skip_networking = self.block_all
660                        || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
661
662                    let skip_networking = skip_networking || self.document_reload_tracker >= 3;
663
664                    if document_resource {
665                        if self.document_target_domain == current_url {
666                            // this will prevent the domain from looping (3 times is enough).
667                            self.document_reload_tracker += 1;
668                        } else if !self.document_target_domain.is_empty()
669                            && event.redirected_request_id.is_some()
670                        {
671                            let (http_document_replacement, mut https_document_replacement) =
672                                if self.document_target_domain.starts_with("http://") {
673                                    (
674                                        self.document_target_domain.replace("http://", "http//"),
675                                        self.document_target_domain.replace("http://", "https://"),
676                                    )
677                                } else {
678                                    (
679                                        self.document_target_domain.replace("https://", "https//"),
680                                        self.document_target_domain.replace("https://", "http://"),
681                                    )
682                                };
683
684                            let trailing = https_document_replacement.ends_with('/');
685
686                            if trailing {
687                                https_document_replacement.pop();
688                            }
689
690                            if https_document_replacement.ends_with('/') {
691                                https_document_replacement.pop();
692                            }
693
694                            let redirect_mask = format!(
695                                "{}{}",
696                                https_document_replacement, http_document_replacement
697                            );
698
699                            // handle redirect masking
700                            if current_url == redirect_mask {
701                                replacer = Some(if trailing {
702                                    format!("{}/", https_document_replacement)
703                                } else {
704                                    https_document_replacement
705                                });
706                            }
707                        }
708
709                        if self.document_target_domain.is_empty() && current_url.ends_with(".xml") {
710                            self.xml_document = true;
711                        }
712
713                        self.document_target_domain = event.request.url.clone();
714                    }
715
716                    let current_url = match &replacer {
717                        Some(r) => r,
718                        _ => &event.request.url,
719                    }
720                    .as_str();
721
722                    // main initial check
723                    let skip_networking = if !skip_networking {
724                        // allow sitemap xml building xsl
725                        if self.xml_document && current_url.ends_with(".xsl") {
726                            false
727                        } else {
728                            self.ignore_visuals
729                                && (IGNORE_VISUAL_RESOURCE_MAP
730                                    .contains(event.resource_type.as_ref()))
731                                || self.block_stylesheets
732                                    && ResourceType::Stylesheet == event.resource_type
733                                || self.block_javascript
734                                    && javascript_resource
735                                    && self.intercept_manager == NetworkInterceptManager::Unknown
736                                    && !ALLOWED_MATCHER.is_match(current_url)
737                        }
738                    } else {
739                        skip_networking
740                    };
741
742                    let skip_networking = if !skip_networking {
743                        self.detect_ad(event)
744                    } else {
745                        skip_networking
746                    };
747
748                    let skip_networking = if !skip_networking
749                        && (self.only_html || self.ignore_visuals)
750                        && (javascript_resource || document_resource)
751                    {
752                        ignore_script_embedded(current_url)
753                    } else {
754                        skip_networking
755                    };
756
757                    // analytics check
758                    let skip_networking = if !skip_networking && javascript_resource {
759                        self.ignore_script(
760                            current_url,
761                            self.block_analytics,
762                            self.intercept_manager,
763                        )
764                    } else {
765                        skip_networking
766                    };
767
768                    // XHR check
769                    let skip_networking = self.skip_xhr(skip_networking, &event, network_resource);
770
771                    // custom interception layer.
772                    let skip_networking = if !skip_networking
773                        && (javascript_resource || network_resource || document_resource)
774                    {
775                        self.intercept_manager.intercept_detection(
776                            &event.request.url,
777                            self.ignore_visuals,
778                            network_resource,
779                        )
780                    } else {
781                        skip_networking
782                    };
783
784                    let skip_networking = if !skip_networking
785                        && (javascript_resource || network_resource)
786                    {
787                        crate::handler::blockers::block_websites::block_website(&event.request.url)
788                    } else {
789                        skip_networking
790                    };
791
792                    if skip_networking {
793                        tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
794
795                        let fullfill_params =
796                            crate::handler::network::fetch::FulfillRequestParams::new(
797                                event.request_id.clone(),
798                                200,
799                            );
800                        self.push_cdp_request(fullfill_params);
801                    } else {
802                        tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
803
804                        let mut continue_params =
805                            ContinueRequestParams::new(event.request_id.clone());
806
807                        if replacer.is_some() {
808                            continue_params.url = Some(current_url.into());
809                            continue_params.intercept_response = Some(false);
810                        }
811                    }
812                }
813            } else {
814                self.push_cdp_request(ContinueRequestParams::new(event.request_id.clone()))
815            }
816        }
817
818        // if self.only_html {
819        //     self.made_request = true;
820        // }
821    }
822
823    /// Perform a page intercept for chrome
824    #[cfg(feature = "adblock")]
825    pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
826        use adblock::{
827            lists::{FilterSet, ParseOptions, RuleTypes},
828            Engine,
829        };
830
831        lazy_static::lazy_static! {
832            static ref AD_ENGINE: Engine = {
833                let mut filter_set = FilterSet::new(false);
834                let mut rules = ParseOptions::default();
835                rules.rule_types = RuleTypes::All;
836
837                filter_set.add_filters(
838                    &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
839                    rules,
840                );
841
842                Engine::from_filter_set(filter_set, true)
843            };
844        };
845
846        let blockable = ResourceType::Image == event.resource_type
847            || event.resource_type == ResourceType::Media
848            || event.resource_type == ResourceType::Stylesheet
849            || event.resource_type == ResourceType::Document
850            || event.resource_type == ResourceType::Fetch
851            || event.resource_type == ResourceType::Xhr;
852
853        let u = &event.request.url;
854
855        let block_request = blockable
856            // set it to example.com for 3rd party handling is_same_site
857        && {
858            let request = adblock::request::Request::preparsed(
859                 &u,
860                 "example.com",
861                 "example.com",
862                 &event.resource_type.as_ref().to_lowercase(),
863                 !event.request.is_same_site.unwrap_or_default());
864
865            AD_ENGINE.check_network_request(&request).matched
866        };
867
868        block_request
869    }
870
871    pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
872        let response = if self
873            .attempted_authentications
874            .contains(event.request_id.as_ref())
875        {
876            AuthChallengeResponseResponse::CancelAuth
877        } else if self.credentials.is_some() {
878            self.attempted_authentications
879                .insert(event.request_id.clone().into());
880            AuthChallengeResponseResponse::ProvideCredentials
881        } else {
882            AuthChallengeResponseResponse::Default
883        };
884
885        let mut auth = AuthChallengeResponse::new(response);
886        if let Some(creds) = self.credentials.clone() {
887            auth.username = Some(creds.username);
888            auth.password = Some(creds.password);
889        }
890        self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
891    }
892
893    pub fn set_offline_mode(&mut self, value: bool) {
894        if self.offline == value {
895            return;
896        }
897        self.offline = value;
898        if let Ok(network) = EmulateNetworkConditionsParams::builder()
899            .offline(self.offline)
900            .latency(0)
901            .download_throughput(-1.)
902            .upload_throughput(-1.)
903            .build()
904        {
905            self.push_cdp_request(network);
906        }
907    }
908
909    /// Request interception doesn't happen for data URLs with Network Service.
910    pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
911        if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
912            if let Some(interception_id) = self
913                .request_id_to_interception_id
914                .remove(event.request_id.as_ref())
915            {
916                self.on_request(event, Some(interception_id));
917            } else {
918                // TODO remove the clone for event
919                self.requests_will_be_sent
920                    .insert(event.request_id.clone(), event.clone());
921            }
922        } else {
923            self.on_request(event, None);
924        }
925    }
926
927    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
928        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
929            request.from_memory_cache = true;
930        }
931    }
932
933    /// On network response received.
934    pub fn on_response_received(&mut self, event: &EventResponseReceived) {
935        let mut request_failed = false;
936
937        // Track how many bytes we actually deducted from this target
938        let mut deducted: u64 = 0;
939
940        if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
941            let before = *max_bytes;
942
943            // encoded_data_length -> saturating cast to u64
944            let received_bytes: u64 = event.response.encoded_data_length as u64;
945
946            // Safe parse of Content-Length
947            let content_length: Option<u64> = event
948                .response
949                .headers
950                .inner()
951                .get("content-length")
952                .and_then(|v| v.as_str())
953                .and_then(|s| s.trim().parse::<u64>().ok());
954
955            // Deduct what we actually received
956            *max_bytes = max_bytes.saturating_sub(received_bytes);
957
958            // If the declared size can't fit, zero out now
959            if let Some(cl) = content_length {
960                if cl > *max_bytes {
961                    *max_bytes = 0;
962                }
963            }
964
965            request_failed = *max_bytes == 0;
966
967            // Compute exact delta deducted on this event
968            deducted = before.saturating_sub(*max_bytes);
969        }
970
971        // Bubble up the deduction (even if request continues)
972        if deducted > 0 {
973            self.queued_events
974                .push_back(NetworkEvent::BytesConsumed(deducted));
975        }
976
977        // block all network request moving forward.
978        if request_failed && self.max_bytes_allowed.is_some() {
979            self.set_block_all(true);
980        }
981
982        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
983            request.set_response(event.response.clone());
984            self.queued_events.push_back(if request_failed {
985                NetworkEvent::RequestFailed(request)
986            } else {
987                NetworkEvent::RequestFinished(request)
988            });
989        }
990    }
991
992    pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
993        if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
994            if let Some(interception_id) = request.interception_id.as_ref() {
995                self.attempted_authentications
996                    .remove(interception_id.as_ref());
997            }
998            self.queued_events
999                .push_back(NetworkEvent::RequestFinished(request));
1000        }
1001    }
1002
1003    pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1004        if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1005            request.failure_text = Some(event.error_text.clone());
1006            if let Some(interception_id) = request.interception_id.as_ref() {
1007                self.attempted_authentications
1008                    .remove(interception_id.as_ref());
1009            }
1010            self.queued_events
1011                .push_back(NetworkEvent::RequestFailed(request));
1012        }
1013    }
1014
1015    fn on_request(
1016        &mut self,
1017        event: &EventRequestWillBeSent,
1018        interception_id: Option<InterceptionId>,
1019    ) {
1020        let mut redirect_chain = Vec::new();
1021        let mut redirect_location = None;
1022
1023        if let Some(redirect_resp) = &event.redirect_response {
1024            if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1025                if is_redirect_status(redirect_resp.status) {
1026                    if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1027                        if redirect_resp.url != location {
1028                            let fixed_location = location.replace(&redirect_resp.url, "");
1029
1030                            request.response.as_mut().map(|resp| {
1031                                resp.headers.0["Location"] =
1032                                    serde_json::Value::String(fixed_location.clone());
1033                            });
1034
1035                            redirect_location = Some(fixed_location);
1036                        }
1037                    }
1038                }
1039
1040                self.handle_request_redirect(
1041                    &mut request,
1042                    if let Some(redirect_location) = redirect_location {
1043                        let mut redirect_resp = redirect_resp.clone();
1044
1045                        redirect_resp.headers.0["Location"] =
1046                            serde_json::Value::String(redirect_location);
1047
1048                        redirect_resp
1049                    } else {
1050                        redirect_resp.clone()
1051                    },
1052                );
1053
1054                redirect_chain = std::mem::take(&mut request.redirect_chain);
1055                redirect_chain.push(request);
1056            }
1057        }
1058
1059        let request = HttpRequest::new(
1060            event.request_id.clone(),
1061            event.frame_id.clone(),
1062            interception_id,
1063            self.user_request_interception_enabled,
1064            redirect_chain,
1065        );
1066
1067        self.requests.insert(event.request_id.clone(), request);
1068        self.queued_events
1069            .push_back(NetworkEvent::Request(event.request_id.clone()));
1070    }
1071
1072    fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1073        request.set_response(response);
1074        if let Some(interception_id) = request.interception_id.as_ref() {
1075            self.attempted_authentications
1076                .remove(interception_id.as_ref());
1077        }
1078    }
1079}
1080
1081#[derive(Debug)]
1082pub enum NetworkEvent {
1083    SendCdpRequest((MethodId, serde_json::Value)),
1084    Request(RequestId),
1085    Response(RequestId),
1086    RequestFailed(HttpRequest),
1087    RequestFinished(HttpRequest),
1088    BytesConsumed(u64),
1089}