1use super::blockers::{
2 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3 xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15 EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17 InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18 SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21 fetch::{
22 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24 },
25 network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
42lazy_static! {
43 static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
45 "jquery", "angular",
47 "react", "vue", "bootstrap",
50 "d3",
51 "lodash",
52 "ajax",
53 "application",
54 "app", "main",
56 "index",
57 "bundle",
58 "vendor",
59 "runtime",
60 "polyfill",
61 "scripts",
62 "es2015.",
63 "es2020.",
64 "webpack",
65 "/cdn-cgi/challenge-platform/",
66 "/wp-content/js/", "https://m.stripe.network/",
69 "https://challenges.cloudflare.com/",
70 "https://www.google.com/recaptcha/enterprise.js",
71 "https://www.google.com/recaptcha/api.js",
72 "https://google.com/recaptcha/api.js",
73 "https://captcha.px-cloud.net/",
74 "https://cdn.auth0.com/js/lock/",
75 "https://cdn.auth0.com/client",
76 "https://js.stripe.com/",
77 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
80 ];
81
82 pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
87
88 static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
90 "https://m.stripe.network/",
92 "https://challenges.cloudflare.com/",
93 "https://www.google.com/recaptcha/api.js",
94 "https://google.com/recaptcha/api.js",
95 "https://www.google.com/recaptcha/enterprise.js",
96 "https://js.stripe.com/",
97 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
100 "https://ct.captcha-delivery.com/",
101 "https://geo.captcha-delivery.com/captcha/",
102 "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://ct.captcha-delivery.com/",
104 "https://cdn.auth0.com/client",
105 "https://captcha.px-cloud.net/",
106 "https://www.gstatic.com/recaptcha/",
107 "https://www.google.com/recaptcha/api2/",
108 "https://www.recaptcha.net/recaptcha/",
109 "https://www.recaptcha.net/recaptcha/api2/",
110 "https://js.hcaptcha.com/1/api.js",
111 "https://hcaptcha.com/1/api.js",
112 "https://js.datadome.co/tags.js",
113 "https://api-js.datadome.co/",
114 "https://client.perimeterx.net/",
115 "https://captcha.px-cdn.net/",
116 "https://captcha.px-cloud.net/",
117 "https://s.perimeterx.net/",
118 "https://client-api.arkoselabs.com/v2/",
119 "https://static.geetest.com/v4/gt4.js",
120 "https://static.geetest.com/",
121 "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
122 "https://cdn.perfdrive.com/aperture/",
123 "https://assets.queue-it.net/",
124 "/cdn-cgi/challenge-platform/",
125 "/_Incapsula_Resource",
126 "discourse-cdn.com/"
127 ];
128
129 pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
131
132 pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
134 phf::phf_set! {
135 "_astro/", "_app/immutable"
137 }
138 };
139
140 pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
142 "application/pdf",
143 "application/zip",
144 "application/x-rar-compressed",
145 "application/x-tar",
146 "image/png",
147 "image/jpeg",
148 "image/gif",
149 "image/bmp",
150 "image/webp",
151 "image/svg+xml",
152 "video/mp4",
153 "video/x-msvideo",
154 "video/x-matroska",
155 "video/webm",
156 "audio/mpeg",
157 "audio/ogg",
158 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
159 "application/vnd.ms-excel",
160 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
161 "application/vnd.ms-powerpoint",
162 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
163 "application/x-7z-compressed",
164 "application/x-rpm",
165 "application/x-shockwave-flash",
166 "application/rtf",
167 };
168
169 pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
171 "Image",
172 "Media",
173 "Font"
174 };
175
176 pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
178 "CspViolationReport",
179 "Manifest",
180 "Other",
181 "Prefetch",
182 "Ping",
183 };
184
185 pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
187
188 pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
190 let enable = EnableParams::default();
191
192 if let Ok(c) = serde_json::to_value(&enable) {
193 vec![(enable.identifier(), c)]
194 } else {
195 vec![]
196 }
197 };
198
199 pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
201 let enable = EnableParams::default();
202 let mut v = vec![];
203 if let Ok(c) = serde_json::to_value(&enable) {
204 v.push((enable.identifier(), c));
205 }
206 let ignore = SetIgnoreCertificateErrorsParams::new(true);
207 if let Ok(ignored) = serde_json::to_value(&ignore) {
208 v.push((ignore.identifier(), ignored));
209 }
210
211 v
212 };
213
214 pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
216 fetch::EnableParams::builder()
217 .handle_auth_requests(true)
218 .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
219 .build()
220 };
221}
222
/// Returns `true` when `status` is one of the redirect codes recognised here:
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    [301, 302, 303, 307, 308].contains(&status)
}
227
228#[derive(Debug)]
229pub struct NetworkManager {
231 queued_events: VecDeque<NetworkEvent>,
237 ignore_httpserrors: bool,
242 requests: HashMap<RequestId, HttpRequest>,
247 requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
254 extra_headers: std::collections::HashMap<String, String>,
259 request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
265 user_cache_disabled: bool,
270 attempted_authentications: HashSet<RequestId>,
276 credentials: Option<Credentials>,
281 pub(crate) user_request_interception_enabled: bool,
290 block_all: bool,
297 pub(crate) protocol_request_interception_enabled: bool,
303 offline: bool,
305 pub request_timeout: Duration,
307 pub ignore_visuals: bool,
310 pub block_stylesheets: bool,
312 pub block_javascript: bool,
317 pub block_analytics: bool,
319 pub only_html: bool,
321 pub xml_document: bool,
323 pub intercept_manager: NetworkInterceptManager,
325 pub document_reload_tracker: u8,
327 pub document_target_url: String,
329 pub document_target_domain: String,
331 pub max_bytes_allowed: Option<u64>,
333 #[cfg(feature = "_cache")]
334 pub cache_site_key: Option<String>,
336 #[cfg(feature = "_cache")]
338 pub cache_policy: Option<BasicCachePolicy>,
339 whitelist_patterns: Vec<String>,
341 whitelist_matcher: Option<AhoCorasick>,
343}
344
345impl NetworkManager {
346 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
348 Self {
349 queued_events: Default::default(),
350 ignore_httpserrors,
351 requests: Default::default(),
352 requests_will_be_sent: Default::default(),
353 extra_headers: Default::default(),
354 request_id_to_interception_id: Default::default(),
355 user_cache_disabled: false,
356 attempted_authentications: Default::default(),
357 credentials: None,
358 block_all: false,
359 user_request_interception_enabled: false,
360 protocol_request_interception_enabled: false,
361 offline: false,
362 request_timeout,
363 ignore_visuals: false,
364 block_javascript: false,
365 block_stylesheets: false,
366 block_analytics: true,
367 only_html: false,
368 xml_document: false,
369 intercept_manager: NetworkInterceptManager::Unknown,
370 document_reload_tracker: 0,
371 document_target_url: String::new(),
372 document_target_domain: String::new(),
373 whitelist_patterns: Vec::new(),
374 whitelist_matcher: None,
375 max_bytes_allowed: None,
376 #[cfg(feature = "_cache")]
377 cache_site_key: None,
378 #[cfg(feature = "_cache")]
379 cache_policy: None,
380 }
381 }
382
383 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
385 where
386 I: IntoIterator<Item = S>,
387 S: Into<String>,
388 {
389 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
390 self.rebuild_whitelist_matcher();
391 }
392
393 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
395 self.whitelist_patterns.push(pattern.into());
396 self.rebuild_whitelist_matcher();
397 }
398
399 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
401 where
402 I: IntoIterator<Item = S>,
403 S: Into<String>,
404 {
405 self.whitelist_patterns
406 .extend(patterns.into_iter().map(Into::into));
407 self.rebuild_whitelist_matcher();
408 }
409
410 #[inline]
411 fn rebuild_whitelist_matcher(&mut self) {
412 if self.whitelist_patterns.is_empty() {
413 self.whitelist_matcher = None;
414 return;
415 }
416
417 let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
418
419 self.whitelist_matcher = AhoCorasick::new(refs).ok();
421 }
422
423 #[inline]
424 fn is_whitelisted(&self, url: &str) -> bool {
425 self.whitelist_matcher
426 .as_ref()
427 .map(|m| m.is_match(url))
428 .unwrap_or(false)
429 }
430
431 pub fn init_commands(&self) -> CommandChain {
433 let cmds = if self.ignore_httpserrors {
434 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
435 } else {
436 INIT_CHAIN.clone()
437 };
438 CommandChain::new(cmds, self.request_timeout)
439 }
440
441 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
443 let method = cmd.identifier();
444 if let Ok(params) = serde_json::to_value(cmd) {
445 self.queued_events
446 .push_back(NetworkEvent::SendCdpRequest((method, params)));
447 }
448 }
449
450 pub fn poll(&mut self) -> Option<NetworkEvent> {
452 self.queued_events.pop_front()
453 }
454
455 pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
457 &self.extra_headers
458 }
459
460 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
462 self.extra_headers = headers;
463 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
464 self.extra_headers.remove("Proxy-Authorization");
465 if !self.extra_headers.is_empty() {
466 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
467 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
468 }
469 }
470 }
471
472 pub fn set_service_worker_enabled(&mut self, bypass: bool) {
473 self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
474 }
475
476 pub fn set_block_all(&mut self, block_all: bool) {
477 self.block_all = block_all;
478 }
479
480 pub fn set_request_interception(&mut self, enabled: bool) {
481 self.user_request_interception_enabled = enabled;
482 self.update_protocol_request_interception();
483 }
484
485 pub fn set_cache_enabled(&mut self, enabled: bool) {
486 let run = self.user_cache_disabled != !enabled;
487 self.user_cache_disabled = !enabled;
488 if run {
489 self.update_protocol_cache_disabled();
490 }
491 }
492
493 pub fn enable_request_intercept(&mut self) {
495 self.protocol_request_interception_enabled = true;
496 }
497
498 pub fn disable_request_intercept(&mut self) {
500 self.protocol_request_interception_enabled = false;
501 }
502
/// Sets (or clears) the key used for session-cache lookups.
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    self.cache_site_key = cache_site_key;
}
508
/// Sets (or clears) the policy that decides whether cached entries are served.
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    self.cache_policy = cache_policy;
}
514
515 pub fn update_protocol_cache_disabled(&mut self) {
516 self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
517 }
518
519 pub fn authenticate(&mut self, credentials: Credentials) {
520 self.credentials = Some(credentials);
521 self.update_protocol_request_interception();
522 self.protocol_request_interception_enabled = true;
523 }
524
525 fn update_protocol_request_interception(&mut self) {
526 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
527
528 if enabled == self.protocol_request_interception_enabled {
529 return;
530 }
531
532 if enabled {
533 self.push_cdp_request(ENABLE_FETCH.clone())
534 } else {
535 self.push_cdp_request(DisableParams::default())
536 }
537 }
538
/// Returns the part of `url` after the first `/` that follows the first `//`,
/// i.e. the path without its leading slash (query and fragment included).
///
/// Returns `None` when `url` has no `//`, no `/` after it, or nothing after
/// that slash.
#[inline]
fn url_path_no_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
    let (_, after_scheme) = url.split_once("//")?;
    let (_, path) = after_scheme.split_once('/')?;

    match path {
        "" => None,
        rest => Some(rest),
    }
}
555
556 #[inline]
565 fn should_block_script_blocklist_only(&self, url: &str) -> bool {
566 let block_analytics = self.block_analytics;
568
569 if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
571 {
572 return true;
573 }
574
575 if crate::handler::blockers::block_websites::block_website(url) {
577 return true;
578 }
579
580 if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
587 let p_slash = Self::strip_query_fragment(path_with_slash);
589 let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
590
591 let base = match p_slash.rsplit('/').next() {
593 Some(b) => b,
594 None => p_slash,
595 };
596
597 if block_analytics && (base == "analytics.js" || p_noslash.ends_with("/analytics.js")) {
601 return true;
602 }
603
604 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
607 return true;
608 }
609 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
610 return true;
611 }
612 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
613 return true;
614 }
615
616 if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
619 return true;
620 }
621
622 if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
624 return true;
625 }
626 }
627
628 false
629 }
630
/// Returns the part of `url` starting at the first `/` that follows the first
/// `//`, i.e. the path including its leading slash (query and fragment
/// included). Returns `None` when `url` has no `//` or no `/` after it.
#[inline]
fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
    let (_, after_scheme) = url.split_once("//")?;

    after_scheme
        .find('/')
        .map(|slash| &after_scheme[slash..])
}
651
/// Cuts `s` at the first `?` or `#`, whichever comes first. Returns `s`
/// unchanged when it contains neither.
#[inline]
fn strip_query_fragment(s: &str) -> &str {
    match s.find(|c: char| c == '?' || c == '#') {
        Some(cut) => &s[..cut],
        None => s,
    }
}
668
669 #[inline]
671 fn skip_xhr(
672 &self,
673 skip_networking: bool,
674 event: &EventRequestPaused,
675 network_event: bool,
676 ) -> bool {
677 if !skip_networking && network_event {
679 let request_url = event.request.url.as_str();
680
681 let skip_analytics =
683 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
684
685 if skip_analytics {
686 true
687 } else if self.block_stylesheets || self.ignore_visuals {
688 let block_css = self.block_stylesheets;
689 let block_media = self.ignore_visuals;
690
691 let mut block_request = false;
692
693 if let Some(position) = request_url.rfind('.') {
694 let hlen = request_url.len();
695 let has_asset = hlen - position;
696
697 if has_asset >= 3 {
698 let next_position = position + 1;
699
700 if block_media
701 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
702 &request_url[next_position..].into(),
703 )
704 {
705 block_request = true;
706 } else if block_css {
707 block_request =
708 CaseInsensitiveString::from(request_url[next_position..].as_bytes())
709 .contains(&**CSS_EXTENSION)
710 }
711 }
712 }
713
714 if !block_request {
715 block_request = ignore_script_xhr_media(request_url);
716 }
717
718 block_request
719 } else {
720 skip_networking
721 }
722 } else {
723 skip_networking
724 }
725 }
726
/// With the `adblock` feature: keeps an existing skip decision, otherwise
/// asks the ad engine whether the request should be blocked.
#[cfg(feature = "adblock")]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    skip_networking || self.detect_ad(event)
}
737
738 #[cfg(not(feature = "adblock"))]
740 #[inline]
741 fn detect_ad_if_enabled(&mut self, _event: &EventRequestPaused, skip_networking: bool) -> bool {
742 skip_networking
743 }
744
745 #[inline]
746 fn fail_request_blocked(
748 &mut self,
749 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
750 ) {
751 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
752 request_id.clone(),
753 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
754 );
755 self.push_cdp_request(params);
756 }
757
758 #[inline]
759 fn fulfill_request_empty_200(
761 &mut self,
762 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
763 ) {
764 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
765 request_id.clone(),
766 200,
767 );
768 self.push_cdp_request(params);
769 }
770
/// Queues a `FulfillRequestParams` command that answers `request_id` with a
/// cached response: `status`, the given `headers`, and `body` encoded as
/// standard base64.
#[cfg(feature = "_cache")]
#[inline]
fn fulfill_request_from_cache(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    body: &[u8],
    headers: &std::collections::HashMap<String, String>,
    status: i64,
) {
    use crate::cdp::browser_protocol::fetch::HeaderEntry;
    use crate::handler::network::fetch::FulfillRequestParams;
    use base64::Engine;

    let response_headers: Vec<HeaderEntry> = headers
        .iter()
        .map(|(name, value)| HeaderEntry {
            name: name.clone().into(),
            value: value.clone().into(),
        })
        .collect();

    let encoded_body = base64::engine::general_purpose::STANDARD.encode(body);

    let mut params = FulfillRequestParams::new(request_id.clone(), status);
    params.body = Some(encoded_body.into());
    params.response_headers = Some(response_headers);

    self.push_cdp_request(params);
}
809
810 #[inline]
811 fn continue_request_with_url(
813 &mut self,
814 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
815 url: Option<&str>,
816 intercept_response: bool,
817 ) {
818 let mut params = ContinueRequestParams::new(request_id.clone());
819 if let Some(url) = url {
820 params.url = Some(url.to_string());
821 params.intercept_response = Some(intercept_response);
822 }
823 self.push_cdp_request(params);
824 }
825
826 #[inline]
828 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
829 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
830 return;
831 }
832
833 let resource_type = &event.resource_type;
834
835 if self.block_all {
836 tracing::debug!(
837 "Blocked (block_all): {:?} - {}",
838 event.resource_type,
839 event.request.url
840 );
841 return self.fail_request_blocked(&event.request_id);
842 }
843
844 if let Some(network_id) = event.network_id.as_ref() {
845 if let Some(request_will_be_sent) =
846 self.requests_will_be_sent.remove(network_id.as_ref())
847 {
848 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
849 } else {
850 self.request_id_to_interception_id
851 .insert(network_id.clone(), event.request_id.clone().into());
852 }
853 }
854
855 let javascript_resource = *resource_type == ResourceType::Script;
857 let document_resource = *resource_type == ResourceType::Document;
858 let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
859
860 let mut skip_networking =
862 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
863
864 if !skip_networking {
866 skip_networking = self.document_reload_tracker >= 3;
867 }
868
869 let (current_url_cow, had_replacer) =
871 self.handle_document_replacement_and_tracking(event, document_resource);
872
873 let current_url: &str = current_url_cow.as_ref();
874
875 if !skip_networking {
881 if self.xml_document && current_url.ends_with(".xsl") {
883 skip_networking = false;
884 } else {
885 skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
886 }
887 }
888
889 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
891
892 if !skip_networking
894 && (self.only_html || self.ignore_visuals)
895 && (javascript_resource || document_resource)
896 {
897 skip_networking = ignore_script_embedded(current_url);
898 }
899
900 if !skip_networking && javascript_resource {
903 skip_networking = self.should_block_script_blocklist_only(current_url);
904 }
905
906 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
908
909 if !skip_networking && (javascript_resource || network_resource || document_resource) {
911 skip_networking = self.intercept_manager.intercept_detection(
912 current_url,
913 self.ignore_visuals,
914 network_resource,
915 );
916 }
917
918 if !skip_networking && (javascript_resource || network_resource) {
920 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
921 }
922
923 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
926 {
927 skip_networking = false;
928 }
929
930 if skip_networking && self.is_whitelisted(current_url) {
932 skip_networking = false;
933 }
934
935 if skip_networking {
936 tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
937 self.fulfill_request_empty_200(&event.request_id);
938 } else {
939 #[cfg(feature = "_cache")]
940 {
941 if let (Some(policy), Some(cache_site_key)) =
942 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
943 {
944 let current_url = format!("{}:{}", event.request.method, ¤t_url);
945
946 if let Some((res, cache_policy)) =
947 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
948 {
949 if policy.allows_cached(&cache_policy) {
950 tracing::debug!(
951 "Remote Cached: {:?} - {}",
952 resource_type,
953 ¤t_url
954 );
955 return self.fulfill_request_from_cache(
956 &event.request_id,
957 &res.body,
958 &res.headers,
959 res.status as i64,
960 );
961 }
962 }
963 }
964 }
965
966 tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
968 self.continue_request_with_url(
969 &event.request_id,
970 if had_replacer {
971 Some(current_url)
972 } else {
973 None
974 },
975 !had_replacer,
976 );
977 }
978 }
979
980 #[inline]
986 fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
987 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
988 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
989 }
990
991 pub fn has_target_domain(&self) -> bool {
993 !self.document_target_url.is_empty()
994 }
995
996 pub fn set_page_url(&mut self, page_target_url: String) {
998 let host_base = host_and_rest(&page_target_url)
999 .map(|(h, _)| base_domain_from_host(h))
1000 .unwrap_or("");
1001
1002 self.document_target_domain = host_base.to_string();
1003 self.document_target_url = page_target_url;
1004 }
1005
1006 pub fn clear_target_domain(&mut self) {
1008 self.document_reload_tracker = 0;
1009 self.document_target_url = Default::default();
1010 self.document_target_domain = Default::default();
1011 }
1012
1013 #[inline]
1021 fn handle_document_replacement_and_tracking<'a>(
1022 &mut self,
1023 event: &'a EventRequestPaused,
1024 document_resource: bool,
1025 ) -> (Cow<'a, str>, bool) {
1026 let mut replacer: Option<String> = None;
1027 let current_url = event.request.url.as_str();
1028
1029 if document_resource {
1030 if self.document_target_url == current_url {
1031 self.document_reload_tracker += 1;
1032 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1033 {
1034 let (http_document_replacement, mut https_document_replacement) =
1035 if self.document_target_url.starts_with("http://") {
1036 (
1037 self.document_target_url.replacen("http://", "http//", 1),
1038 self.document_target_url.replacen("http://", "https://", 1),
1039 )
1040 } else {
1041 (
1042 self.document_target_url.replacen("https://", "https//", 1),
1043 self.document_target_url.replacen("https://", "http://", 1),
1044 )
1045 };
1046
1047 let trailing = https_document_replacement.ends_with('/');
1049 if trailing {
1050 https_document_replacement.pop();
1051 }
1052 if https_document_replacement.ends_with('/') {
1053 https_document_replacement.pop();
1054 }
1055
1056 let redirect_mask = format!(
1057 "{}{}",
1058 https_document_replacement, http_document_replacement
1059 );
1060
1061 if current_url == redirect_mask {
1062 replacer = Some(if trailing {
1063 format!("{}/", https_document_replacement)
1064 } else {
1065 https_document_replacement
1066 });
1067 }
1068 }
1069
1070 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1071 self.xml_document = true;
1072 }
1073
1074 self.document_target_url = event.request.url.clone();
1076 self.document_target_domain = host_and_rest(&self.document_target_url)
1077 .map(|(h, _)| base_domain_from_host(h).to_string())
1078 .unwrap_or_default();
1079 }
1080
1081 let current_url_cow = match replacer {
1082 Some(r) => Cow::Owned(r),
1083 None => Cow::Borrowed(event.request.url.as_str()),
1084 };
1085
1086 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1087 (current_url_cow, had_replacer)
1088 }
1089
/// Returns `true` when the ad-block engine matches the paused request.
///
/// Only image, media, stylesheet, document, fetch and XHR requests are
/// checked; every other resource type returns `false`. The engine is built
/// lazily from `spider_network_blocker::adblock::ADBLOCK_PATTERNS`.
#[cfg(feature = "adblock")]
pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
    use adblock::{
        lists::{FilterSet, ParseOptions, RuleTypes},
        Engine,
    };

    lazy_static::lazy_static! {
        static ref AD_ENGINE: Engine = {
            let mut filter_set = FilterSet::new(false);
            let mut rules = ParseOptions::default();
            rules.rule_types = RuleTypes::All;

            filter_set.add_filters(
                &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
                rules,
            );

            Engine::from_filter_set(filter_set, true)
        };
    };

    let blockable = matches!(
        event.resource_type,
        ResourceType::Image
            | ResourceType::Media
            | ResourceType::Stylesheet
            | ResourceType::Document
            | ResourceType::Fetch
            | ResourceType::Xhr
    );

    if !blockable {
        return false;
    }

    let request_type = event.resource_type.as_ref().to_lowercase();
    let third_party = !event.request.is_same_site.unwrap_or_default();

    // The hostname and source hostname are fixed placeholders.
    let request = adblock::request::Request::preparsed(
        event.request.url.as_str(),
        "example.com",
        "example.com",
        &request_type,
        third_party,
    );

    AD_ENGINE.check_network_request(&request).matched
}
1137
1138 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1139 let response = if self
1140 .attempted_authentications
1141 .contains(event.request_id.as_ref())
1142 {
1143 AuthChallengeResponseResponse::CancelAuth
1144 } else if self.credentials.is_some() {
1145 self.attempted_authentications
1146 .insert(event.request_id.clone().into());
1147 AuthChallengeResponseResponse::ProvideCredentials
1148 } else {
1149 AuthChallengeResponseResponse::Default
1150 };
1151
1152 let mut auth = AuthChallengeResponse::new(response);
1153 if let Some(creds) = self.credentials.clone() {
1154 auth.username = Some(creds.username);
1155 auth.password = Some(creds.password);
1156 }
1157 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1158 }
1159
1160 pub fn set_offline_mode(&mut self, value: bool) {
1162 if self.offline == value {
1163 return;
1164 }
1165 self.offline = value;
1166 if let Ok(network) = EmulateNetworkConditionsParams::builder()
1167 .offline(self.offline)
1168 .latency(0)
1169 .download_throughput(-1.)
1170 .upload_throughput(-1.)
1171 .build()
1172 {
1173 self.push_cdp_request(network);
1174 }
1175 }
1176
1177 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1179 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1180 if let Some(interception_id) = self
1181 .request_id_to_interception_id
1182 .remove(event.request_id.as_ref())
1183 {
1184 self.on_request(event, Some(interception_id));
1185 } else {
1186 self.requests_will_be_sent
1188 .insert(event.request_id.clone(), event.clone());
1189 }
1190 } else {
1191 self.on_request(event, None);
1192 }
1193 }
1194
1195 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1197 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1198 request.from_memory_cache = true;
1199 }
1200 }
1201
1202 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1204 let mut request_failed = false;
1205
1206 let mut deducted: u64 = 0;
1208
1209 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1210 let before = *max_bytes;
1211
1212 let received_bytes: u64 = event.response.encoded_data_length as u64;
1214
1215 let content_length: Option<u64> = event
1217 .response
1218 .headers
1219 .inner()
1220 .get("content-length")
1221 .and_then(|v| v.as_str())
1222 .and_then(|s| s.trim().parse::<u64>().ok());
1223
1224 *max_bytes = max_bytes.saturating_sub(received_bytes);
1226
1227 if let Some(cl) = content_length {
1229 if cl > *max_bytes {
1230 *max_bytes = 0;
1231 }
1232 }
1233
1234 request_failed = *max_bytes == 0;
1235
1236 deducted = before.saturating_sub(*max_bytes);
1238 }
1239
1240 if deducted > 0 {
1242 self.queued_events
1243 .push_back(NetworkEvent::BytesConsumed(deducted));
1244 }
1245
1246 if request_failed && self.max_bytes_allowed.is_some() {
1248 self.set_block_all(true);
1249 }
1250
1251 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1252 request.set_response(event.response.clone());
1253 self.queued_events.push_back(if request_failed {
1254 NetworkEvent::RequestFailed(request)
1255 } else {
1256 NetworkEvent::RequestFinished(request)
1257 });
1258 }
1259 }
1260
1261 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1263 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1264 if let Some(interception_id) = request.interception_id.as_ref() {
1265 self.attempted_authentications
1266 .remove(interception_id.as_ref());
1267 }
1268 self.queued_events
1269 .push_back(NetworkEvent::RequestFinished(request));
1270 }
1271 }
1272
1273 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1275 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1276 request.failure_text = Some(event.error_text.clone());
1277 if let Some(interception_id) = request.interception_id.as_ref() {
1278 self.attempted_authentications
1279 .remove(interception_id.as_ref());
1280 }
1281 self.queued_events
1282 .push_back(NetworkEvent::RequestFailed(request));
1283 }
1284 }
1285
1286 fn on_request(
1288 &mut self,
1289 event: &EventRequestWillBeSent,
1290 interception_id: Option<InterceptionId>,
1291 ) {
1292 let mut redirect_chain = Vec::new();
1293 let mut redirect_location = None;
1294
1295 if let Some(redirect_resp) = &event.redirect_response {
1296 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1297 if is_redirect_status(redirect_resp.status) {
1298 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1299 if redirect_resp.url != location {
1300 let fixed_location = location.replace(&redirect_resp.url, "");
1301
1302 if !fixed_location.is_empty() {
1303 request.response.as_mut().map(|resp| {
1304 resp.headers.0["Location"] =
1305 serde_json::Value::String(fixed_location.clone());
1306 });
1307 }
1308
1309 redirect_location = Some(fixed_location);
1310 }
1311 }
1312 }
1313
1314 self.handle_request_redirect(
1315 &mut request,
1316 if let Some(redirect_location) = redirect_location {
1317 let mut redirect_resp = redirect_resp.clone();
1318
1319 if !redirect_location.is_empty() {
1320 redirect_resp.headers.0["Location"] =
1321 serde_json::Value::String(redirect_location);
1322 }
1323
1324 redirect_resp
1325 } else {
1326 redirect_resp.clone()
1327 },
1328 );
1329
1330 redirect_chain = std::mem::take(&mut request.redirect_chain);
1331 redirect_chain.push(request);
1332 }
1333 }
1334
1335 let request = HttpRequest::new(
1336 event.request_id.clone(),
1337 event.frame_id.clone(),
1338 interception_id,
1339 self.user_request_interception_enabled,
1340 redirect_chain,
1341 );
1342
1343 self.requests.insert(event.request_id.clone(), request);
1344 self.queued_events
1345 .push_back(NetworkEvent::Request(event.request_id.clone()));
1346 }
1347
1348 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1350 request.set_response(response);
1351 if let Some(interception_id) = request.interception_id.as_ref() {
1352 self.attempted_authentications
1353 .remove(interception_id.as_ref());
1354 }
1355 }
1356}
1357
/// Events pushed onto `queued_events` by the network handlers.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP method identifier paired with its JSON payload.
    /// NOTE(review): presumably a command to forward to the browser — the
    /// producer is outside this view; confirm.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A new request was registered under this id (queued by `on_request`).
    Request(RequestId),
    /// Carries the id of a request.
    /// NOTE(review): presumably signals that a response arrived — no producer
    /// is visible in this section; confirm.
    Response(RequestId),
    /// The request failed: loading failed, or the byte budget was exhausted
    /// when its response was received.
    RequestFailed(HttpRequest),
    /// The request completed: loading finished, or its response was received
    /// within the byte budget.
    RequestFinished(HttpRequest),
    /// Number of bytes a received response deducted from the byte budget.
    BytesConsumed(u64),
}
1373
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// Page URL shared by the script-blocking tests.
    const FORUM_PAGE_URL: &str =
        "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085";

    /// Builds a manager constructed with `false` and a 30 second duration,
    /// with its page URL set to [`FORUM_PAGE_URL`].
    fn forum_page_manager() -> NetworkManager {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(FORUM_PAGE_URL.to_string());
        nm
    }

    // This module previously also contained `test_allowed_matcher_3rd_party_sanity`,
    // a line-for-line duplicate of the test below; it was removed.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Cloudflare challenge scripts must be allowed.
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Cloudflare Insights is not on the allow-list.
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // Other URLs that must match the allow-list.
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        // A script URL that is expected not to be blocked.
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !forum_page_manager().should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        // An analytics script URL that is expected to be blocked.
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            forum_page_manager().should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }
}