1use super::blockers::{
2 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3 xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15 EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17 InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18 SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21 fetch::{
22 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24 },
25 network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
42lazy_static! {
43 static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
45 "jquery", "angular",
47 "react", "vue", "bootstrap",
50 "d3",
51 "lodash",
52 "ajax",
53 "application",
54 "app", "main",
56 "index",
57 "bundle",
58 "vendor",
59 "runtime",
60 "polyfill",
61 "scripts",
62 "es2015.",
63 "es2020.",
64 "webpack",
65 "captcha",
66 "client",
67 "/cdn-cgi/challenge-platform/",
68 "/wp-content/js/", "https://m.stripe.network/",
71 "https://challenges.cloudflare.com/",
72 "https://www.google.com/recaptcha/enterprise.js",
73 "https://www.google.com/recaptcha/api.js",
74 "https://google.com/recaptcha/api.js",
75 "https://captcha.px-cloud.net/",
76 "https://cdn.auth0.com/js/lock/",
77 "https://captcha.gtimg.com",
78 "https://cdn.auth0.com/client",
79 "https://js.stripe.com/",
80 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
83 ];
84
85 pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
90
91 static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
93 "https://m.stripe.network/",
95 "https://challenges.cloudflare.com/",
96 "https://www.google.com/recaptcha/api.js",
97 "https://google.com/recaptcha/api.js",
98 "https://www.google.com/recaptcha/enterprise.js",
99 "https://js.stripe.com/",
100 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
103 "https://ct.captcha-delivery.com/",
104 "https://geo.captcha-delivery.com/captcha/",
105 "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://ct.captcha-delivery.com/",
107 "https://cdn.auth0.com/client",
108 "https://captcha.px-cloud.net/",
109 "https://www.gstatic.com/recaptcha/",
110 "https://www.google.com/recaptcha/api2/",
111 "https://www.recaptcha.net/recaptcha/",
112 "https://js.hcaptcha.com/1/api.js",
113 "https://hcaptcha.com/1/api.js",
114 "https://js.datadome.co/tags.js",
115 "https://api-js.datadome.co/",
116 "https://client.perimeterx.net/",
117 "https://captcha.px-cdn.net/",
118 "https://captcha.px-cloud.net/",
119 "https://s.perimeterx.net/",
120 "https://client-api.arkoselabs.com/v2/",
121 "https://static.geetest.com/v4/gt4.js",
122 "https://static.geetest.com/",
123 "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
124 "https://cdn.perfdrive.com/aperture/",
125 "https://assets.queue-it.net/",
126 "discourse-cdn.com/",
127 "/cdn-cgi/challenge-platform/",
128 "/_Incapsula_Resource"
129 ];
130
131 pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
133
134 pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
136 phf::phf_set! {
137 "_astro/", "_app/immutable"
139 }
140 };
141
142 pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
144 "application/pdf",
145 "application/zip",
146 "application/x-rar-compressed",
147 "application/x-tar",
148 "image/png",
149 "image/jpeg",
150 "image/gif",
151 "image/bmp",
152 "image/webp",
153 "image/svg+xml",
154 "video/mp4",
155 "video/x-msvideo",
156 "video/x-matroska",
157 "video/webm",
158 "audio/mpeg",
159 "audio/ogg",
160 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
161 "application/vnd.ms-excel",
162 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
163 "application/vnd.ms-powerpoint",
164 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
165 "application/x-7z-compressed",
166 "application/x-rpm",
167 "application/x-shockwave-flash",
168 "application/rtf",
169 };
170
171 pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
173 "Image",
174 "Media",
175 "Font"
176 };
177
178 pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
180 "CspViolationReport",
181 "Manifest",
182 "Other",
183 "Prefetch",
184 "Ping",
185 };
186
187 pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
189
190 pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
192 let enable = EnableParams::default();
193
194 if let Ok(c) = serde_json::to_value(&enable) {
195 vec![(enable.identifier(), c)]
196 } else {
197 vec![]
198 }
199 };
200
201 pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
203 let enable = EnableParams::default();
204 let mut v = vec![];
205 if let Ok(c) = serde_json::to_value(&enable) {
206 v.push((enable.identifier(), c));
207 }
208 let ignore = SetIgnoreCertificateErrorsParams::new(true);
209 if let Ok(ignored) = serde_json::to_value(&ignore) {
210 v.push((ignore.identifier(), ignored));
211 }
212
213 v
214 };
215
216 pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
218 fetch::EnableParams::builder()
219 .handle_auth_requests(true)
220 .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
221 .build()
222 };
223}
224
/// Returns `true` when `status` is one of the HTTP redirect codes
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];

    REDIRECT_CODES.contains(&status)
}
229
/// Tracks network state for a page and turns CDP network/fetch events into
/// queued [`NetworkEvent`]s and outgoing CDP commands.
#[derive(Debug)]
pub struct NetworkManager {
    /// Events waiting to be handed out by `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// When set, `init_commands` uses the chain that also ignores certificate errors.
    ignore_httpserrors: bool,
    /// Tracked requests keyed by request id; entries are removed when a response,
    /// loading-finished or loading-failed event arrives.
    requests: HashMap<RequestId, HttpRequest>,
    /// `requestWillBeSent` events parked until the matching fetch pause event arrives.
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers last passed to `set_extra_headers`.
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception ids of paused requests whose `requestWillBeSent` event has not arrived yet.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Value sent with `SetCacheDisabledParams`.
    user_cache_disabled: bool,
    /// Requests for which credentials were already provided once.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials used to answer auth challenges.
    credentials: Option<Credentials>,
    /// Whether the user asked for request interception.
    pub(crate) user_request_interception_enabled: bool,
    /// When true, every paused fetch request is failed with `BlockedByClient`.
    block_all: bool,
    /// Whether interception is considered active at the protocol level.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Last value passed to `set_offline_mode`.
    offline: bool,
    /// Timeout handed to the `CommandChain` built by `init_commands`.
    pub request_timeout: Duration,
    /// Block `Image`, `Media` and `Font` resources (and related XHR/script assets).
    pub ignore_visuals: bool,
    /// Block stylesheet resources.
    pub block_stylesheets: bool,
    /// Together with `only_html` or `ignore_visuals`, enables the embedded-script check.
    pub block_javascript: bool,
    /// Enable the analytics/tracker blocklists.
    pub block_analytics: bool,
    /// Only HTML is wanted; consulted together with `block_javascript`.
    pub only_html: bool,
    /// Set when the first document URL seen ends with `.xml`; exempts `.xsl` URLs
    /// from the visual/stylesheet block.
    pub xml_document: bool,
    /// Site-specific intercept rules.
    pub intercept_manager: NetworkInterceptManager,
    /// Number of times the current document URL was requested again; at 3 or more,
    /// further requests are marked for blocking.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request.
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`.
    pub document_target_domain: String,
    /// Remaining byte budget; `None` means no budget is enforced.
    pub max_bytes_allowed: Option<u64>,
    /// Key used to look up entries in the remote session cache.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// Policy deciding whether a cached entry may be served.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Raw whitelist substrings.
    whitelist_patterns: Vec<String>,
    /// Matcher compiled from `whitelist_patterns`; `None` when the list is empty
    /// or the build failed.
    whitelist_matcher: Option<AhoCorasick>,
    /// Raw blacklist substrings.
    blacklist_patterns: Vec<String>,
    /// Matcher compiled from `blacklist_patterns`; `None` when the list is empty
    /// or the build failed.
    blacklist_matcher: Option<AhoCorasick>,
    /// When true, a blacklist hit is applied last and cannot be undone by the
    /// whitelist; when false, it is applied early and later allow rules may override it.
    blacklist_strict: bool,
}
352
353impl NetworkManager {
354 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
356 Self {
357 queued_events: Default::default(),
358 ignore_httpserrors,
359 requests: Default::default(),
360 requests_will_be_sent: Default::default(),
361 extra_headers: Default::default(),
362 request_id_to_interception_id: Default::default(),
363 user_cache_disabled: false,
364 attempted_authentications: Default::default(),
365 credentials: None,
366 block_all: false,
367 user_request_interception_enabled: false,
368 protocol_request_interception_enabled: false,
369 offline: false,
370 request_timeout,
371 ignore_visuals: false,
372 block_javascript: false,
373 block_stylesheets: false,
374 block_analytics: true,
375 only_html: false,
376 xml_document: false,
377 intercept_manager: NetworkInterceptManager::Unknown,
378 document_reload_tracker: 0,
379 document_target_url: String::new(),
380 document_target_domain: String::new(),
381 whitelist_patterns: Vec::new(),
382 whitelist_matcher: None,
383 blacklist_patterns: Vec::new(),
384 blacklist_matcher: None,
385 blacklist_strict: true,
386 max_bytes_allowed: None,
387 #[cfg(feature = "_cache")]
388 cache_site_key: None,
389 #[cfg(feature = "_cache")]
390 cache_policy: None,
391 }
392 }
393
394 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
396 where
397 I: IntoIterator<Item = S>,
398 S: Into<String>,
399 {
400 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
401 self.rebuild_whitelist_matcher();
402 }
403
404 pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
406 where
407 I: IntoIterator<Item = S>,
408 S: Into<String>,
409 {
410 self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
411 self.rebuild_blacklist_matcher();
412 }
413
414 pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
416 self.blacklist_patterns.push(pattern.into());
417 self.rebuild_blacklist_matcher();
418 }
419
420 pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
422 where
423 I: IntoIterator<Item = S>,
424 S: Into<String>,
425 {
426 self.blacklist_patterns
427 .extend(patterns.into_iter().map(Into::into));
428 self.rebuild_blacklist_matcher();
429 }
430
431 pub fn clear_blacklist(&mut self) {
433 self.blacklist_patterns.clear();
434 self.blacklist_matcher = None;
435 }
436
437 pub fn set_blacklist_strict(&mut self, strict: bool) {
439 self.blacklist_strict = strict;
440 }
441
442 #[inline]
443 fn rebuild_blacklist_matcher(&mut self) {
444 if self.blacklist_patterns.is_empty() {
445 self.blacklist_matcher = None;
446 return;
447 }
448
449 let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
450 self.blacklist_matcher = AhoCorasick::new(refs).ok();
451 }
452
453 #[inline]
454 fn is_blacklisted(&self, url: &str) -> bool {
455 self.blacklist_matcher
456 .as_ref()
457 .map(|m| m.is_match(url))
458 .unwrap_or(false)
459 }
460
461 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
463 self.whitelist_patterns.push(pattern.into());
464 self.rebuild_whitelist_matcher();
465 }
466
467 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
469 where
470 I: IntoIterator<Item = S>,
471 S: Into<String>,
472 {
473 self.whitelist_patterns
474 .extend(patterns.into_iter().map(Into::into));
475 self.rebuild_whitelist_matcher();
476 }
477
478 #[inline]
479 fn rebuild_whitelist_matcher(&mut self) {
480 if self.whitelist_patterns.is_empty() {
481 self.whitelist_matcher = None;
482 return;
483 }
484
485 let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
486
487 self.whitelist_matcher = AhoCorasick::new(refs).ok();
489 }
490
491 #[inline]
492 fn is_whitelisted(&self, url: &str) -> bool {
493 self.whitelist_matcher
494 .as_ref()
495 .map(|m| m.is_match(url))
496 .unwrap_or(false)
497 }
498
499 pub fn init_commands(&self) -> CommandChain {
501 let cmds = if self.ignore_httpserrors {
502 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
503 } else {
504 INIT_CHAIN.clone()
505 };
506 CommandChain::new(cmds, self.request_timeout)
507 }
508
509 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
511 let method = cmd.identifier();
512 if let Ok(params) = serde_json::to_value(cmd) {
513 self.queued_events
514 .push_back(NetworkEvent::SendCdpRequest((method, params)));
515 }
516 }
517
518 pub fn poll(&mut self) -> Option<NetworkEvent> {
520 self.queued_events.pop_front()
521 }
522
523 pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
525 &self.extra_headers
526 }
527
528 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
530 self.extra_headers = headers;
531 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
532 self.extra_headers.remove("Proxy-Authorization");
533 if !self.extra_headers.is_empty() {
534 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
535 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
536 }
537 }
538 }
539
540 pub fn set_service_worker_enabled(&mut self, bypass: bool) {
541 self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
542 }
543
544 pub fn set_block_all(&mut self, block_all: bool) {
545 self.block_all = block_all;
546 }
547
548 pub fn set_request_interception(&mut self, enabled: bool) {
549 self.user_request_interception_enabled = enabled;
550 self.update_protocol_request_interception();
551 }
552
553 pub fn set_cache_enabled(&mut self, enabled: bool) {
554 let run = self.user_cache_disabled != !enabled;
555 self.user_cache_disabled = !enabled;
556 if run {
557 self.update_protocol_cache_disabled();
558 }
559 }
560
561 pub fn enable_request_intercept(&mut self) {
563 self.protocol_request_interception_enabled = true;
564 }
565
566 pub fn disable_request_intercept(&mut self) {
568 self.protocol_request_interception_enabled = false;
569 }
570
/// Sets (or clears, with `None`) the key used for remote session-cache lookups.
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    self.cache_site_key = cache_site_key;
}
576
/// Sets (or clears, with `None`) the policy that decides whether a cached entry may be served.
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    self.cache_policy = cache_policy;
}
582
583 pub fn update_protocol_cache_disabled(&mut self) {
584 self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
585 }
586
587 pub fn authenticate(&mut self, credentials: Credentials) {
588 self.credentials = Some(credentials);
589 self.update_protocol_request_interception();
590 self.protocol_request_interception_enabled = true;
591 }
592
593 fn update_protocol_request_interception(&mut self) {
594 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
595
596 if enabled == self.protocol_request_interception_enabled {
597 return;
598 }
599
600 if enabled {
601 self.push_cdp_request(ENABLE_FETCH.clone())
602 } else {
603 self.push_cdp_request(DisableParams::default())
604 }
605 }
606
607 #[inline]
610 fn should_block_script_blocklist_only(&self, url: &str) -> bool {
611 let block_analytics = self.block_analytics;
613
614 if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
616 {
617 return true;
618 }
619
620 if crate::handler::blockers::block_websites::block_website(url) {
622 return true;
623 }
624
625 if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
632 let p_slash = Self::strip_query_fragment(path_with_slash);
634 let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
635
636 let base = match p_slash.rsplit('/').next() {
638 Some(b) => b,
639 None => p_slash,
640 };
641
642 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
645 return true;
646 }
647 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
648 return true;
649 }
650 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
651 return true;
652 }
653
654 if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
657 return true;
658 }
659
660 if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
662 return true;
663 }
664 }
665
666 false
667 }
668
/// Returns the part of `url` starting at the first `/` that follows the first `//`.
///
/// The returned slice still contains any query string or fragment.
/// Returns `None` when `url` has no `//` or no `/` after it.
#[inline]
fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
    let authority_start = url.find("//")? + 2;

    url[authority_start..]
        .find('/')
        .map(|offset| &url[authority_start + offset..])
}
689
/// Returns `s` cut off just before the first `?` or `#`, whichever comes first.
///
/// `s` is returned unchanged when it contains neither character.
#[inline]
fn strip_query_fragment(s: &str) -> &str {
    match s.find(|c: char| c == '?' || c == '#') {
        Some(cut) => &s[..cut],
        None => s,
    }
}
706
707 #[inline]
709 fn skip_xhr(
710 &self,
711 skip_networking: bool,
712 event: &EventRequestPaused,
713 network_event: bool,
714 ) -> bool {
715 if !skip_networking && network_event {
717 let request_url = event.request.url.as_str();
718
719 let skip_analytics =
721 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
722
723 if skip_analytics {
724 true
725 } else if self.block_stylesheets || self.ignore_visuals {
726 let block_css = self.block_stylesheets;
727 let block_media = self.ignore_visuals;
728
729 let mut block_request = false;
730
731 if let Some(position) = request_url.rfind('.') {
732 let hlen = request_url.len();
733 let has_asset = hlen - position;
734
735 if has_asset >= 3 {
736 let next_position = position + 1;
737
738 if block_media
739 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
740 &request_url[next_position..].into(),
741 )
742 {
743 block_request = true;
744 } else if block_css {
745 block_request =
746 CaseInsensitiveString::from(request_url[next_position..].as_bytes())
747 .contains(&**CSS_EXTENSION)
748 }
749 }
750 }
751
752 if !block_request {
753 block_request = ignore_script_xhr_media(request_url);
754 }
755
756 block_request
757 } else {
758 skip_networking
759 }
760 } else {
761 skip_networking
762 }
763 }
764
/// Returns `true` when the request should be skipped: either it was already
/// marked (`skip_networking`), the URL matches the ad blocklist, or the adblock
/// engine flags it.
///
/// `block_ads` is imported locally, mirroring the non-`adblock` variant of this
/// method, so this variant does not depend on a module-level import.
#[cfg(feature = "adblock")]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    use crate::handler::blockers::block_websites::block_ads;

    if skip_networking {
        true
    } else {
        block_ads(&event.request.url) || self.detect_ad(event)
    }
}
775
776 #[cfg(not(feature = "adblock"))]
778 #[inline]
779 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
780 use crate::handler::blockers::block_websites::block_ads;
781 if skip_networking {
782 true
783 } else {
784 block_ads(&event.request.url)
785 }
786 }
787
788 #[inline]
789 fn fail_request_blocked(
791 &mut self,
792 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
793 ) {
794 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
795 request_id.clone(),
796 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
797 );
798 self.push_cdp_request(params);
799 }
800
801 #[inline]
802 fn fulfill_request_empty_200(
804 &mut self,
805 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
806 ) {
807 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
808 request_id.clone(),
809 200,
810 );
811 self.push_cdp_request(params);
812 }
813
/// Queues a `FulfillRequestParams` command that answers the paused request
/// from cached data.
///
/// `body` is sent base64-encoded (standard alphabet); every entry of `headers`
/// becomes one response header; `status` is used as the response code.
#[cfg(feature = "_cache")]
#[inline]
fn fulfill_request_from_cache(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    body: &[u8],
    headers: &std::collections::HashMap<String, String>,
    status: i64,
) {
    use crate::cdp::browser_protocol::fetch::HeaderEntry;
    use crate::handler::network::fetch::FulfillRequestParams;
    use base64::Engine;

    let resp_headers: Vec<HeaderEntry> = headers
        .iter()
        .map(|(name, value)| HeaderEntry {
            name: name.clone().into(),
            value: value.clone().into(),
        })
        .collect();

    let encoded_body = base64::engine::general_purpose::STANDARD.encode(body);

    let mut params = FulfillRequestParams::new(request_id.clone(), status);
    params.response_headers = Some(resp_headers);
    params.body = Some(encoded_body.into());

    self.push_cdp_request(params);
}
852
853 #[inline]
854 fn continue_request_with_url(
856 &mut self,
857 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
858 url: Option<&str>,
859 intercept_response: bool,
860 ) {
861 let mut params = ContinueRequestParams::new(request_id.clone());
862 if let Some(url) = url {
863 params.url = Some(url.to_string());
864 params.intercept_response = Some(intercept_response);
865 }
866 self.push_cdp_request(params);
867 }
868
869 #[inline]
871 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
872 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
873 return;
874 }
875
876 let resource_type = &event.resource_type;
877
878 if self.block_all {
879 tracing::debug!(
880 "Blocked (block_all): {:?} - {}",
881 event.resource_type,
882 event.request.url
883 );
884 return self.fail_request_blocked(&event.request_id);
885 }
886
887 if let Some(network_id) = event.network_id.as_ref() {
888 if let Some(request_will_be_sent) =
889 self.requests_will_be_sent.remove(network_id.as_ref())
890 {
891 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
892 } else {
893 self.request_id_to_interception_id
894 .insert(network_id.clone(), event.request_id.clone().into());
895 }
896 }
897
898 let javascript_resource = *resource_type == ResourceType::Script;
900 let document_resource = *resource_type == ResourceType::Document;
901 let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
902
903 let mut skip_networking =
905 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
906
907 if !skip_networking {
909 skip_networking = self.document_reload_tracker >= 3;
910 }
911
912 let (current_url_cow, had_replacer) =
914 self.handle_document_replacement_and_tracking(event, document_resource);
915
916 let current_url: &str = current_url_cow.as_ref();
917
918 let blacklisted = self.is_blacklisted(current_url);
919
920 if !self.blacklist_strict && blacklisted {
921 skip_networking = true;
922 }
923
924 if !skip_networking {
925 if self.xml_document && current_url.ends_with(".xsl") {
927 skip_networking = false;
928 } else {
929 skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
930 }
931 }
932
933 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
934
935 if !skip_networking
937 && self.block_javascript
938 && (self.only_html || self.ignore_visuals)
939 && (javascript_resource || document_resource)
940 {
941 skip_networking = ignore_script_embedded(current_url);
942 }
943
944 if !skip_networking && javascript_resource {
947 skip_networking = self.should_block_script_blocklist_only(current_url);
948 }
949
950 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
952
953 if !skip_networking && (javascript_resource || network_resource || document_resource) {
955 skip_networking = self.intercept_manager.intercept_detection(
956 current_url,
957 self.ignore_visuals,
958 network_resource,
959 );
960 }
961
962 if !skip_networking && (javascript_resource || network_resource) {
964 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
965 }
966
967 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
970 {
971 skip_networking = false;
972 }
973
974 if skip_networking && self.is_whitelisted(current_url) {
976 skip_networking = false;
977 }
978
979 if self.blacklist_strict && blacklisted {
980 skip_networking = true;
981 }
982
983 if skip_networking {
984 tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
985 self.fulfill_request_empty_200(&event.request_id);
986 } else {
987 #[cfg(feature = "_cache")]
988 {
989 if let (Some(policy), Some(cache_site_key)) =
990 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
991 {
992 let current_url = format!("{}:{}", event.request.method, ¤t_url);
993
994 if let Some((res, cache_policy)) =
995 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
996 {
997 if policy.allows_cached(&cache_policy) {
998 tracing::debug!(
999 "Remote Cached: {:?} - {}",
1000 resource_type,
1001 ¤t_url
1002 );
1003 return self.fulfill_request_from_cache(
1004 &event.request_id,
1005 &res.body,
1006 &res.headers,
1007 res.status as i64,
1008 );
1009 }
1010 }
1011 }
1012 }
1013
1014 tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
1016 self.continue_request_with_url(
1017 &event.request_id,
1018 if had_replacer {
1019 Some(current_url)
1020 } else {
1021 None
1022 },
1023 !had_replacer,
1024 );
1025 }
1026 }
1027
1028 #[inline]
1034 fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1035 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1036 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1037 }
1038
1039 pub fn has_target_domain(&self) -> bool {
1041 !self.document_target_url.is_empty()
1042 }
1043
1044 pub fn set_page_url(&mut self, page_target_url: String) {
1046 let host_base = host_and_rest(&page_target_url)
1047 .map(|(h, _)| base_domain_from_host(h))
1048 .unwrap_or("");
1049
1050 self.document_target_domain = host_base.to_string();
1051 self.document_target_url = page_target_url;
1052 }
1053
1054 pub fn clear_target_domain(&mut self) {
1056 self.document_reload_tracker = 0;
1057 self.document_target_url = Default::default();
1058 self.document_target_domain = Default::default();
1059 }
1060
1061 #[inline]
1069 fn handle_document_replacement_and_tracking<'a>(
1070 &mut self,
1071 event: &'a EventRequestPaused,
1072 document_resource: bool,
1073 ) -> (Cow<'a, str>, bool) {
1074 let mut replacer: Option<String> = None;
1075 let current_url = event.request.url.as_str();
1076
1077 if document_resource {
1078 if self.document_target_url == current_url {
1079 self.document_reload_tracker += 1;
1080 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1081 {
1082 let (http_document_replacement, mut https_document_replacement) =
1083 if self.document_target_url.starts_with("http://") {
1084 (
1085 self.document_target_url.replacen("http://", "http//", 1),
1086 self.document_target_url.replacen("http://", "https://", 1),
1087 )
1088 } else {
1089 (
1090 self.document_target_url.replacen("https://", "https//", 1),
1091 self.document_target_url.replacen("https://", "http://", 1),
1092 )
1093 };
1094
1095 let trailing = https_document_replacement.ends_with('/');
1097 if trailing {
1098 https_document_replacement.pop();
1099 }
1100 if https_document_replacement.ends_with('/') {
1101 https_document_replacement.pop();
1102 }
1103
1104 let redirect_mask = format!(
1105 "{}{}",
1106 https_document_replacement, http_document_replacement
1107 );
1108
1109 if current_url == redirect_mask {
1110 replacer = Some(if trailing {
1111 format!("{}/", https_document_replacement)
1112 } else {
1113 https_document_replacement
1114 });
1115 }
1116 }
1117
1118 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1119 self.xml_document = true;
1120 }
1121
1122 self.document_target_url = event.request.url.clone();
1124 self.document_target_domain = host_and_rest(&self.document_target_url)
1125 .map(|(h, _)| base_domain_from_host(h).to_string())
1126 .unwrap_or_default();
1127 }
1128
1129 let current_url_cow = match replacer {
1130 Some(r) => Cow::Owned(r),
1131 None => Cow::Borrowed(event.request.url.as_str()),
1132 };
1133
1134 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1135 (current_url_cow, had_replacer)
1136 }
1137
/// Runs the request through the adblock engine and returns whether it matched.
///
/// Only Image, Media, Stylesheet, Document, Fetch and Xhr resources are
/// checked; every other type returns `false` without consulting the engine.
/// The engine is built once from `ADBLOCK_PATTERNS`. Both hostname arguments
/// passed to the engine are the fixed placeholder `example.com`.
#[cfg(feature = "adblock")]
pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
    use adblock::{
        lists::{FilterSet, ParseOptions, RuleTypes},
        Engine,
    };

    lazy_static::lazy_static! {
        static ref AD_ENGINE: Engine = {
            let mut filter_set = FilterSet::new(false);
            let mut rules = ParseOptions::default();
            rules.rule_types = RuleTypes::All;

            filter_set.add_filters(
                &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
                rules,
            );

            Engine::from_filter_set(filter_set, true)
        };
    };

    let blockable = matches!(
        event.resource_type,
        ResourceType::Image
            | ResourceType::Media
            | ResourceType::Stylesheet
            | ResourceType::Document
            | ResourceType::Fetch
            | ResourceType::Xhr
    );

    if !blockable {
        return false;
    }

    let resource_kind = event.resource_type.as_ref().to_lowercase();
    let third_party = !event.request.is_same_site.unwrap_or_default();

    let request = adblock::request::Request::preparsed(
        event.request.url.as_str(),
        "example.com",
        "example.com",
        &resource_kind,
        third_party,
    );

    AD_ENGINE.check_network_request(&request).matched
}
1185
1186 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1187 let response = if self
1188 .attempted_authentications
1189 .contains(event.request_id.as_ref())
1190 {
1191 AuthChallengeResponseResponse::CancelAuth
1192 } else if self.credentials.is_some() {
1193 self.attempted_authentications
1194 .insert(event.request_id.clone().into());
1195 AuthChallengeResponseResponse::ProvideCredentials
1196 } else {
1197 AuthChallengeResponseResponse::Default
1198 };
1199
1200 let mut auth = AuthChallengeResponse::new(response);
1201 if let Some(creds) = self.credentials.clone() {
1202 auth.username = Some(creds.username);
1203 auth.password = Some(creds.password);
1204 }
1205 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1206 }
1207
1208 pub fn set_offline_mode(&mut self, value: bool) {
1210 if self.offline == value {
1211 return;
1212 }
1213 self.offline = value;
1214 if let Ok(network) = EmulateNetworkConditionsParams::builder()
1215 .offline(self.offline)
1216 .latency(0)
1217 .download_throughput(-1.)
1218 .upload_throughput(-1.)
1219 .build()
1220 {
1221 self.push_cdp_request(network);
1222 }
1223 }
1224
1225 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1227 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1228 if let Some(interception_id) = self
1229 .request_id_to_interception_id
1230 .remove(event.request_id.as_ref())
1231 {
1232 self.on_request(event, Some(interception_id));
1233 } else {
1234 self.requests_will_be_sent
1236 .insert(event.request_id.clone(), event.clone());
1237 }
1238 } else {
1239 self.on_request(event, None);
1240 }
1241 }
1242
1243 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1245 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1246 request.from_memory_cache = true;
1247 }
1248 }
1249
1250 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1252 let mut request_failed = false;
1253
1254 let mut deducted: u64 = 0;
1256
1257 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1258 let before = *max_bytes;
1259
1260 let received_bytes: u64 = event.response.encoded_data_length as u64;
1262
1263 let content_length: Option<u64> = event
1265 .response
1266 .headers
1267 .inner()
1268 .get("content-length")
1269 .and_then(|v| v.as_str())
1270 .and_then(|s| s.trim().parse::<u64>().ok());
1271
1272 *max_bytes = max_bytes.saturating_sub(received_bytes);
1274
1275 if let Some(cl) = content_length {
1277 if cl > *max_bytes {
1278 *max_bytes = 0;
1279 }
1280 }
1281
1282 request_failed = *max_bytes == 0;
1283
1284 deducted = before.saturating_sub(*max_bytes);
1286 }
1287
1288 if deducted > 0 {
1290 self.queued_events
1291 .push_back(NetworkEvent::BytesConsumed(deducted));
1292 }
1293
1294 if request_failed && self.max_bytes_allowed.is_some() {
1296 self.set_block_all(true);
1297 }
1298
1299 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1300 request.set_response(event.response.clone());
1301 self.queued_events.push_back(if request_failed {
1302 NetworkEvent::RequestFailed(request)
1303 } else {
1304 NetworkEvent::RequestFinished(request)
1305 });
1306 }
1307 }
1308
1309 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1311 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1312 if let Some(interception_id) = request.interception_id.as_ref() {
1313 self.attempted_authentications
1314 .remove(interception_id.as_ref());
1315 }
1316 self.queued_events
1317 .push_back(NetworkEvent::RequestFinished(request));
1318 }
1319 }
1320
1321 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1323 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1324 request.failure_text = Some(event.error_text.clone());
1325 if let Some(interception_id) = request.interception_id.as_ref() {
1326 self.attempted_authentications
1327 .remove(interception_id.as_ref());
1328 }
1329 self.queued_events
1330 .push_back(NetworkEvent::RequestFailed(request));
1331 }
1332 }
1333
1334 fn on_request(
1336 &mut self,
1337 event: &EventRequestWillBeSent,
1338 interception_id: Option<InterceptionId>,
1339 ) {
1340 let mut redirect_chain = Vec::new();
1341 let mut redirect_location = None;
1342
1343 if let Some(redirect_resp) = &event.redirect_response {
1344 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1345 if is_redirect_status(redirect_resp.status) {
1346 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1347 if redirect_resp.url != location {
1348 let fixed_location = location.replace(&redirect_resp.url, "");
1349
1350 if !fixed_location.is_empty() {
1351 request.response.as_mut().map(|resp| {
1352 resp.headers.0["Location"] =
1353 serde_json::Value::String(fixed_location.clone());
1354 });
1355 }
1356
1357 redirect_location = Some(fixed_location);
1358 }
1359 }
1360 }
1361
1362 self.handle_request_redirect(
1363 &mut request,
1364 if let Some(redirect_location) = redirect_location {
1365 let mut redirect_resp = redirect_resp.clone();
1366
1367 if !redirect_location.is_empty() {
1368 redirect_resp.headers.0["Location"] =
1369 serde_json::Value::String(redirect_location);
1370 }
1371
1372 redirect_resp
1373 } else {
1374 redirect_resp.clone()
1375 },
1376 );
1377
1378 redirect_chain = std::mem::take(&mut request.redirect_chain);
1379 redirect_chain.push(request);
1380 }
1381 }
1382
1383 let request = HttpRequest::new(
1384 event.request_id.clone(),
1385 event.frame_id.clone(),
1386 interception_id,
1387 self.user_request_interception_enabled,
1388 redirect_chain,
1389 );
1390
1391 self.requests.insert(event.request_id.clone(), request);
1392 self.queued_events
1393 .push_back(NetworkEvent::Request(event.request_id.clone()));
1394 }
1395
1396 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1398 request.set_response(response);
1399 if let Some(interception_id) = request.interception_id.as_ref() {
1400 self.attempted_authentications
1401 .remove(interception_id.as_ref());
1402 }
1403 }
1404}
1405
/// Events queued by the network manager (see the `queued_events.push_back`
/// calls in the handlers) for its owner to consume.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP command as a method id plus JSON payload.
    /// NOTE(review): presumably produced by `push_cdp_request` — confirm.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request with this id has been registered in the request table.
    Request(RequestId),
    /// NOTE(review): not emitted by the handlers in this part of the file;
    /// presumably signals a response for the given request id — confirm.
    Response(RequestId),
    /// The request failed: either loading failed (`failure_text` is set) or
    /// the byte budget was exhausted when its response arrived.
    RequestFailed(HttpRequest),
    /// The request completed: emitted on response receipt or loading-finished.
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the configured byte budget by a response.
    BytesConsumed(u64),
}
1421
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// The third-party allow-list matches known-required scripts and does not
    /// match an analytics beacon.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        // Cloudflare challenge script must be allowed.
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        // Cloudflare Insights beacon is not part of the allow-list.
        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        // A few more entries that are expected to be allow-listed.
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// A script URL that hits no blocklist entry is not blocked.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    /// A script URL that matches the blocklist is blocked.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    /// Runtime blacklist patterns match URLs containing them and nothing else.
    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    /// With the same pattern on both lists and strict mode on, both matchers
    /// report a hit and the strict flag is set.
    ///
    /// Only the matchers and the flag are checked here; the precedence
    /// decision itself is not exercised by this test.
    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        assert!(nm.blacklist_strict);
    }

    /// With the same pattern on both lists and strict mode off, both matchers
    /// report a hit and the strict flag is cleared.
    ///
    /// Only the matchers and the flag are checked here; the override decision
    /// itself is not exercised by this test.
    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}