1use super::blockers::{
2 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3 xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15 EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17 InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18 SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21 fetch::{
22 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24 },
25 network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
42lazy_static! {
43 static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
45 "jquery", "angular",
47 "react", "vue", "bootstrap",
50 "d3",
51 "lodash",
52 "ajax",
53 "application",
54 "app", "main",
56 "index",
57 "bundle",
58 "vendor",
59 "runtime",
60 "polyfill",
61 "scripts",
62 "es2015.",
63 "es2020.",
64 "webpack",
65 "captcha",
66 "client",
67 "/cdn-cgi/challenge-platform/",
68 "/wp-content/js/", "https://m.stripe.network/",
71 "https://challenges.cloudflare.com/",
72 "https://www.google.com/recaptcha/",
73 "https://google.com/recaptcha/api.js",
74 "https://www.gstatic.com/recaptcha/",
75 "https://captcha.px-cloud.net/",
76 "https://geo.captcha-delivery.com/",
77 "https://api.leminnow.com/captcha/",
78 "https://cdn.auth0.com/js/lock/",
79 "https://captcha.gtimg.com",
80 "https://client-api.arkoselabs.com/",
81 "https://www.capy.me/puzzle/",
82 "https://newassets.hcaptcha.com/",
83 "https://cdn.auth0.com/client",
84 "https://js.stripe.com/",
85 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
88 ];
89
90 pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
95
96 static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
98 "https://m.stripe.network/",
100 "https://challenges.cloudflare.com/",
101 "https://js.stripe.com/",
102 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
105 "https://ct.captcha-delivery.com/",
106 "https://geo.captcha-delivery.com/",
107 "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://cdn.auth0.com/client",
109 "https://captcha.px-cloud.net/",
110 "https://www.capy.me/puzzle/",
111 "https://www.gstatic.com/recaptcha/",
112 "https://google.com/recaptcha/",
113 "https://www.google.com/recaptcha/",
114 "https://www.recaptcha.net/recaptcha/",
115 "https://js.hcaptcha.com/1/api.js",
116 "https://hcaptcha.com/1/api.js",
117 "https://js.datadome.co/tags.js",
118 "https://api-js.datadome.co/",
119 "https://client.perimeterx.net/",
120 "https://captcha.px-cdn.net/",
121 "https://newassets.hcaptcha.com/",
122 "https://captcha.px-cloud.net/",
123 "https://s.perimeterx.net/",
124 "https://api.leminnow.com/captcha/",
125 "https://client-api.arkoselabs.com/",
126 "https://static.geetest.com/v4/gt4.js",
127 "https://static.geetest.com/",
128 "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
129 "https://cdn.perfdrive.com/aperture/",
130 "https://assets.queue-it.net/",
131 "discourse-cdn.com/",
132 "hcaptcha.com",
133 "/cdn-cgi/challenge-platform/",
134 "/_Incapsula_Resource"
135 ];
136
137 pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
139
140 pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
142 phf::phf_set! {
143 "_astro/", "_app/immutable"
145 }
146 };
147
148 pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
150 "application/pdf",
151 "application/zip",
152 "application/x-rar-compressed",
153 "application/x-tar",
154 "image/png",
155 "image/jpeg",
156 "image/gif",
157 "image/bmp",
158 "image/webp",
159 "image/svg+xml",
160 "video/mp4",
161 "video/x-msvideo",
162 "video/x-matroska",
163 "video/webm",
164 "audio/mpeg",
165 "audio/ogg",
166 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
167 "application/vnd.ms-excel",
168 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
169 "application/vnd.ms-powerpoint",
170 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
171 "application/x-7z-compressed",
172 "application/x-rpm",
173 "application/x-shockwave-flash",
174 "application/rtf",
175 };
176
177 pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
179 "Image",
180 "Media",
181 "Font"
182 };
183
184 pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
186 "CspViolationReport",
187 "Ping",
188 };
189
190 pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
192
193 pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
195 let enable = EnableParams::default();
196
197 if let Ok(c) = serde_json::to_value(&enable) {
198 vec![(enable.identifier(), c)]
199 } else {
200 vec![]
201 }
202 };
203
204 pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
206 let enable = EnableParams::default();
207 let mut v = vec![];
208 if let Ok(c) = serde_json::to_value(&enable) {
209 v.push((enable.identifier(), c));
210 }
211 let ignore = SetIgnoreCertificateErrorsParams::new(true);
212 if let Ok(ignored) = serde_json::to_value(&ignore) {
213 v.push((ignore.identifier(), ignored));
214 }
215
216 v
217 };
218
219 pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
221 fetch::EnableParams::builder()
222 .handle_auth_requests(true)
223 .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
224 .build()
225 };
226}
227
/// Returns `true` when `status` is one of the redirect codes
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
232
233#[derive(Debug)]
234pub struct NetworkManager {
236 queued_events: VecDeque<NetworkEvent>,
242 ignore_httpserrors: bool,
247 requests: HashMap<RequestId, HttpRequest>,
252 requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
259 extra_headers: std::collections::HashMap<String, String>,
264 request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
270 user_cache_disabled: bool,
275 attempted_authentications: HashSet<RequestId>,
281 credentials: Option<Credentials>,
286 pub(crate) user_request_interception_enabled: bool,
295 block_all: bool,
302 pub(crate) protocol_request_interception_enabled: bool,
308 offline: bool,
310 pub request_timeout: Duration,
312 pub ignore_visuals: bool,
315 pub block_stylesheets: bool,
317 pub block_javascript: bool,
322 pub block_analytics: bool,
324 pub block_prefetch: bool,
326 pub only_html: bool,
328 pub xml_document: bool,
330 pub intercept_manager: NetworkInterceptManager,
332 pub document_reload_tracker: u8,
334 pub document_target_url: String,
336 pub document_target_domain: String,
338 pub max_bytes_allowed: Option<u64>,
340 #[cfg(feature = "_cache")]
341 pub cache_site_key: Option<String>,
343 #[cfg(feature = "_cache")]
345 pub cache_policy: Option<BasicCachePolicy>,
346 whitelist_patterns: Vec<String>,
348 whitelist_matcher: Option<AhoCorasick>,
350 blacklist_patterns: Vec<String>,
352 blacklist_matcher: Option<AhoCorasick>,
354 blacklist_strict: bool,
356}
357
358impl NetworkManager {
359 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
361 Self {
362 queued_events: Default::default(),
363 ignore_httpserrors,
364 requests: Default::default(),
365 requests_will_be_sent: Default::default(),
366 extra_headers: Default::default(),
367 request_id_to_interception_id: Default::default(),
368 user_cache_disabled: false,
369 attempted_authentications: Default::default(),
370 credentials: None,
371 block_all: false,
372 user_request_interception_enabled: false,
373 protocol_request_interception_enabled: false,
374 offline: false,
375 request_timeout,
376 ignore_visuals: false,
377 block_javascript: false,
378 block_stylesheets: false,
379 block_prefetch: true,
380 block_analytics: true,
381 only_html: false,
382 xml_document: false,
383 intercept_manager: NetworkInterceptManager::Unknown,
384 document_reload_tracker: 0,
385 document_target_url: String::new(),
386 document_target_domain: String::new(),
387 whitelist_patterns: Vec::new(),
388 whitelist_matcher: None,
389 blacklist_patterns: Vec::new(),
390 blacklist_matcher: None,
391 blacklist_strict: true,
392 max_bytes_allowed: None,
393 #[cfg(feature = "_cache")]
394 cache_site_key: None,
395 #[cfg(feature = "_cache")]
396 cache_policy: None,
397 }
398 }
399
400 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
402 where
403 I: IntoIterator<Item = S>,
404 S: Into<String>,
405 {
406 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
407 self.rebuild_whitelist_matcher();
408 }
409
410 pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
412 where
413 I: IntoIterator<Item = S>,
414 S: Into<String>,
415 {
416 self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
417 self.rebuild_blacklist_matcher();
418 }
419
420 pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
422 self.blacklist_patterns.push(pattern.into());
423 self.rebuild_blacklist_matcher();
424 }
425
426 pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
428 where
429 I: IntoIterator<Item = S>,
430 S: Into<String>,
431 {
432 self.blacklist_patterns
433 .extend(patterns.into_iter().map(Into::into));
434 self.rebuild_blacklist_matcher();
435 }
436
437 pub fn clear_blacklist(&mut self) {
439 self.blacklist_patterns.clear();
440 self.blacklist_matcher = None;
441 }
442
443 pub fn set_blacklist_strict(&mut self, strict: bool) {
445 self.blacklist_strict = strict;
446 }
447
448 #[inline]
449 fn rebuild_blacklist_matcher(&mut self) {
450 if self.blacklist_patterns.is_empty() {
451 self.blacklist_matcher = None;
452 return;
453 }
454
455 let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
456 self.blacklist_matcher = AhoCorasick::new(refs).ok();
457 }
458
459 #[inline]
460 fn is_blacklisted(&self, url: &str) -> bool {
461 self.blacklist_matcher
462 .as_ref()
463 .map(|m| m.is_match(url))
464 .unwrap_or(false)
465 }
466
467 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
469 self.whitelist_patterns.push(pattern.into());
470 self.rebuild_whitelist_matcher();
471 }
472
473 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
475 where
476 I: IntoIterator<Item = S>,
477 S: Into<String>,
478 {
479 self.whitelist_patterns
480 .extend(patterns.into_iter().map(Into::into));
481 self.rebuild_whitelist_matcher();
482 }
483
484 #[inline]
485 fn rebuild_whitelist_matcher(&mut self) {
486 if self.whitelist_patterns.is_empty() {
487 self.whitelist_matcher = None;
488 return;
489 }
490
491 let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
492
493 self.whitelist_matcher = AhoCorasick::new(refs).ok();
495 }
496
497 #[inline]
498 fn is_whitelisted(&self, url: &str) -> bool {
499 self.whitelist_matcher
500 .as_ref()
501 .map(|m| m.is_match(url))
502 .unwrap_or(false)
503 }
504
505 pub fn init_commands(&self) -> CommandChain {
507 let cmds = if self.ignore_httpserrors {
508 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
509 } else {
510 INIT_CHAIN.clone()
511 };
512 CommandChain::new(cmds, self.request_timeout)
513 }
514
515 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
517 let method = cmd.identifier();
518 if let Ok(params) = serde_json::to_value(cmd) {
519 self.queued_events
520 .push_back(NetworkEvent::SendCdpRequest((method, params)));
521 }
522 }
523
524 pub fn poll(&mut self) -> Option<NetworkEvent> {
526 self.queued_events.pop_front()
527 }
528
529 pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
531 &self.extra_headers
532 }
533
534 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
536 self.extra_headers = headers;
537 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
538 self.extra_headers.remove("Proxy-Authorization");
539 if !self.extra_headers.is_empty() {
540 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
541 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
542 }
543 }
544 }
545
546 pub fn set_service_worker_enabled(&mut self, bypass: bool) {
547 self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
548 }
549
550 pub fn set_block_all(&mut self, block_all: bool) {
551 self.block_all = block_all;
552 }
553
554 pub fn set_request_interception(&mut self, enabled: bool) {
555 self.user_request_interception_enabled = enabled;
556 self.update_protocol_request_interception();
557 }
558
559 pub fn set_cache_enabled(&mut self, enabled: bool) {
560 let run = self.user_cache_disabled != !enabled;
561 self.user_cache_disabled = !enabled;
562 if run {
563 self.update_protocol_cache_disabled();
564 }
565 }
566
567 pub fn enable_request_intercept(&mut self) {
569 self.protocol_request_interception_enabled = true;
570 }
571
572 pub fn disable_request_intercept(&mut self) {
574 self.protocol_request_interception_enabled = false;
575 }
576
/// Sets (or clears, with `None`) the key used for session-cache lookups.
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    self.cache_site_key = cache_site_key;
}
582
/// Sets (or clears, with `None`) the policy that decides whether a cached
/// entry may be served.
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    self.cache_policy = cache_policy;
}
588
589 pub fn update_protocol_cache_disabled(&mut self) {
590 self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
591 }
592
593 pub fn authenticate(&mut self, credentials: Credentials) {
594 self.credentials = Some(credentials);
595 self.update_protocol_request_interception();
596 self.protocol_request_interception_enabled = true;
597 }
598
599 fn update_protocol_request_interception(&mut self) {
600 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
601
602 if enabled == self.protocol_request_interception_enabled {
603 return;
604 }
605
606 if enabled {
607 self.push_cdp_request(ENABLE_FETCH.clone())
608 } else {
609 self.push_cdp_request(DisableParams::default())
610 }
611 }
612
613 #[inline]
616 fn should_block_script_blocklist_only(&self, url: &str) -> bool {
617 let block_analytics = self.block_analytics;
619
620 if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
622 {
623 return true;
624 }
625
626 if crate::handler::blockers::block_websites::block_website(url) {
628 return true;
629 }
630
631 if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
638 let p_slash = Self::strip_query_fragment(path_with_slash);
640 let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
641
642 let base = match p_slash.rsplit('/').next() {
644 Some(b) => b,
645 None => p_slash,
646 };
647
648 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
651 return true;
652 }
653 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
654 return true;
655 }
656 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
657 return true;
658 }
659
660 if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
663 return true;
664 }
665
666 if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
668 return true;
669 }
670 }
671
672 false
673 }
674
/// Returns the part of `url` starting at the first `/` of its path
/// (query and fragment, if any, are still attached).
///
/// The authority is taken to start right after the first `//`. Returns
/// `None` when there is no `//`, or when the authority is not followed by a
/// path. A `?` or `#` that comes before any `/` ends the authority, so a
/// slash inside the query or fragment (`https://host?next=/x.js`) is never
/// mistaken for the start of the path.
#[inline]
fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
    let authority_start = url.find("//")? + 2;
    let rest = &url[authority_start..];

    // First delimiter that can terminate the authority component.
    let boundary = rest.find(|c: char| matches!(c, '/' | '?' | '#'))?;

    if rest[boundary..].starts_with('/') {
        Some(&rest[boundary..])
    } else {
        None
    }
}
695
/// Cuts `s` at the first `?` or `#`, whichever comes first; returns `s`
/// unchanged when neither is present.
#[inline]
fn strip_query_fragment(s: &str) -> &str {
    let cut = s.find(|c: char| c == '?' || c == '#');
    match cut {
        Some(end) => &s[..end],
        None => s,
    }
}
712
713 #[inline]
715 fn skip_xhr(
716 &self,
717 skip_networking: bool,
718 event: &EventRequestPaused,
719 network_event: bool,
720 ) -> bool {
721 if !skip_networking && network_event {
723 let request_url = event.request.url.as_str();
724
725 let skip_analytics =
727 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
728
729 if skip_analytics {
730 true
731 } else if self.block_stylesheets || self.ignore_visuals {
732 let block_css = self.block_stylesheets;
733 let block_media = self.ignore_visuals;
734
735 let mut block_request = false;
736
737 if let Some(position) = request_url.rfind('.') {
738 let hlen = request_url.len();
739 let has_asset = hlen - position;
740
741 if has_asset >= 3 {
742 let next_position = position + 1;
743
744 if block_media
745 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
746 &request_url[next_position..].into(),
747 )
748 {
749 block_request = true;
750 } else if block_css {
751 block_request =
752 CaseInsensitiveString::from(request_url[next_position..].as_bytes())
753 .contains(&**CSS_EXTENSION)
754 }
755 }
756 }
757
758 if !block_request {
759 block_request = ignore_script_xhr_media(request_url);
760 }
761
762 block_request
763 } else {
764 skip_networking
765 }
766 } else {
767 skip_networking
768 }
769 }
770
/// Ad check used when the `adblock` feature is enabled.
///
/// Keeps an existing skip decision; otherwise the request is skipped when
/// its URL is on the static ad block list or the adblock engine matches it.
#[cfg(feature = "adblock")]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    // `block_ads` is not part of the file-level imports; bring it into scope
    // here, as the non-adblock variant of this method does.
    use crate::handler::blockers::block_websites::block_ads;

    if skip_networking {
        true
    } else {
        block_ads(&event.request.url) || self.detect_ad(event)
    }
}
781
782 #[cfg(not(feature = "adblock"))]
784 #[inline]
785 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
786 use crate::handler::blockers::block_websites::block_ads;
787 if skip_networking {
788 true
789 } else {
790 block_ads(&event.request.url)
791 }
792 }
793
794 #[inline]
795 fn fail_request_blocked(
797 &mut self,
798 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
799 ) {
800 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
801 request_id.clone(),
802 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
803 );
804 self.push_cdp_request(params);
805 }
806
807 #[inline]
808 fn fulfill_request_empty_200(
810 &mut self,
811 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
812 ) {
813 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
814 request_id.clone(),
815 200,
816 );
817 self.push_cdp_request(params);
818 }
819
/// Queues a `Fetch.fulfillRequest` that answers `request_id` from cached data.
///
/// * `body` - raw response bytes; sent base64-encoded (standard alphabet).
/// * `headers` - response headers, copied into the command.
/// * `status` - HTTP status code of the cached response.
#[cfg(feature = "_cache")]
#[inline]
fn fulfill_request_from_cache(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    body: &[u8],
    headers: &std::collections::HashMap<String, String>,
    status: i64,
) {
    use crate::cdp::browser_protocol::fetch::HeaderEntry;
    use crate::handler::network::fetch::FulfillRequestParams;
    use base64::Engine;

    let response_headers: Vec<HeaderEntry> = headers
        .iter()
        .map(|(name, value)| HeaderEntry {
            name: name.clone().into(),
            value: value.clone().into(),
        })
        .collect();

    let encoded_body = base64::engine::general_purpose::STANDARD.encode(body);

    let mut params = FulfillRequestParams::new(request_id.clone(), status);
    params.body = Some(encoded_body.into());
    params.response_headers = Some(response_headers);

    self.push_cdp_request(params);
}
858
859 #[inline]
860 fn continue_request_with_url(
862 &mut self,
863 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
864 url: Option<&str>,
865 intercept_response: bool,
866 ) {
867 let mut params = ContinueRequestParams::new(request_id.clone());
868 if let Some(url) = url {
869 params.url = Some(url.to_string());
870 params.intercept_response = Some(intercept_response);
871 }
872 self.push_cdp_request(params);
873 }
874
875 #[inline]
877 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
878 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
879 return;
880 }
881
882 if self.block_all {
883 tracing::debug!(
884 "Blocked (block_all): {:?} - {}",
885 event.resource_type,
886 event.request.url
887 );
888 return self.fail_request_blocked(&event.request_id);
889 }
890
891 if let Some(network_id) = event.network_id.as_ref() {
892 if let Some(request_will_be_sent) =
893 self.requests_will_be_sent.remove(network_id.as_ref())
894 {
895 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
896 } else {
897 self.request_id_to_interception_id
898 .insert(network_id.clone(), event.request_id.clone().into());
899 }
900 }
901
902 let javascript_resource = event.resource_type == ResourceType::Script;
904 let document_resource = event.resource_type == ResourceType::Document;
905 let network_resource =
906 !document_resource && crate::utils::is_data_resource(&event.resource_type);
907
908 let mut skip_networking =
910 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
911
912 if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
913 skip_networking = true;
914 }
915
916 if !skip_networking {
918 skip_networking = self.document_reload_tracker >= 3;
919 }
920
921 let (current_url_cow, had_replacer) =
923 self.handle_document_replacement_and_tracking(event, document_resource);
924
925 let current_url: &str = current_url_cow.as_ref();
926
927 let blacklisted = self.is_blacklisted(current_url);
928
929 if !self.blacklist_strict && blacklisted {
930 skip_networking = true;
931 }
932
933 if !skip_networking {
934 if self.xml_document && current_url.ends_with(".xsl") {
936 skip_networking = false;
937 } else {
938 skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
939 }
940 }
941
942 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
943
944 if !skip_networking
946 && self.block_javascript
947 && (self.only_html || self.ignore_visuals)
948 && (javascript_resource || document_resource)
949 {
950 skip_networking = ignore_script_embedded(current_url);
951 }
952
953 if !skip_networking && javascript_resource {
956 skip_networking = self.should_block_script_blocklist_only(current_url);
957 }
958
959 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
961
962 if !skip_networking && (javascript_resource || network_resource || document_resource) {
964 skip_networking = self.intercept_manager.intercept_detection(
965 current_url,
966 self.ignore_visuals,
967 network_resource,
968 );
969 }
970
971 if !skip_networking && (javascript_resource || network_resource) {
973 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
974 }
975
976 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
979 {
980 skip_networking = false;
981 }
982
983 if skip_networking && self.is_whitelisted(current_url) {
985 skip_networking = false;
986 }
987
988 if self.blacklist_strict && blacklisted {
989 skip_networking = true;
990 }
991
992 if skip_networking {
993 tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
994 self.fulfill_request_empty_200(&event.request_id);
995 } else {
996 #[cfg(feature = "_cache")]
997 {
998 if let (Some(policy), Some(cache_site_key)) =
999 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
1000 {
1001 let current_url = format!("{}:{}", event.request.method, ¤t_url);
1002
1003 if let Some((res, cache_policy)) =
1004 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
1005 {
1006 if policy.allows_cached(&cache_policy) {
1007 tracing::debug!(
1008 "Remote Cached: {:?} - {}",
1009 &event.resource_type,
1010 ¤t_url
1011 );
1012 return self.fulfill_request_from_cache(
1013 &event.request_id,
1014 &res.body,
1015 &res.headers,
1016 res.status as i64,
1017 );
1018 }
1019 }
1020 }
1021 }
1022
1023 tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
1025 self.continue_request_with_url(
1026 &event.request_id,
1027 if had_replacer {
1028 Some(current_url)
1029 } else {
1030 None
1031 },
1032 !had_replacer,
1033 );
1034 }
1035 }
1036
1037 #[inline]
1043 fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1044 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1045 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1046 }
1047
1048 pub fn has_target_domain(&self) -> bool {
1050 !self.document_target_url.is_empty()
1051 }
1052
1053 pub fn set_page_url(&mut self, page_target_url: String) {
1055 let host_base = host_and_rest(&page_target_url)
1056 .map(|(h, _)| base_domain_from_host(h))
1057 .unwrap_or("");
1058
1059 self.document_target_domain = host_base.to_string();
1060 self.document_target_url = page_target_url;
1061 }
1062
1063 pub fn clear_target_domain(&mut self) {
1065 self.document_reload_tracker = 0;
1066 self.document_target_url = Default::default();
1067 self.document_target_domain = Default::default();
1068 }
1069
1070 #[inline]
1078 fn handle_document_replacement_and_tracking<'a>(
1079 &mut self,
1080 event: &'a EventRequestPaused,
1081 document_resource: bool,
1082 ) -> (Cow<'a, str>, bool) {
1083 let mut replacer: Option<String> = None;
1084 let current_url = event.request.url.as_str();
1085
1086 if document_resource {
1087 if self.document_target_url == current_url {
1088 self.document_reload_tracker += 1;
1089 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1090 {
1091 let (http_document_replacement, mut https_document_replacement) =
1092 if self.document_target_url.starts_with("http://") {
1093 (
1094 self.document_target_url.replacen("http://", "http//", 1),
1095 self.document_target_url.replacen("http://", "https://", 1),
1096 )
1097 } else {
1098 (
1099 self.document_target_url.replacen("https://", "https//", 1),
1100 self.document_target_url.replacen("https://", "http://", 1),
1101 )
1102 };
1103
1104 let trailing = https_document_replacement.ends_with('/');
1106 if trailing {
1107 https_document_replacement.pop();
1108 }
1109 if https_document_replacement.ends_with('/') {
1110 https_document_replacement.pop();
1111 }
1112
1113 let redirect_mask = format!(
1114 "{}{}",
1115 https_document_replacement, http_document_replacement
1116 );
1117
1118 if current_url == redirect_mask {
1119 replacer = Some(if trailing {
1120 format!("{}/", https_document_replacement)
1121 } else {
1122 https_document_replacement
1123 });
1124 }
1125 }
1126
1127 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1128 self.xml_document = true;
1129 }
1130
1131 self.document_target_url = event.request.url.clone();
1133 self.document_target_domain = host_and_rest(&self.document_target_url)
1134 .map(|(h, _)| base_domain_from_host(h).to_string())
1135 .unwrap_or_default();
1136 }
1137
1138 let current_url_cow = match replacer {
1139 Some(r) => Cow::Owned(r),
1140 None => Cow::Borrowed(event.request.url.as_str()),
1141 };
1142
1143 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1144 (current_url_cow, had_replacer)
1145 }
1146
/// Runs the request through the adblock engine.
///
/// Only image, media, stylesheet, document, fetch and XHR requests are
/// checked; every other resource type yields `false`. The engine is built
/// once from `ADBLOCK_PATTERNS` on first use.
#[cfg(feature = "adblock")]
pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
    use adblock::{
        lists::{FilterSet, ParseOptions, RuleTypes},
        Engine,
    };

    lazy_static::lazy_static! {
        static ref AD_ENGINE: Engine = {
            let mut filter_set = FilterSet::new(false);
            let mut rules = ParseOptions::default();
            rules.rule_types = RuleTypes::All;

            filter_set.add_filters(
                &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
                rules,
            );

            Engine::from_filter_set(filter_set, true)
        };
    };

    let blockable = matches!(
        event.resource_type,
        ResourceType::Image
            | ResourceType::Media
            | ResourceType::Stylesheet
            | ResourceType::Document
            | ResourceType::Fetch
            | ResourceType::Xhr
    );

    if !blockable {
        return false;
    }

    let url = &event.request.url;
    let request_type = event.resource_type.as_ref().to_lowercase();
    let third_party = !event.request.is_same_site.unwrap_or_default();

    // Source and request hostnames are fixed placeholders, as before.
    let request = adblock::request::Request::preparsed(
        &url,
        "example.com",
        "example.com",
        &request_type,
        third_party,
    );

    AD_ENGINE.check_network_request(&request).matched
}
1194
1195 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1196 let response = if self
1197 .attempted_authentications
1198 .contains(event.request_id.as_ref())
1199 {
1200 AuthChallengeResponseResponse::CancelAuth
1201 } else if self.credentials.is_some() {
1202 self.attempted_authentications
1203 .insert(event.request_id.clone().into());
1204 AuthChallengeResponseResponse::ProvideCredentials
1205 } else {
1206 AuthChallengeResponseResponse::Default
1207 };
1208
1209 let mut auth = AuthChallengeResponse::new(response);
1210 if let Some(creds) = self.credentials.clone() {
1211 auth.username = Some(creds.username);
1212 auth.password = Some(creds.password);
1213 }
1214 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1215 }
1216
1217 pub fn set_offline_mode(&mut self, value: bool) {
1219 if self.offline == value {
1220 return;
1221 }
1222 self.offline = value;
1223 if let Ok(network) = EmulateNetworkConditionsParams::builder()
1224 .offline(self.offline)
1225 .latency(0)
1226 .download_throughput(-1.)
1227 .upload_throughput(-1.)
1228 .build()
1229 {
1230 self.push_cdp_request(network);
1231 }
1232 }
1233
1234 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1236 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1237 if let Some(interception_id) = self
1238 .request_id_to_interception_id
1239 .remove(event.request_id.as_ref())
1240 {
1241 self.on_request(event, Some(interception_id));
1242 } else {
1243 self.requests_will_be_sent
1245 .insert(event.request_id.clone(), event.clone());
1246 }
1247 } else {
1248 self.on_request(event, None);
1249 }
1250 }
1251
1252 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1254 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1255 request.from_memory_cache = true;
1256 }
1257 }
1258
1259 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1261 let mut request_failed = false;
1262
1263 let mut deducted: u64 = 0;
1265
1266 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1267 let before = *max_bytes;
1268
1269 let received_bytes: u64 = event.response.encoded_data_length as u64;
1271
1272 let content_length: Option<u64> = event
1274 .response
1275 .headers
1276 .inner()
1277 .get("content-length")
1278 .and_then(|v| v.as_str())
1279 .and_then(|s| s.trim().parse::<u64>().ok());
1280
1281 *max_bytes = max_bytes.saturating_sub(received_bytes);
1283
1284 if let Some(cl) = content_length {
1286 if cl > *max_bytes {
1287 *max_bytes = 0;
1288 }
1289 }
1290
1291 request_failed = *max_bytes == 0;
1292
1293 deducted = before.saturating_sub(*max_bytes);
1295 }
1296
1297 if deducted > 0 {
1299 self.queued_events
1300 .push_back(NetworkEvent::BytesConsumed(deducted));
1301 }
1302
1303 if request_failed && self.max_bytes_allowed.is_some() {
1305 self.set_block_all(true);
1306 }
1307
1308 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1309 request.set_response(event.response.clone());
1310 self.queued_events.push_back(if request_failed {
1311 NetworkEvent::RequestFailed(request)
1312 } else {
1313 NetworkEvent::RequestFinished(request)
1314 });
1315 }
1316 }
1317
1318 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1320 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1321 if let Some(interception_id) = request.interception_id.as_ref() {
1322 self.attempted_authentications
1323 .remove(interception_id.as_ref());
1324 }
1325 self.queued_events
1326 .push_back(NetworkEvent::RequestFinished(request));
1327 }
1328 }
1329
1330 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1332 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1333 request.failure_text = Some(event.error_text.clone());
1334 if let Some(interception_id) = request.interception_id.as_ref() {
1335 self.attempted_authentications
1336 .remove(interception_id.as_ref());
1337 }
1338 self.queued_events
1339 .push_back(NetworkEvent::RequestFailed(request));
1340 }
1341 }
1342
1343 fn on_request(
1345 &mut self,
1346 event: &EventRequestWillBeSent,
1347 interception_id: Option<InterceptionId>,
1348 ) {
1349 let mut redirect_chain = Vec::new();
1350 let mut redirect_location = None;
1351
1352 if let Some(redirect_resp) = &event.redirect_response {
1353 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1354 if is_redirect_status(redirect_resp.status) {
1355 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1356 if redirect_resp.url != location {
1357 let fixed_location = location.replace(&redirect_resp.url, "");
1358
1359 if !fixed_location.is_empty() {
1360 request.response.as_mut().map(|resp| {
1361 resp.headers.0["Location"] =
1362 serde_json::Value::String(fixed_location.clone());
1363 });
1364 }
1365
1366 redirect_location = Some(fixed_location);
1367 }
1368 }
1369 }
1370
1371 self.handle_request_redirect(
1372 &mut request,
1373 if let Some(redirect_location) = redirect_location {
1374 let mut redirect_resp = redirect_resp.clone();
1375
1376 if !redirect_location.is_empty() {
1377 redirect_resp.headers.0["Location"] =
1378 serde_json::Value::String(redirect_location);
1379 }
1380
1381 redirect_resp
1382 } else {
1383 redirect_resp.clone()
1384 },
1385 );
1386
1387 redirect_chain = std::mem::take(&mut request.redirect_chain);
1388 redirect_chain.push(request);
1389 }
1390 }
1391
1392 let request = HttpRequest::new(
1393 event.request_id.clone(),
1394 event.frame_id.clone(),
1395 interception_id,
1396 self.user_request_interception_enabled,
1397 redirect_chain,
1398 );
1399
1400 self.requests.insert(event.request_id.clone(), request);
1401 self.queued_events
1402 .push_back(NetworkEvent::Request(event.request_id.clone()));
1403 }
1404
1405 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1407 request.set_response(response);
1408 if let Some(interception_id) = request.interception_id.as_ref() {
1409 self.attempted_authentications
1410 .remove(interception_id.as_ref());
1411 }
1412 }
1413}
1414
1415#[derive(Debug)]
1416pub enum NetworkEvent {
1417 SendCdpRequest((MethodId, serde_json::Value)),
1419 Request(RequestId),
1421 Response(RequestId),
1423 RequestFailed(HttpRequest),
1425 RequestFinished(HttpRequest),
1427 BytesConsumed(u64),
1429}
1430
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// Page URL used by the script-blocking tests.
    const FORUM_PAGE_URL: &str =
        "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085";
    /// Page URL used by the blacklist/whitelist tests.
    const EXAMPLE_PAGE_URL: &str = "https://example.com/";
    /// URL that matches the `beacon.min.js` pattern in the strictness tests.
    const BEACON_URL: &str = "https://static.cloudflareinsights.com/beacon.min.js";

    /// Builds a manager with the settings shared by all tests, positioned on `page_url`.
    fn manager_on(page_url: &str) -> NetworkManager {
        let mut manager = NetworkManager::new(false, Duration::from_secs(30));
        manager.set_page_url(page_url.to_string());
        manager
    }

    /// Shared assertions about the third-party allow-list matcher.
    fn assert_third_party_allow_list() {
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        let always_allowed = [
            "https://js.stripe.com/v3/",
            "https://www.google.com/recaptcha/api.js?render=explicit",
            "https://code.jquery.com/jquery-3.7.1.min.js",
        ];
        for url in always_allowed {
            assert!(ALLOWED_MATCHER_3RD_PARTY.is_match(url));
        }
    }

    #[test]
    fn test_allowed_matcher_3rd_party() {
        assert_third_party_allow_list();
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !manager_on(FORUM_PAGE_URL).should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            manager_on(FORUM_PAGE_URL).should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    // Kept under its original name; runs the same assertions as
    // `test_allowed_matcher_3rd_party`.
    #[test]
    fn test_allowed_matcher_3rd_party_sanity() {
        assert_third_party_allow_list();
    }

    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = manager_on(EXAMPLE_PAGE_URL);
        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);

        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = manager_on(EXAMPLE_PAGE_URL);
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);
        nm.set_blacklist_strict(true);

        // The URL is on both lists; only the strict flag itself is checked here.
        assert!(nm.is_whitelisted(BEACON_URL));
        assert!(nm.is_blacklisted(BEACON_URL));

        assert!(nm.blacklist_strict);
    }

    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = manager_on(EXAMPLE_PAGE_URL);
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);
        nm.set_blacklist_strict(false);

        assert!(nm.is_blacklisted(BEACON_URL));
        assert!(nm.is_whitelisted(BEACON_URL));
        assert!(!nm.blacklist_strict);
    }
}