1use super::blockers::{
2 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3 xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15 EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17 InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18 SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21 fetch::{
22 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24 },
25 network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
42lazy_static! {
43 static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
45 "jquery", "angular",
47 "react", "vue", "bootstrap",
50 "d3",
51 "lodash",
52 "ajax",
53 "application",
54 "app", "main",
56 "index",
57 "bundle",
58 "vendor",
59 "runtime",
60 "polyfill",
61 "scripts",
62 "es2015.",
63 "es2020.",
64 "webpack",
65 "captcha",
66 "client",
67 "/cdn-cgi/challenge-platform/",
68 "/wp-content/js/", "https://m.stripe.network/",
71 "https://challenges.cloudflare.com/",
72 "https://www.google.com/recaptcha/enterprise.js",
73 "https://www.google.com/recaptcha/api.js",
74 "https://google.com/recaptcha/api.js",
75 "https://captcha.px-cloud.net/",
76 "https://geo.captcha-delivery.com/",
77 "https://cdn.auth0.com/js/lock/",
78 "https://captcha.gtimg.com",
79 "https://newassets.hcaptcha.com/",
80 "https://cdn.auth0.com/client",
81 "https://js.stripe.com/",
82 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
85 ];
86
87 pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
92
93 static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
95 "https://m.stripe.network/",
97 "https://challenges.cloudflare.com/",
98 "https://www.google.com/recaptcha/api.js",
99 "https://google.com/recaptcha/api.js",
100 "https://www.google.com/recaptcha/enterprise.js",
101 "https://js.stripe.com/",
102 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
105 "https://ct.captcha-delivery.com/",
106 "https://geo.captcha-delivery.com/",
107 "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://ct.captcha-delivery.com/",
109 "https://cdn.auth0.com/client",
110 "https://captcha.px-cloud.net/",
111 "https://www.gstatic.com/recaptcha/",
112 "https://www.google.com/recaptcha/api2/",
113 "https://www.recaptcha.net/recaptcha/",
114 "https://js.hcaptcha.com/1/api.js",
115 "https://hcaptcha.com/1/api.js",
116 "https://js.datadome.co/tags.js",
117 "https://api-js.datadome.co/",
118 "https://client.perimeterx.net/",
119 "https://captcha.px-cdn.net/",
120 "https://newassets.hcaptcha.com/",
121 "https://captcha.px-cloud.net/",
122 "https://s.perimeterx.net/",
123 "https://client-api.arkoselabs.com/v2/",
124 "https://static.geetest.com/v4/gt4.js",
125 "https://static.geetest.com/",
126 "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
127 "https://cdn.perfdrive.com/aperture/",
128 "https://assets.queue-it.net/",
129 "discourse-cdn.com/",
130 "hcaptcha.com",
131 "/cdn-cgi/challenge-platform/",
132 "/_Incapsula_Resource"
133 ];
134
135 pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
137
138 pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
140 phf::phf_set! {
141 "_astro/", "_app/immutable"
143 }
144 };
145
146 pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
148 "application/pdf",
149 "application/zip",
150 "application/x-rar-compressed",
151 "application/x-tar",
152 "image/png",
153 "image/jpeg",
154 "image/gif",
155 "image/bmp",
156 "image/webp",
157 "image/svg+xml",
158 "video/mp4",
159 "video/x-msvideo",
160 "video/x-matroska",
161 "video/webm",
162 "audio/mpeg",
163 "audio/ogg",
164 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
165 "application/vnd.ms-excel",
166 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
167 "application/vnd.ms-powerpoint",
168 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
169 "application/x-7z-compressed",
170 "application/x-rpm",
171 "application/x-shockwave-flash",
172 "application/rtf",
173 };
174
175 pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
177 "Image",
178 "Media",
179 "Font"
180 };
181
182 pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
184 "CspViolationReport",
185 "Manifest",
186 "Other",
187 "Prefetch",
188 "Ping",
189 };
190
191 pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
193
194 pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
196 let enable = EnableParams::default();
197
198 if let Ok(c) = serde_json::to_value(&enable) {
199 vec![(enable.identifier(), c)]
200 } else {
201 vec![]
202 }
203 };
204
205 pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
207 let enable = EnableParams::default();
208 let mut v = vec![];
209 if let Ok(c) = serde_json::to_value(&enable) {
210 v.push((enable.identifier(), c));
211 }
212 let ignore = SetIgnoreCertificateErrorsParams::new(true);
213 if let Ok(ignored) = serde_json::to_value(&ignore) {
214 v.push((ignore.identifier(), ignored));
215 }
216
217 v
218 };
219
220 pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
222 fetch::EnableParams::builder()
223 .handle_auth_requests(true)
224 .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
225 .build()
226 };
227}
228
/// Returns `true` when `status` is one of the redirect codes recognised
/// here: 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
233
/// Tracks network state for a page: in-flight requests, fetch interception,
/// blocking rules and the queue of events/commands produced while handling
/// CDP network and fetch events.
#[derive(Debug)]
pub struct NetworkManager {
    /// Events and outgoing CDP commands waiting to be drained via `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// Selects the init chain that also ignores certificate errors.
    ignore_httpserrors: bool,
    /// In-flight requests keyed by network request id.
    requests: HashMap<RequestId, HttpRequest>,
    /// `requestWillBeSent` events parked until the matching fetch pause
    /// event arrives.
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers sent with requests (proxy authorization removed).
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception ids of paused requests whose `requestWillBeSent` event
    /// has not been seen yet.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the caller asked for the browser cache to be disabled.
    user_cache_disabled: bool,
    /// Requests for which credentials were already supplied once; a second
    /// challenge for the same request cancels authentication.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials used to answer auth challenges.
    credentials: Option<Credentials>,
    /// Whether the caller requested request interception.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed as blocked-by-client.
    block_all: bool,
    /// Whether fetch interception is considered active at protocol level.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Last offline-emulation value sent to the browser.
    offline: bool,
    /// Timeout handed to the initial command chain.
    pub request_timeout: Duration,
    /// Skip images, media and fonts.
    pub ignore_visuals: bool,
    /// Skip stylesheets.
    pub block_stylesheets: bool,
    /// Enables the embedded-script check (together with `only_html` or
    /// `ignore_visuals`).
    pub block_javascript: bool,
    /// Skip analytics scripts and XHRs (enabled by `new`).
    pub block_analytics: bool,
    /// Together with `block_javascript`, enables the embedded-script check.
    pub only_html: bool,
    /// Set when the first document URL ends in `.xml`; `.xsl` requests are
    /// then exempt from visual/stylesheet skipping.
    pub xml_document: bool,
    /// Site-specific interception rules.
    pub intercept_manager: NetworkInterceptManager,
    /// Number of document requests seen for the current target URL; at 3 or
    /// more, further requests are skipped.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request.
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`.
    pub document_target_domain: String,
    /// Remaining response byte budget; `None` means unlimited.
    pub max_bytes_allowed: Option<u64>,
    /// Site key used to look up remotely cached responses.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// Policy deciding whether a cached response may be served.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Raw whitelist substrings; source for `whitelist_matcher`.
    whitelist_patterns: Vec<String>,
    /// Matcher built from `whitelist_patterns`, `None` when empty.
    whitelist_matcher: Option<AhoCorasick>,
    /// Raw blacklist substrings; source for `blacklist_matcher`.
    blacklist_patterns: Vec<String>,
    /// Matcher built from `blacklist_patterns`, `None` when empty.
    blacklist_matcher: Option<AhoCorasick>,
    /// When `true` a blacklist hit is final; when `false` the allow-list and
    /// whitelist may still let the request through.
    blacklist_strict: bool,
}
356
357impl NetworkManager {
358 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
360 Self {
361 queued_events: Default::default(),
362 ignore_httpserrors,
363 requests: Default::default(),
364 requests_will_be_sent: Default::default(),
365 extra_headers: Default::default(),
366 request_id_to_interception_id: Default::default(),
367 user_cache_disabled: false,
368 attempted_authentications: Default::default(),
369 credentials: None,
370 block_all: false,
371 user_request_interception_enabled: false,
372 protocol_request_interception_enabled: false,
373 offline: false,
374 request_timeout,
375 ignore_visuals: false,
376 block_javascript: false,
377 block_stylesheets: false,
378 block_analytics: true,
379 only_html: false,
380 xml_document: false,
381 intercept_manager: NetworkInterceptManager::Unknown,
382 document_reload_tracker: 0,
383 document_target_url: String::new(),
384 document_target_domain: String::new(),
385 whitelist_patterns: Vec::new(),
386 whitelist_matcher: None,
387 blacklist_patterns: Vec::new(),
388 blacklist_matcher: None,
389 blacklist_strict: true,
390 max_bytes_allowed: None,
391 #[cfg(feature = "_cache")]
392 cache_site_key: None,
393 #[cfg(feature = "_cache")]
394 cache_policy: None,
395 }
396 }
397
398 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
400 where
401 I: IntoIterator<Item = S>,
402 S: Into<String>,
403 {
404 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
405 self.rebuild_whitelist_matcher();
406 }
407
408 pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
410 where
411 I: IntoIterator<Item = S>,
412 S: Into<String>,
413 {
414 self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
415 self.rebuild_blacklist_matcher();
416 }
417
418 pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
420 self.blacklist_patterns.push(pattern.into());
421 self.rebuild_blacklist_matcher();
422 }
423
424 pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
426 where
427 I: IntoIterator<Item = S>,
428 S: Into<String>,
429 {
430 self.blacklist_patterns
431 .extend(patterns.into_iter().map(Into::into));
432 self.rebuild_blacklist_matcher();
433 }
434
/// Removes every blacklist pattern and drops the compiled matcher, so no
/// URL is considered blacklisted afterwards.
pub fn clear_blacklist(&mut self) {
    self.blacklist_patterns.clear();
    self.blacklist_matcher = None;
}
440
/// Sets whether a blacklist hit is final (`true`) or may still be
/// overridden by the allow-list / whitelist checks (`false`).
pub fn set_blacklist_strict(&mut self, strict: bool) {
    self.blacklist_strict = strict;
}
445
446 #[inline]
447 fn rebuild_blacklist_matcher(&mut self) {
448 if self.blacklist_patterns.is_empty() {
449 self.blacklist_matcher = None;
450 return;
451 }
452
453 let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
454 self.blacklist_matcher = AhoCorasick::new(refs).ok();
455 }
456
457 #[inline]
458 fn is_blacklisted(&self, url: &str) -> bool {
459 self.blacklist_matcher
460 .as_ref()
461 .map(|m| m.is_match(url))
462 .unwrap_or(false)
463 }
464
465 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
467 self.whitelist_patterns.push(pattern.into());
468 self.rebuild_whitelist_matcher();
469 }
470
471 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
473 where
474 I: IntoIterator<Item = S>,
475 S: Into<String>,
476 {
477 self.whitelist_patterns
478 .extend(patterns.into_iter().map(Into::into));
479 self.rebuild_whitelist_matcher();
480 }
481
482 #[inline]
483 fn rebuild_whitelist_matcher(&mut self) {
484 if self.whitelist_patterns.is_empty() {
485 self.whitelist_matcher = None;
486 return;
487 }
488
489 let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
490
491 self.whitelist_matcher = AhoCorasick::new(refs).ok();
493 }
494
495 #[inline]
496 fn is_whitelisted(&self, url: &str) -> bool {
497 self.whitelist_matcher
498 .as_ref()
499 .map(|m| m.is_match(url))
500 .unwrap_or(false)
501 }
502
503 pub fn init_commands(&self) -> CommandChain {
505 let cmds = if self.ignore_httpserrors {
506 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
507 } else {
508 INIT_CHAIN.clone()
509 };
510 CommandChain::new(cmds, self.request_timeout)
511 }
512
513 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
515 let method = cmd.identifier();
516 if let Ok(params) = serde_json::to_value(cmd) {
517 self.queued_events
518 .push_back(NetworkEvent::SendCdpRequest((method, params)));
519 }
520 }
521
/// Pops the oldest queued event, or returns `None` when the queue is empty.
pub fn poll(&mut self) -> Option<NetworkEvent> {
    self.queued_events.pop_front()
}
526
/// Returns the extra HTTP headers currently stored on the manager.
pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
    &self.extra_headers
}
531
532 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
534 self.extra_headers = headers;
535 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
536 self.extra_headers.remove("Proxy-Authorization");
537 if !self.extra_headers.is_empty() {
538 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
539 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
540 }
541 }
542 }
543
544 pub fn set_service_worker_enabled(&mut self, bypass: bool) {
545 self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
546 }
547
/// Sets the flag that makes the fetch handler fail every paused request.
pub fn set_block_all(&mut self, block_all: bool) {
    self.block_all = block_all;
}
551
/// Records whether the caller wants request interception and queues a
/// fetch enable/disable command if the desired protocol state changed.
pub fn set_request_interception(&mut self, enabled: bool) {
    self.user_request_interception_enabled = enabled;
    self.update_protocol_request_interception();
}
556
557 pub fn set_cache_enabled(&mut self, enabled: bool) {
558 let run = self.user_cache_disabled != !enabled;
559 self.user_cache_disabled = !enabled;
560 if run {
561 self.update_protocol_cache_disabled();
562 }
563 }
564
/// Marks protocol-level request interception as enabled. Only the local
/// flag is changed; no CDP command is queued.
pub fn enable_request_intercept(&mut self) {
    self.protocol_request_interception_enabled = true;
}
569
/// Marks protocol-level request interception as disabled. Only the local
/// flag is changed; no CDP command is queued.
pub fn disable_request_intercept(&mut self) {
    self.protocol_request_interception_enabled = false;
}
574
/// Sets the site key used when looking up remotely cached responses.
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    self.cache_site_key = cache_site_key;
}
580
/// Sets the policy that decides whether a cached response may be served.
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    self.cache_policy = cache_policy;
}
586
587 pub fn update_protocol_cache_disabled(&mut self) {
588 self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
589 }
590
/// Stores `credentials` for answering auth challenges and turns on
/// request interception.
pub fn authenticate(&mut self, credentials: Credentials) {
    self.credentials = Some(credentials);
    // Must run before the flag below is set: the update only queues the
    // fetch-enable command while the flag still reads as disabled.
    self.update_protocol_request_interception();
    self.protocol_request_interception_enabled = true;
}
596
597 fn update_protocol_request_interception(&mut self) {
598 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
599
600 if enabled == self.protocol_request_interception_enabled {
601 return;
602 }
603
604 if enabled {
605 self.push_cdp_request(ENABLE_FETCH.clone())
606 } else {
607 self.push_cdp_request(DisableParams::default())
608 }
609 }
610
611 #[inline]
614 fn should_block_script_blocklist_only(&self, url: &str) -> bool {
615 let block_analytics = self.block_analytics;
617
618 if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
620 {
621 return true;
622 }
623
624 if crate::handler::blockers::block_websites::block_website(url) {
626 return true;
627 }
628
629 if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
636 let p_slash = Self::strip_query_fragment(path_with_slash);
638 let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
639
640 let base = match p_slash.rsplit('/').next() {
642 Some(b) => b,
643 None => p_slash,
644 };
645
646 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
649 return true;
650 }
651 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
652 return true;
653 }
654 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
655 return true;
656 }
657
658 if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
661 return true;
662 }
663
664 if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
666 return true;
667 }
668 }
669
670 false
671 }
672
/// Returns the part of `url` starting at the first `/` of its path
/// (query and fragment included), or `None` when the URL has no `//`
/// authority marker or no path.
///
/// The authority ends at the first `/`, `?` or `#`. A `/` that only
/// appears inside the query or fragment (e.g. `https://host?next=/a`) is
/// therefore not mistaken for the start of the path.
#[inline]
fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
    let authority_start = url.find("//")? + 2;
    let after_scheme = &url[authority_start..];

    let authority_end = after_scheme.find(|c: char| matches!(c, '/' | '?' | '#'))?;
    let remainder = &after_scheme[authority_end..];

    if remainder.starts_with('/') {
        Some(remainder)
    } else {
        None
    }
}
693
/// Returns `s` truncated at the first `?` or `#`, whichever comes first;
/// `s` is returned unchanged when it contains neither.
#[inline]
fn strip_query_fragment(s: &str) -> &str {
    match s.find(|c: char| c == '?' || c == '#') {
        Some(cut) => &s[..cut],
        None => s,
    }
}
710
711 #[inline]
713 fn skip_xhr(
714 &self,
715 skip_networking: bool,
716 event: &EventRequestPaused,
717 network_event: bool,
718 ) -> bool {
719 if !skip_networking && network_event {
721 let request_url = event.request.url.as_str();
722
723 let skip_analytics =
725 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
726
727 if skip_analytics {
728 true
729 } else if self.block_stylesheets || self.ignore_visuals {
730 let block_css = self.block_stylesheets;
731 let block_media = self.ignore_visuals;
732
733 let mut block_request = false;
734
735 if let Some(position) = request_url.rfind('.') {
736 let hlen = request_url.len();
737 let has_asset = hlen - position;
738
739 if has_asset >= 3 {
740 let next_position = position + 1;
741
742 if block_media
743 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
744 &request_url[next_position..].into(),
745 )
746 {
747 block_request = true;
748 } else if block_css {
749 block_request =
750 CaseInsensitiveString::from(request_url[next_position..].as_bytes())
751 .contains(&**CSS_EXTENSION)
752 }
753 }
754 }
755
756 if !block_request {
757 block_request = ignore_script_xhr_media(request_url);
758 }
759
760 block_request
761 } else {
762 skip_networking
763 }
764 } else {
765 skip_networking
766 }
767 }
768
/// Ad detection with the `adblock` feature enabled.
///
/// Keeps an existing skip decision; otherwise skips when the URL is on
/// the static ad list or the adblock engine matches the request.
#[cfg(feature = "adblock")]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    // Imported locally, as in the non-adblock variant, so this build does
    // not rely on a module-level import of `block_ads`.
    use crate::handler::blockers::block_websites::block_ads;

    if skip_networking {
        true
    } else {
        block_ads(&event.request.url) || self.detect_ad(event)
    }
}
779
780 #[cfg(not(feature = "adblock"))]
782 #[inline]
783 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
784 use crate::handler::blockers::block_websites::block_ads;
785 if skip_networking {
786 true
787 } else {
788 block_ads(&event.request.url)
789 }
790 }
791
792 #[inline]
793 fn fail_request_blocked(
795 &mut self,
796 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
797 ) {
798 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
799 request_id.clone(),
800 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
801 );
802 self.push_cdp_request(params);
803 }
804
805 #[inline]
806 fn fulfill_request_empty_200(
808 &mut self,
809 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
810 ) {
811 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
812 request_id.clone(),
813 200,
814 );
815 self.push_cdp_request(params);
816 }
817
/// Queues a `Fetch.fulfillRequest` that answers `request_id` with a
/// cached response: `status`, the given `headers`, and `body` encoded as
/// standard base64.
#[cfg(feature = "_cache")]
#[inline]
fn fulfill_request_from_cache(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    body: &[u8],
    headers: &std::collections::HashMap<String, String>,
    status: i64,
) {
    use crate::cdp::browser_protocol::fetch::HeaderEntry;
    use crate::handler::network::fetch::FulfillRequestParams;
    use base64::Engine;

    let response_headers: Vec<HeaderEntry> = headers
        .iter()
        .map(|(name, value)| HeaderEntry {
            name: name.clone().into(),
            value: value.clone().into(),
        })
        .collect();

    let encoded_body = base64::engine::general_purpose::STANDARD.encode(body);

    let mut params = FulfillRequestParams::new(request_id.clone(), status);
    params.body = Some(encoded_body.into());
    params.response_headers = Some(response_headers);

    self.push_cdp_request(params);
}
856
857 #[inline]
858 fn continue_request_with_url(
860 &mut self,
861 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
862 url: Option<&str>,
863 intercept_response: bool,
864 ) {
865 let mut params = ContinueRequestParams::new(request_id.clone());
866 if let Some(url) = url {
867 params.url = Some(url.to_string());
868 params.intercept_response = Some(intercept_response);
869 }
870 self.push_cdp_request(params);
871 }
872
873 #[inline]
875 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
876 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
877 return;
878 }
879
880 let resource_type = &event.resource_type;
881
882 if self.block_all {
883 tracing::debug!(
884 "Blocked (block_all): {:?} - {}",
885 event.resource_type,
886 event.request.url
887 );
888 return self.fail_request_blocked(&event.request_id);
889 }
890
891 if let Some(network_id) = event.network_id.as_ref() {
892 if let Some(request_will_be_sent) =
893 self.requests_will_be_sent.remove(network_id.as_ref())
894 {
895 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
896 } else {
897 self.request_id_to_interception_id
898 .insert(network_id.clone(), event.request_id.clone().into());
899 }
900 }
901
902 let javascript_resource = *resource_type == ResourceType::Script;
904 let document_resource = *resource_type == ResourceType::Document;
905 let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
906
907 let mut skip_networking =
909 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
910
911 if !skip_networking {
913 skip_networking = self.document_reload_tracker >= 3;
914 }
915
916 let (current_url_cow, had_replacer) =
918 self.handle_document_replacement_and_tracking(event, document_resource);
919
920 let current_url: &str = current_url_cow.as_ref();
921
922 let blacklisted = self.is_blacklisted(current_url);
923
924 if !self.blacklist_strict && blacklisted {
925 skip_networking = true;
926 }
927
928 if !skip_networking {
929 if self.xml_document && current_url.ends_with(".xsl") {
931 skip_networking = false;
932 } else {
933 skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
934 }
935 }
936
937 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
938
939 if !skip_networking
941 && self.block_javascript
942 && (self.only_html || self.ignore_visuals)
943 && (javascript_resource || document_resource)
944 {
945 skip_networking = ignore_script_embedded(current_url);
946 }
947
948 if !skip_networking && javascript_resource {
951 skip_networking = self.should_block_script_blocklist_only(current_url);
952 }
953
954 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
956
957 if !skip_networking && (javascript_resource || network_resource || document_resource) {
959 skip_networking = self.intercept_manager.intercept_detection(
960 current_url,
961 self.ignore_visuals,
962 network_resource,
963 );
964 }
965
966 if !skip_networking && (javascript_resource || network_resource) {
968 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
969 }
970
971 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
974 {
975 skip_networking = false;
976 }
977
978 if skip_networking && self.is_whitelisted(current_url) {
980 skip_networking = false;
981 }
982
983 if self.blacklist_strict && blacklisted {
984 skip_networking = true;
985 }
986
987 if skip_networking {
988 tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
989 self.fulfill_request_empty_200(&event.request_id);
990 } else {
991 #[cfg(feature = "_cache")]
992 {
993 if let (Some(policy), Some(cache_site_key)) =
994 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
995 {
996 let current_url = format!("{}:{}", event.request.method, ¤t_url);
997
998 if let Some((res, cache_policy)) =
999 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
1000 {
1001 if policy.allows_cached(&cache_policy) {
1002 tracing::debug!(
1003 "Remote Cached: {:?} - {}",
1004 resource_type,
1005 ¤t_url
1006 );
1007 return self.fulfill_request_from_cache(
1008 &event.request_id,
1009 &res.body,
1010 &res.headers,
1011 res.status as i64,
1012 );
1013 }
1014 }
1015 }
1016 }
1017
1018 tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
1020 self.continue_request_with_url(
1021 &event.request_id,
1022 if had_replacer {
1023 Some(current_url)
1024 } else {
1025 None
1026 },
1027 !had_replacer,
1028 );
1029 }
1030 }
1031
1032 #[inline]
1038 fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1039 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1040 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1041 }
1042
/// Returns `true` when a document target URL is currently recorded.
pub fn has_target_domain(&self) -> bool {
    !self.document_target_url.is_empty()
}
1047
1048 pub fn set_page_url(&mut self, page_target_url: String) {
1050 let host_base = host_and_rest(&page_target_url)
1051 .map(|(h, _)| base_domain_from_host(h))
1052 .unwrap_or("");
1053
1054 self.document_target_domain = host_base.to_string();
1055 self.document_target_url = page_target_url;
1056 }
1057
1058 pub fn clear_target_domain(&mut self) {
1060 self.document_reload_tracker = 0;
1061 self.document_target_url = Default::default();
1062 self.document_target_domain = Default::default();
1063 }
1064
1065 #[inline]
1073 fn handle_document_replacement_and_tracking<'a>(
1074 &mut self,
1075 event: &'a EventRequestPaused,
1076 document_resource: bool,
1077 ) -> (Cow<'a, str>, bool) {
1078 let mut replacer: Option<String> = None;
1079 let current_url = event.request.url.as_str();
1080
1081 if document_resource {
1082 if self.document_target_url == current_url {
1083 self.document_reload_tracker += 1;
1084 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1085 {
1086 let (http_document_replacement, mut https_document_replacement) =
1087 if self.document_target_url.starts_with("http://") {
1088 (
1089 self.document_target_url.replacen("http://", "http//", 1),
1090 self.document_target_url.replacen("http://", "https://", 1),
1091 )
1092 } else {
1093 (
1094 self.document_target_url.replacen("https://", "https//", 1),
1095 self.document_target_url.replacen("https://", "http://", 1),
1096 )
1097 };
1098
1099 let trailing = https_document_replacement.ends_with('/');
1101 if trailing {
1102 https_document_replacement.pop();
1103 }
1104 if https_document_replacement.ends_with('/') {
1105 https_document_replacement.pop();
1106 }
1107
1108 let redirect_mask = format!(
1109 "{}{}",
1110 https_document_replacement, http_document_replacement
1111 );
1112
1113 if current_url == redirect_mask {
1114 replacer = Some(if trailing {
1115 format!("{}/", https_document_replacement)
1116 } else {
1117 https_document_replacement
1118 });
1119 }
1120 }
1121
1122 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1123 self.xml_document = true;
1124 }
1125
1126 self.document_target_url = event.request.url.clone();
1128 self.document_target_domain = host_and_rest(&self.document_target_url)
1129 .map(|(h, _)| base_domain_from_host(h).to_string())
1130 .unwrap_or_default();
1131 }
1132
1133 let current_url_cow = match replacer {
1134 Some(r) => Cow::Owned(r),
1135 None => Cow::Borrowed(event.request.url.as_str()),
1136 };
1137
1138 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1139 (current_url_cow, had_replacer)
1140 }
1141
/// Checks the paused request against the adblock engine.
///
/// Only image, media, stylesheet, document, fetch and XHR requests are
/// checked. The engine is built once from the bundled filter patterns;
/// requests are checked with a fixed placeholder hostname
/// (`example.com`) for both request and source.
#[cfg(feature = "adblock")]
pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
    use adblock::{
        lists::{FilterSet, ParseOptions, RuleTypes},
        Engine,
    };

    lazy_static::lazy_static! {
        static ref AD_ENGINE: Engine = {
            let mut filter_set = FilterSet::new(false);
            let mut rules = ParseOptions::default();
            rules.rule_types = RuleTypes::All;

            filter_set.add_filters(
                &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
                rules,
            );

            Engine::from_filter_set(filter_set, true)
        };
    };

    let blockable = matches!(
        event.resource_type,
        ResourceType::Image
            | ResourceType::Media
            | ResourceType::Stylesheet
            | ResourceType::Document
            | ResourceType::Fetch
            | ResourceType::Xhr
    );

    if !blockable {
        return false;
    }

    let request_type = event.resource_type.as_ref().to_lowercase();
    let third_party = !event.request.is_same_site.unwrap_or_default();

    let request = adblock::request::Request::preparsed(
        event.request.url.as_str(),
        "example.com",
        "example.com",
        &request_type,
        third_party,
    );

    AD_ENGINE.check_network_request(&request).matched
}
1189
1190 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1191 let response = if self
1192 .attempted_authentications
1193 .contains(event.request_id.as_ref())
1194 {
1195 AuthChallengeResponseResponse::CancelAuth
1196 } else if self.credentials.is_some() {
1197 self.attempted_authentications
1198 .insert(event.request_id.clone().into());
1199 AuthChallengeResponseResponse::ProvideCredentials
1200 } else {
1201 AuthChallengeResponseResponse::Default
1202 };
1203
1204 let mut auth = AuthChallengeResponse::new(response);
1205 if let Some(creds) = self.credentials.clone() {
1206 auth.username = Some(creds.username);
1207 auth.password = Some(creds.password);
1208 }
1209 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1210 }
1211
1212 pub fn set_offline_mode(&mut self, value: bool) {
1214 if self.offline == value {
1215 return;
1216 }
1217 self.offline = value;
1218 if let Ok(network) = EmulateNetworkConditionsParams::builder()
1219 .offline(self.offline)
1220 .latency(0)
1221 .download_throughput(-1.)
1222 .upload_throughput(-1.)
1223 .build()
1224 {
1225 self.push_cdp_request(network);
1226 }
1227 }
1228
1229 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1231 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1232 if let Some(interception_id) = self
1233 .request_id_to_interception_id
1234 .remove(event.request_id.as_ref())
1235 {
1236 self.on_request(event, Some(interception_id));
1237 } else {
1238 self.requests_will_be_sent
1240 .insert(event.request_id.clone(), event.clone());
1241 }
1242 } else {
1243 self.on_request(event, None);
1244 }
1245 }
1246
1247 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1249 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1250 request.from_memory_cache = true;
1251 }
1252 }
1253
1254 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1256 let mut request_failed = false;
1257
1258 let mut deducted: u64 = 0;
1260
1261 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1262 let before = *max_bytes;
1263
1264 let received_bytes: u64 = event.response.encoded_data_length as u64;
1266
1267 let content_length: Option<u64> = event
1269 .response
1270 .headers
1271 .inner()
1272 .get("content-length")
1273 .and_then(|v| v.as_str())
1274 .and_then(|s| s.trim().parse::<u64>().ok());
1275
1276 *max_bytes = max_bytes.saturating_sub(received_bytes);
1278
1279 if let Some(cl) = content_length {
1281 if cl > *max_bytes {
1282 *max_bytes = 0;
1283 }
1284 }
1285
1286 request_failed = *max_bytes == 0;
1287
1288 deducted = before.saturating_sub(*max_bytes);
1290 }
1291
1292 if deducted > 0 {
1294 self.queued_events
1295 .push_back(NetworkEvent::BytesConsumed(deducted));
1296 }
1297
1298 if request_failed && self.max_bytes_allowed.is_some() {
1300 self.set_block_all(true);
1301 }
1302
1303 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1304 request.set_response(event.response.clone());
1305 self.queued_events.push_back(if request_failed {
1306 NetworkEvent::RequestFailed(request)
1307 } else {
1308 NetworkEvent::RequestFinished(request)
1309 });
1310 }
1311 }
1312
1313 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1315 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1316 if let Some(interception_id) = request.interception_id.as_ref() {
1317 self.attempted_authentications
1318 .remove(interception_id.as_ref());
1319 }
1320 self.queued_events
1321 .push_back(NetworkEvent::RequestFinished(request));
1322 }
1323 }
1324
1325 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1327 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1328 request.failure_text = Some(event.error_text.clone());
1329 if let Some(interception_id) = request.interception_id.as_ref() {
1330 self.attempted_authentications
1331 .remove(interception_id.as_ref());
1332 }
1333 self.queued_events
1334 .push_back(NetworkEvent::RequestFailed(request));
1335 }
1336 }
1337
1338 fn on_request(
1340 &mut self,
1341 event: &EventRequestWillBeSent,
1342 interception_id: Option<InterceptionId>,
1343 ) {
1344 let mut redirect_chain = Vec::new();
1345 let mut redirect_location = None;
1346
1347 if let Some(redirect_resp) = &event.redirect_response {
1348 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1349 if is_redirect_status(redirect_resp.status) {
1350 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1351 if redirect_resp.url != location {
1352 let fixed_location = location.replace(&redirect_resp.url, "");
1353
1354 if !fixed_location.is_empty() {
1355 request.response.as_mut().map(|resp| {
1356 resp.headers.0["Location"] =
1357 serde_json::Value::String(fixed_location.clone());
1358 });
1359 }
1360
1361 redirect_location = Some(fixed_location);
1362 }
1363 }
1364 }
1365
1366 self.handle_request_redirect(
1367 &mut request,
1368 if let Some(redirect_location) = redirect_location {
1369 let mut redirect_resp = redirect_resp.clone();
1370
1371 if !redirect_location.is_empty() {
1372 redirect_resp.headers.0["Location"] =
1373 serde_json::Value::String(redirect_location);
1374 }
1375
1376 redirect_resp
1377 } else {
1378 redirect_resp.clone()
1379 },
1380 );
1381
1382 redirect_chain = std::mem::take(&mut request.redirect_chain);
1383 redirect_chain.push(request);
1384 }
1385 }
1386
1387 let request = HttpRequest::new(
1388 event.request_id.clone(),
1389 event.frame_id.clone(),
1390 interception_id,
1391 self.user_request_interception_enabled,
1392 redirect_chain,
1393 );
1394
1395 self.requests.insert(event.request_id.clone(), request);
1396 self.queued_events
1397 .push_back(NetworkEvent::Request(event.request_id.clone()));
1398 }
1399
1400 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1402 request.set_response(response);
1403 if let Some(interception_id) = request.interception_id.as_ref() {
1404 self.attempted_authentications
1405 .remove(interception_id.as_ref());
1406 }
1407 }
1408}
1409
/// Events emitted by the network manager; the handlers in this file push them
/// onto the manager's `queued_events` queue.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP method id paired with a JSON value.
    /// NOTE(review): presumably the serialized parameters of a command to be
    /// sent to the browser; confirm against `push_cdp_request`.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A new request with this id has started being tracked.
    Request(RequestId),
    /// Carries a request id.
    /// NOTE(review): presumably signals that a response arrived for the
    /// request; the producer is not in this part of the file, so confirm.
    Response(RequestId),
    /// The request ended unsuccessfully: loading failed, or the byte budget
    /// was exhausted when its response arrived.
    RequestFailed(HttpRequest),
    /// The request completed (response received or loading finished).
    RequestFinished(HttpRequest),
    /// Number of bytes by which the remaining byte budget was reduced while
    /// handling a response.
    BytesConsumed(u64),
}
1425
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// Shared assertions about the third-party allow-list matcher: a
    /// Cloudflare challenge script, Stripe, reCAPTCHA and jQuery URLs must
    /// match, while the Cloudflare Insights beacon must not.
    fn assert_third_party_allow_list() {
        let challenge_url = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(challenge_url),
            "expected Cloudflare challenge script to be allowed"
        );

        let insights_url = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(insights_url),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// The allow-list matcher accepts known-good third-party scripts and
    /// rejects the Cloudflare Insights beacon.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        assert_third_party_allow_list();
    }

    /// A script URL that matches no blocklist entry is not blocked.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut manager = NetworkManager::new(false, Duration::from_secs(30));
        manager.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        let harmless_script = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !manager.should_block_script_blocklist_only(harmless_script),
            "expected non-blocklisted script to be allowed"
        );
    }

    /// A script URL that matches the blocklist is blocked.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut manager = NetworkManager::new(false, Duration::from_secs(30));
        manager.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        let tracking_script = "https://cdn.example.net/js/analytics.js";
        assert!(
            manager.should_block_script_blocklist_only(tracking_script),
            "expected analytics.js to be blocklisted"
        );
    }

    /// Runs the same allow-list assertions as `test_allowed_matcher_3rd_party`.
    #[test]
    fn test_allowed_matcher_3rd_party_sanity() {
        assert_third_party_allow_list();
    }

    /// URLs containing a configured blacklist pattern are reported as
    /// blacklisted; unrelated URLs are not.
    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut manager = NetworkManager::new(false, Duration::from_secs(30));
        manager.set_page_url("https://example.com/".to_string());

        manager.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(manager.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(manager.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!manager.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    /// With strict mode on, a URL present in both lists matches both and the
    /// strict flag is set. Only the list membership and the flag are checked
    /// here, not the final blocking decision.
    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut manager = NetworkManager::new(false, Duration::from_secs(30));
        manager.set_page_url("https://example.com/".to_string());

        manager.set_blacklist_patterns(["beacon.min.js"]);
        manager.set_whitelist_patterns(["beacon.min.js"]);

        manager.set_blacklist_strict(true);

        let beacon_url = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(manager.is_whitelisted(beacon_url));
        assert!(manager.is_blacklisted(beacon_url));

        assert!(manager.blacklist_strict);
    }

    /// With strict mode off, a URL present in both lists still matches both
    /// and the strict flag is cleared. Only the list membership and the flag
    /// are checked here, not the final blocking decision.
    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut manager = NetworkManager::new(false, Duration::from_secs(30));
        manager.set_page_url("https://example.com/".to_string());

        manager.set_blacklist_patterns(["beacon.min.js"]);
        manager.set_whitelist_patterns(["beacon.min.js"]);

        manager.set_blacklist_strict(false);

        let beacon_url = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(manager.is_blacklisted(beacon_url));
        assert!(manager.is_whitelisted(beacon_url));
        assert!(!manager.blacklist_strict);
    }
}