1#[cfg(any(feature = "adblock", feature = "firewall"))]
2use super::blockers::block_websites::block_ads;
3use super::blockers::{
4 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
5 xhr::IGNORE_XHR_ASSETS,
6};
7use crate::auth::Credentials;
8#[cfg(feature = "_cache")]
9use crate::cache::BasicCachePolicy;
10use crate::cmd::CommandChain;
11use crate::handler::http::HttpRequest;
12use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
13use aho_corasick::AhoCorasick;
14use case_insensitive_string::CaseInsensitiveString;
15use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
16use chromiumoxide_cdp::cdp::browser_protocol::network::{
17 EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
18 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
19 InterceptionId, NetworkConditions, RequestId, ResourceType, Response, SetCacheDisabledParams,
20 SetExtraHttpHeadersParams,
21};
22use chromiumoxide_cdp::cdp::browser_protocol::{
23 fetch::{
24 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
25 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
26 },
27 network::SetBypassServiceWorkerParams,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::{
30 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
31};
32use chromiumoxide_types::{Command, Method, MethodId};
33use hashbrown::{HashMap, HashSet};
34use lazy_static::lazy_static;
35use reqwest::header::PROXY_AUTHORIZATION;
36use spider_network_blocker::intercept_manager::NetworkInterceptManager;
37pub use spider_network_blocker::scripts::{
38 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
39};
40use std::borrow::Cow;
41use std::collections::VecDeque;
42use std::time::Duration;
43
lazy_static! {
    /// Substring patterns that are compiled into `ALLOWED_MATCHER`.
    /// Mixes bare library/bundle names, path fragments, and absolute URL prefixes.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery", "angular",
        "react", "vue", "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app", "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/", "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/",
        "https://google.com/recaptcha/api.js",
        "https://www.gstatic.com/recaptcha/",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://api.leminnow.com/captcha/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://client-api.arkoselabs.com/",
        "https://www.capy.me/puzzle/",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
    ];

    /// Aho-Corasick matcher over every entry of `JS_FRAMEWORK_ALLOW`.
    /// Building the automaton is expected to succeed; a failure panics on first access.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// Substring patterns that are compiled into `ALLOWED_MATCHER_3RD_PARTY`.
    /// NOTE(review): "https://captcha.px-cloud.net/" is listed twice; harmless for `is_match`.
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.capy.me/puzzle/",
        "https://www.gstatic.com/recaptcha/",
        "https://google.com/recaptcha/",
        "https://www.google.com/recaptcha/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://newassets.hcaptcha.com/",
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://api.leminnow.com/captcha/",
        "https://client-api.arkoselabs.com/",
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "discourse-cdn.com/",
        "hcaptcha.com",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];

    /// Aho-Corasick matcher over `JS_FRAMEWORK_ALLOW_3RD_PARTY`.
    /// `NetworkManager::on_fetch_request_paused` uses it to re-allow a script request
    /// that an earlier rule had marked as skipped.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Static set of framework path fragments.
    /// NOTE(review): not referenced in this part of the file; usage lives elsewhere.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            "_astro/", "_app/immutable"
        }
    };

    /// Static set of MIME types (archives, images, video, audio, office documents).
    /// NOTE(review): not referenced in this part of the file; presumably content to ignore — confirm at the call site.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names that are skipped when `NetworkManager::ignore_visuals` is set
    /// (see `should_skip_for_visuals_and_basic`).
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names that `on_fetch_request_paused` always marks as skipped.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Ping",
    };

    /// Case-insensitive "css" token used by `skip_xhr` when stylesheets are blocked.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// Default init command chain: the serialized network `EnableParams`.
    /// Empty when serialization fails.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// Init command chain used when `ignore_httpserrors` is set: network `EnableParams`
    /// followed by `SetIgnoreCertificateErrorsParams::new(true)`. Commands that fail to
    /// serialize are left out.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// Fetch-domain enable command: handles auth requests and pauses every URL (`*`)
    /// at the request stage. Queued by `update_protocol_request_interception`.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
            .handle_auth_requests(true)
            .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
            .build()
    };
}
229
/// Returns `true` when `status` is one of the redirect codes 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
234
/// Tracks network state for a page: queued CDP commands/events, in-flight requests,
/// interception/auth state, and the blocking rules applied to paused requests.
#[derive(Debug)]
pub struct NetworkManager {
    /// Events waiting to be handed out by `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// When set, `init_commands` uses the chain that also ignores certificate errors.
    ignore_httpserrors: bool,
    /// In-flight requests keyed by network request id; removed when a response or
    /// loading-finished event arrives.
    requests: HashMap<RequestId, HttpRequest>,
    /// `requestWillBeSent` events parked until the matching paused-request event arrives.
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers stored by `set_extra_headers` (proxy authorization is stripped).
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception ids that arrived before their `requestWillBeSent` event,
    /// keyed by network request id.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Cache-disabled flag forwarded to the browser by `update_protocol_cache_disabled`.
    user_cache_disabled: bool,
    /// Requests for which credentials were already provided once; a second challenge is cancelled.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials supplied through `authenticate`, if any.
    credentials: Option<Credentials>,
    /// Interception flag set by `set_request_interception`.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed as blocked by the client.
    block_all: bool,
    /// Whether interception is considered active at the protocol level.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Last value applied through `set_offline_mode`.
    offline: bool,
    /// Timeout handed to the `CommandChain` built by `init_commands`.
    pub request_timeout: Duration,
    /// Skip resource types listed in `IGNORE_VISUAL_RESOURCE_MAP` and related visual assets.
    pub ignore_visuals: bool,
    /// Skip stylesheet requests.
    pub block_stylesheets: bool,
    /// Enables the embedded-script check in `on_fetch_request_paused`
    /// (only together with `only_html` or `ignore_visuals`).
    pub block_javascript: bool,
    /// Apply the analytics block lists to scripts and XHR requests.
    pub block_analytics: bool,
    /// Prefetch handling flag consulted in `on_fetch_request_paused`.
    pub block_prefetch: bool,
    /// One of the two flags that, with `block_javascript`, enables the embedded-script check.
    pub only_html: bool,
    /// Set when the first tracked document URL ends in `.xml`; lets `.xsl` requests
    /// bypass the visual/stylesheet skip.
    pub xml_document: bool,
    /// Site-specific interception rules consulted for script, data and document requests.
    pub intercept_manager: NetworkInterceptManager,
    /// Counts document requests whose URL equals the current target URL; once it reaches 3
    /// further paused requests are skipped.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request (or the value given to `set_page_url`).
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`.
    pub document_target_domain: String,
    /// Remaining byte budget, decremented in `on_response_received`; `None` disables tracking.
    pub max_bytes_allowed: Option<u64>,
    /// Key used to look up session cache items for paused requests.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// Policy deciding whether a cached item may be served.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Raw whitelist substrings; source for `whitelist_matcher`.
    whitelist_patterns: Vec<String>,
    /// Matcher built from `whitelist_patterns`; `None` when the list is empty or the build failed.
    whitelist_matcher: Option<AhoCorasick>,
    /// Raw blacklist substrings; source for `blacklist_matcher`.
    blacklist_patterns: Vec<String>,
    /// Matcher built from `blacklist_patterns`; `None` when the list is empty or the build failed.
    blacklist_matcher: Option<AhoCorasick>,
    /// When true the blacklist is applied last and overrides every allow rule; when false
    /// it is applied early and later allow rules may re-enable the request.
    blacklist_strict: bool,
}
359
360impl NetworkManager {
361 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
363 Self {
364 queued_events: Default::default(),
365 ignore_httpserrors,
366 requests: Default::default(),
367 requests_will_be_sent: Default::default(),
368 extra_headers: Default::default(),
369 request_id_to_interception_id: Default::default(),
370 user_cache_disabled: false,
371 attempted_authentications: Default::default(),
372 credentials: None,
373 block_all: false,
374 user_request_interception_enabled: false,
375 protocol_request_interception_enabled: false,
376 offline: false,
377 request_timeout,
378 ignore_visuals: false,
379 block_javascript: false,
380 block_stylesheets: false,
381 block_prefetch: true,
382 block_analytics: true,
383 only_html: false,
384 xml_document: false,
385 intercept_manager: NetworkInterceptManager::Unknown,
386 document_reload_tracker: 0,
387 document_target_url: String::new(),
388 document_target_domain: String::new(),
389 whitelist_patterns: Vec::new(),
390 whitelist_matcher: None,
391 blacklist_patterns: Vec::new(),
392 blacklist_matcher: None,
393 blacklist_strict: true,
394 max_bytes_allowed: None,
395 #[cfg(feature = "_cache")]
396 cache_site_key: None,
397 #[cfg(feature = "_cache")]
398 cache_policy: None,
399 }
400 }
401
402 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
404 where
405 I: IntoIterator<Item = S>,
406 S: Into<String>,
407 {
408 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
409 self.rebuild_whitelist_matcher();
410 }
411
412 pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
414 where
415 I: IntoIterator<Item = S>,
416 S: Into<String>,
417 {
418 self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
419 self.rebuild_blacklist_matcher();
420 }
421
422 pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
424 self.blacklist_patterns.push(pattern.into());
425 self.rebuild_blacklist_matcher();
426 }
427
428 pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
430 where
431 I: IntoIterator<Item = S>,
432 S: Into<String>,
433 {
434 self.blacklist_patterns
435 .extend(patterns.into_iter().map(Into::into));
436 self.rebuild_blacklist_matcher();
437 }
438
439 pub fn clear_blacklist(&mut self) {
441 self.blacklist_patterns.clear();
442 self.blacklist_matcher = None;
443 }
444
/// Sets whether the blacklist is strict. In strict mode `on_fetch_request_paused`
/// applies the blacklist after all allow rules; otherwise it is applied before them.
pub fn set_blacklist_strict(&mut self, strict: bool) {
    self.blacklist_strict = strict;
}
449
450 #[inline]
451 fn rebuild_blacklist_matcher(&mut self) {
452 if self.blacklist_patterns.is_empty() {
453 self.blacklist_matcher = None;
454 return;
455 }
456
457 self.blacklist_matcher =
458 AhoCorasick::new(self.blacklist_patterns.iter().map(|s| s.as_str())).ok();
459 }
460
461 #[inline]
462 fn is_blacklisted(&self, url: &str) -> bool {
463 self.blacklist_matcher
464 .as_ref()
465 .map(|m| m.is_match(url))
466 .unwrap_or(false)
467 }
468
469 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
471 self.whitelist_patterns.push(pattern.into());
472 self.rebuild_whitelist_matcher();
473 }
474
475 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
477 where
478 I: IntoIterator<Item = S>,
479 S: Into<String>,
480 {
481 self.whitelist_patterns
482 .extend(patterns.into_iter().map(Into::into));
483 self.rebuild_whitelist_matcher();
484 }
485
486 #[inline]
487 fn rebuild_whitelist_matcher(&mut self) {
488 if self.whitelist_patterns.is_empty() {
489 self.whitelist_matcher = None;
490 return;
491 }
492
493 self.whitelist_matcher =
495 AhoCorasick::new(self.whitelist_patterns.iter().map(|s| s.as_str())).ok();
496 }
497
498 #[inline]
499 fn is_whitelisted(&self, url: &str) -> bool {
500 self.whitelist_matcher
501 .as_ref()
502 .map(|m| m.is_match(url))
503 .unwrap_or(false)
504 }
505
506 pub fn init_commands(&self) -> CommandChain {
508 let cmds = if self.ignore_httpserrors {
509 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
510 } else {
511 INIT_CHAIN.clone()
512 };
513 CommandChain::new(cmds, self.request_timeout)
514 }
515
516 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
518 let method = cmd.identifier();
519 if let Ok(params) = serde_json::to_value(cmd) {
520 self.queued_events
521 .push_back(NetworkEvent::SendCdpRequest((method, params)));
522 }
523 }
524
/// Removes and returns the oldest queued [`NetworkEvent`], or `None` when the queue is empty.
pub fn poll(&mut self) -> Option<NetworkEvent> {
    self.queued_events.pop_front()
}
529
/// Returns the extra HTTP headers currently stored on the manager.
pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
    &self.extra_headers
}
534
535 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
537 self.extra_headers = headers;
538 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
539 self.extra_headers.remove("Proxy-Authorization");
540 if !self.extra_headers.is_empty() {
541 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
542 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
543 }
544 }
545 }
546
/// Queues a `SetBypassServiceWorkerParams` command carrying `bypass`.
///
/// NOTE(review): the method name says "enabled" while the argument is forwarded as the
/// bypass flag — confirm the intended polarity with callers.
pub fn set_service_worker_enabled(&mut self, bypass: bool) {
    self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
}
550
/// Sets the `block_all` flag; while set, `on_fetch_request_paused` fails every paused request.
pub fn set_block_all(&mut self, block_all: bool) {
    self.block_all = block_all;
}
554
/// Records the user's interception preference, then re-evaluates whether the Fetch
/// domain must be enabled or disabled.
pub fn set_request_interception(&mut self, enabled: bool) {
    self.user_request_interception_enabled = enabled;
    self.update_protocol_request_interception();
}
559
560 pub fn set_cache_enabled(&mut self, enabled: bool) {
561 let run = self.user_cache_disabled == enabled;
562 self.user_cache_disabled = !enabled;
563 if run {
564 self.update_protocol_cache_disabled();
565 }
566 }
567
/// Marks protocol-level request interception as enabled. Only the local flag changes;
/// no CDP command is queued.
pub fn enable_request_intercept(&mut self) {
    self.protocol_request_interception_enabled = true;
}
572
/// Marks protocol-level request interception as disabled. Only the local flag changes;
/// no CDP command is queued.
pub fn disable_request_intercept(&mut self) {
    self.protocol_request_interception_enabled = false;
}
577
/// Sets (or clears with `None`) the site key used for session cache lookups.
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    self.cache_site_key = cache_site_key;
}
583
/// Sets (or clears with `None`) the policy that decides whether a cached item may be served.
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    self.cache_policy = cache_policy;
}
589
590 pub fn update_protocol_cache_disabled(&mut self) {
591 self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
592 }
593
/// Stores `credentials` for answering auth challenges and turns on interception.
pub fn authenticate(&mut self, credentials: Credentials) {
    self.credentials = Some(credentials);
    // Must run before the flag below is set: the update is a no-op once the flag
    // already matches the desired state.
    self.update_protocol_request_interception();
    self.protocol_request_interception_enabled = true;
}
599
600 fn update_protocol_request_interception(&mut self) {
601 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
602
603 if enabled == self.protocol_request_interception_enabled {
604 return;
605 }
606
607 if enabled {
608 self.push_cdp_request(ENABLE_FETCH.clone())
609 } else {
610 self.push_cdp_request(DisableParams::default())
611 }
612 }
613
614 #[inline]
617 fn should_block_script_blocklist_only(&self, url: &str) -> bool {
618 let block_analytics = self.block_analytics;
620
621 if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
623 {
624 return true;
625 }
626
627 if crate::handler::blockers::block_websites::block_website(url) {
629 return true;
630 }
631
632 if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
639 let p_slash = Self::strip_query_fragment(path_with_slash);
641 let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
642
643 let base = match p_slash.rsplit('/').next() {
645 Some(b) => b,
646 None => p_slash,
647 };
648
649 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
652 return true;
653 }
654 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
655 return true;
656 }
657 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
658 return true;
659 }
660
661 if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
664 return true;
665 }
666
667 if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
669 return true;
670 }
671 }
672
673 false
674 }
675
/// Returns the part of `url` that starts at the first `/` following the first `//`.
///
/// The result keeps its leading slash and any query or fragment. Returns `None` when
/// `url` contains no `//` or when no `/` follows it.
#[inline]
fn url_path_with_leading_slash(url: &str) -> Option<&str> {
    let (_, after_authority_marker) = url.split_once("//")?;
    let path_start = after_authority_marker.find('/')?;
    Some(&after_authority_marker[path_start..])
}
696
/// Returns `s` cut at the first `?` or `#`, whichever comes first.
///
/// The input is returned unchanged when it contains neither character.
#[inline]
fn strip_query_fragment(s: &str) -> &str {
    match s.find(|c: char| c == '?' || c == '#') {
        Some(cut) => &s[..cut],
        None => s,
    }
}
713
714 #[inline]
716 fn skip_xhr(
717 &self,
718 skip_networking: bool,
719 event: &EventRequestPaused,
720 network_event: bool,
721 ) -> bool {
722 if !skip_networking && network_event {
724 let request_url = event.request.url.as_str();
725
726 let skip_analytics =
728 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
729
730 if skip_analytics {
731 true
732 } else if self.block_stylesheets || self.ignore_visuals {
733 let block_css = self.block_stylesheets;
734 let block_media = self.ignore_visuals;
735
736 let mut block_request = false;
737
738 if let Some(position) = request_url.rfind('.') {
739 let hlen = request_url.len();
740 let has_asset = hlen - position;
741
742 if has_asset >= 3 {
743 let next_position = position + 1;
744
745 if block_media
746 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
747 &request_url[next_position..].into(),
748 )
749 {
750 block_request = true;
751 } else if block_css {
752 block_request = CaseInsensitiveString::from(
753 &request_url.as_bytes()[next_position..],
754 )
755 .contains(&**CSS_EXTENSION)
756 }
757 }
758 }
759
760 if !block_request {
761 block_request = ignore_script_xhr_media(request_url);
762 }
763
764 block_request
765 } else {
766 skip_networking
767 }
768 } else {
769 skip_networking
770 }
771 }
772
/// Adblock build: keeps an existing skip decision, otherwise skips when the URL is on
/// the ad block list or the adblock engine matches the request.
#[cfg(feature = "adblock")]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    skip_networking || block_ads(&event.request.url) || self.detect_ad(event)
}
783
784 #[cfg(not(feature = "adblock"))]
786 #[inline]
787 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
788 use crate::handler::blockers::block_websites::block_ads;
789 if skip_networking {
790 true
791 } else {
792 block_ads(&event.request.url)
793 }
794 }
795
796 #[inline]
797 fn fail_request_blocked(
799 &mut self,
800 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
801 ) {
802 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
803 request_id.clone(),
804 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
805 );
806 self.push_cdp_request(params);
807 }
808
809 #[inline]
810 fn fulfill_request_empty_200(
812 &mut self,
813 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
814 ) {
815 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
816 request_id.clone(),
817 200,
818 );
819 self.push_cdp_request(params);
820 }
821
/// Queues a `Fetch.fulfillRequest` that answers `request_id` from cached data.
///
/// `body` is sent base64-encoded (standard alphabet), `headers` become the response
/// header entries, and `status` is used as the response code.
#[cfg(feature = "_cache")]
#[inline]
fn fulfill_request_from_cache(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    body: &[u8],
    headers: &std::collections::HashMap<String, String>,
    status: i64,
) {
    use crate::cdp::browser_protocol::fetch::HeaderEntry;
    use crate::handler::network::fetch::FulfillRequestParams;
    use base64::Engine;

    let header_entries: Vec<HeaderEntry> = headers
        .iter()
        .map(|(name, value)| HeaderEntry {
            name: name.clone().into(),
            value: value.clone().into(),
        })
        .collect();

    let encoded_body = base64::engine::general_purpose::STANDARD.encode(body);

    let mut params = FulfillRequestParams::new(request_id.clone(), status);
    params.body = Some(encoded_body.into());
    params.response_headers = Some(header_entries);

    self.push_cdp_request(params);
}
860
861 #[inline]
862 fn continue_request_with_url(
864 &mut self,
865 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
866 url: Option<&str>,
867 intercept_response: bool,
868 ) {
869 let mut params = ContinueRequestParams::new(request_id.clone());
870 if let Some(url) = url {
871 params.url = Some(url.to_string());
872 params.intercept_response = Some(intercept_response);
873 }
874 self.push_cdp_request(params);
875 }
876
877 #[inline]
879 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
880 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
881 return;
882 }
883
884 if self.block_all {
885 tracing::debug!(
886 "Blocked (block_all): {:?} - {}",
887 event.resource_type,
888 event.request.url
889 );
890 return self.fail_request_blocked(&event.request_id);
891 }
892
893 if let Some(network_id) = event.network_id.as_ref() {
894 if let Some(request_will_be_sent) =
895 self.requests_will_be_sent.remove(network_id.as_ref())
896 {
897 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
898 } else {
899 self.request_id_to_interception_id
900 .insert(network_id.clone(), event.request_id.clone().into());
901 }
902 }
903
904 let javascript_resource = event.resource_type == ResourceType::Script;
906 let document_resource = event.resource_type == ResourceType::Document;
907 let network_resource =
908 !document_resource && crate::utils::is_data_resource(&event.resource_type);
909
910 let mut skip_networking =
912 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
913
914 if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
915 skip_networking = true;
916 }
917
918 if !skip_networking {
920 skip_networking = self.document_reload_tracker >= 3;
921 }
922
923 let (current_url_cow, had_replacer) =
925 self.handle_document_replacement_and_tracking(event, document_resource);
926
927 let current_url: &str = current_url_cow.as_ref();
928
929 let blacklisted = self.is_blacklisted(current_url);
930
931 if !self.blacklist_strict && blacklisted {
932 skip_networking = true;
933 }
934
935 if !skip_networking {
936 if self.xml_document && current_url.ends_with(".xsl") {
938 skip_networking = false;
939 } else {
940 skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
941 }
942 }
943
944 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
945
946 if !skip_networking
948 && self.block_javascript
949 && (self.only_html || self.ignore_visuals)
950 && (javascript_resource || document_resource)
951 {
952 skip_networking = ignore_script_embedded(current_url);
953 }
954
955 if !skip_networking && javascript_resource {
958 skip_networking = self.should_block_script_blocklist_only(current_url);
959 }
960
961 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
963
964 if !skip_networking && (javascript_resource || network_resource || document_resource) {
966 skip_networking = self.intercept_manager.intercept_detection(
967 current_url,
968 self.ignore_visuals,
969 network_resource,
970 );
971 }
972
973 if !skip_networking && (javascript_resource || network_resource) {
975 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
976 }
977
978 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
981 {
982 skip_networking = false;
983 }
984
985 if skip_networking && self.is_whitelisted(current_url) {
987 skip_networking = false;
988 }
989
990 if self.blacklist_strict && blacklisted {
991 skip_networking = true;
992 }
993
994 if skip_networking {
995 tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
996 self.fulfill_request_empty_200(&event.request_id);
997 } else {
998 #[cfg(feature = "_cache")]
999 {
1000 if let (Some(policy), Some(cache_site_key)) =
1001 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
1002 {
1003 let current_url = format!("{}:{}", event.request.method, ¤t_url);
1004
1005 if let Some((res, cache_policy)) =
1006 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
1007 {
1008 if policy.allows_cached(&cache_policy) {
1009 tracing::debug!(
1010 "Remote Cached: {:?} - {}",
1011 &event.resource_type,
1012 ¤t_url
1013 );
1014 let flat_headers = crate::http::headers_from_multi(&res.headers);
1015 return self.fulfill_request_from_cache(
1016 &event.request_id,
1017 &res.body,
1018 &flat_headers,
1019 res.status as i64,
1020 );
1021 }
1022 }
1023 }
1024 }
1025
1026 tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
1028 self.continue_request_with_url(
1029 &event.request_id,
1030 if had_replacer {
1031 Some(current_url)
1032 } else {
1033 None
1034 },
1035 !had_replacer,
1036 );
1037 }
1038 }
1039
1040 #[inline]
1046 fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1047 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1048 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1049 }
1050
/// Returns `true` once a document target URL has been recorded (it is non-empty).
pub fn has_target_domain(&self) -> bool {
    !self.document_target_url.is_empty()
}
1055
1056 pub fn set_page_url(&mut self, page_target_url: String) {
1058 let host_base = host_and_rest(&page_target_url)
1059 .map(|(h, _)| base_domain_from_host(h))
1060 .unwrap_or("");
1061
1062 self.document_target_domain = host_base.to_string();
1063 self.document_target_url = page_target_url;
1064 }
1065
1066 pub fn clear_target_domain(&mut self) {
1068 self.document_reload_tracker = 0;
1069 self.document_target_url = Default::default();
1070 self.document_target_domain = Default::default();
1071 }
1072
1073 #[inline]
1081 fn handle_document_replacement_and_tracking<'a>(
1082 &mut self,
1083 event: &'a EventRequestPaused,
1084 document_resource: bool,
1085 ) -> (Cow<'a, str>, bool) {
1086 let mut replacer: Option<String> = None;
1087 let current_url = event.request.url.as_str();
1088
1089 if document_resource {
1090 if self.document_target_url == current_url {
1091 self.document_reload_tracker += 1;
1092 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1093 {
1094 let (http_document_replacement, mut https_document_replacement) =
1095 if self.document_target_url.starts_with("http://") {
1096 (
1097 self.document_target_url.replacen("http://", "http//", 1),
1098 self.document_target_url.replacen("http://", "https://", 1),
1099 )
1100 } else {
1101 (
1102 self.document_target_url.replacen("https://", "https//", 1),
1103 self.document_target_url.replacen("https://", "http://", 1),
1104 )
1105 };
1106
1107 let trailing = https_document_replacement.ends_with('/');
1109 if trailing {
1110 https_document_replacement.pop();
1111 }
1112 if https_document_replacement.ends_with('/') {
1113 https_document_replacement.pop();
1114 }
1115
1116 let redirect_mask = format!(
1117 "{}{}",
1118 https_document_replacement, http_document_replacement
1119 );
1120
1121 if current_url == redirect_mask {
1122 replacer = Some(if trailing {
1123 format!("{}/", https_document_replacement)
1124 } else {
1125 https_document_replacement
1126 });
1127 }
1128 }
1129
1130 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1131 self.xml_document = true;
1132 }
1133
1134 self.document_target_url = event.request.url.clone();
1136 self.document_target_domain = host_and_rest(&self.document_target_url)
1137 .map(|(h, _)| base_domain_from_host(h).to_string())
1138 .unwrap_or_default();
1139 }
1140
1141 let current_url_cow = match replacer {
1142 Some(r) => Cow::Owned(r),
1143 None => Cow::Borrowed(event.request.url.as_str()),
1144 };
1145
1146 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1147 (current_url_cow, had_replacer)
1148 }
1149
/// Checks a paused request against the built-in adblock engine.
///
/// Only image, media, stylesheet, document, fetch and XHR requests are checked; every
/// other resource type returns `false` without consulting the engine. The engine is
/// built once, on first use, from `spider_network_blocker::adblock::ADBLOCK_PATTERNS`.
/// The request is evaluated with `example.com` as both hostname and source hostname,
/// and is treated as third-party unless the event marks it as same-site.
#[cfg(feature = "adblock")]
pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
    use adblock::{
        lists::{FilterSet, ParseOptions, RuleTypes},
        Engine,
    };

    lazy_static::lazy_static! {
        static ref AD_ENGINE: Engine = {
            let mut filter_set = FilterSet::new(false);
            let mut rules = ParseOptions::default();
            rules.rule_types = RuleTypes::All;

            filter_set.add_filters(
                &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
                rules,
            );

            Engine::from_filter_set(filter_set, true)
        };
    };

    let blockable = matches!(
        event.resource_type,
        ResourceType::Image
            | ResourceType::Media
            | ResourceType::Stylesheet
            | ResourceType::Document
            | ResourceType::Fetch
            | ResourceType::Xhr
    );

    if !blockable {
        return false;
    }

    let resource_kind = event.resource_type.as_ref().to_lowercase();
    let third_party = !event.request.is_same_site.unwrap_or_default();

    let request = adblock::request::Request::preparsed(
        &event.request.url,
        "example.com",
        "example.com",
        &resource_kind,
        third_party,
    );

    AD_ENGINE.check_network_request(&request).matched
}
1197
1198 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1199 let response = if self
1200 .attempted_authentications
1201 .contains(event.request_id.as_ref())
1202 {
1203 AuthChallengeResponseResponse::CancelAuth
1204 } else if self.credentials.is_some() {
1205 self.attempted_authentications
1206 .insert(event.request_id.clone().into());
1207 AuthChallengeResponseResponse::ProvideCredentials
1208 } else {
1209 AuthChallengeResponseResponse::Default
1210 };
1211
1212 let mut auth = AuthChallengeResponse::new(response);
1213 if let Some(creds) = self.credentials.clone() {
1214 auth.username = Some(creds.username);
1215 auth.password = Some(creds.password);
1216 }
1217 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1218 }
1219
1220 pub fn set_offline_mode(&mut self, value: bool) {
1222 if self.offline == value {
1223 return;
1224 }
1225 self.offline = value;
1226 if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1227 .offline(self.offline)
1228 .matched_network_condition(
1229 NetworkConditions::builder()
1230 .url_pattern("")
1231 .latency(0)
1232 .download_throughput(-1.)
1233 .upload_throughput(-1.)
1234 .build()
1235 .unwrap(),
1236 )
1237 .build()
1238 {
1239 self.push_cdp_request(network);
1240 }
1241 }
1242
1243 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1245 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1246 if let Some(interception_id) = self
1247 .request_id_to_interception_id
1248 .remove(event.request_id.as_ref())
1249 {
1250 self.on_request(event, Some(interception_id));
1251 } else {
1252 self.requests_will_be_sent
1254 .insert(event.request_id.clone(), event.clone());
1255 }
1256 } else {
1257 self.on_request(event, None);
1258 }
1259 }
1260
1261 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1263 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1264 request.from_memory_cache = true;
1265 }
1266 }
1267
1268 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1270 let mut request_failed = false;
1271
1272 let mut deducted: u64 = 0;
1274
1275 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1276 let before = *max_bytes;
1277
1278 let received_bytes: u64 = event.response.encoded_data_length as u64;
1280
1281 let content_length: Option<u64> = event
1283 .response
1284 .headers
1285 .inner()
1286 .get("content-length")
1287 .and_then(|v| v.as_str())
1288 .and_then(|s| s.trim().parse::<u64>().ok());
1289
1290 *max_bytes = max_bytes.saturating_sub(received_bytes);
1292
1293 if let Some(cl) = content_length {
1295 if cl > *max_bytes {
1296 *max_bytes = 0;
1297 }
1298 }
1299
1300 request_failed = *max_bytes == 0;
1301
1302 deducted = before.saturating_sub(*max_bytes);
1304 }
1305
1306 if deducted > 0 {
1308 self.queued_events
1309 .push_back(NetworkEvent::BytesConsumed(deducted));
1310 }
1311
1312 if request_failed && self.max_bytes_allowed.is_some() {
1314 self.set_block_all(true);
1315 }
1316
1317 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1318 request.set_response(event.response.clone());
1319 self.queued_events.push_back(if request_failed {
1320 NetworkEvent::RequestFailed(request)
1321 } else {
1322 NetworkEvent::RequestFinished(request)
1323 });
1324 }
1325 }
1326
1327 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1329 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1330 if let Some(interception_id) = request.interception_id.as_ref() {
1331 self.attempted_authentications
1332 .remove(interception_id.as_ref());
1333 }
1334 self.queued_events
1335 .push_back(NetworkEvent::RequestFinished(request));
1336 }
1337 }
1338
1339 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1341 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1342 request.failure_text = Some(event.error_text.clone());
1343 if let Some(interception_id) = request.interception_id.as_ref() {
1344 self.attempted_authentications
1345 .remove(interception_id.as_ref());
1346 }
1347 self.queued_events
1348 .push_back(NetworkEvent::RequestFailed(request));
1349 }
1350 }
1351
1352 fn on_request(
1354 &mut self,
1355 event: &EventRequestWillBeSent,
1356 interception_id: Option<InterceptionId>,
1357 ) {
1358 let mut redirect_chain = Vec::new();
1359 let mut redirect_location = None;
1360
1361 if let Some(redirect_resp) = &event.redirect_response {
1362 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1363 if is_redirect_status(redirect_resp.status) {
1364 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1365 if redirect_resp.url != location {
1366 let fixed_location = location.replace(&redirect_resp.url, "");
1367
1368 if !fixed_location.is_empty() {
1369 if let Some(resp) = request.response.as_mut() {
1370 resp.headers.0["Location"] =
1371 serde_json::Value::String(fixed_location.clone());
1372 }
1373 }
1374
1375 redirect_location = Some(fixed_location);
1376 }
1377 }
1378 }
1379
1380 self.handle_request_redirect(
1381 &mut request,
1382 if let Some(redirect_location) = redirect_location {
1383 let mut redirect_resp = redirect_resp.clone();
1384
1385 if !redirect_location.is_empty() {
1386 redirect_resp.headers.0["Location"] =
1387 serde_json::Value::String(redirect_location);
1388 }
1389
1390 redirect_resp
1391 } else {
1392 redirect_resp.clone()
1393 },
1394 );
1395
1396 redirect_chain = std::mem::take(&mut request.redirect_chain);
1397 redirect_chain.push(request);
1398 }
1399 }
1400
1401 let request = HttpRequest::new(
1402 event.request_id.clone(),
1403 event.frame_id.clone(),
1404 interception_id,
1405 self.user_request_interception_enabled,
1406 redirect_chain,
1407 );
1408
1409 self.requests.insert(event.request_id.clone(), request);
1410 self.queued_events
1411 .push_back(NetworkEvent::Request(event.request_id.clone()));
1412 }
1413
1414 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1416 request.set_response(response);
1417 if let Some(interception_id) = request.interception_id.as_ref() {
1418 self.attempted_authentications
1419 .remove(interception_id.as_ref());
1420 }
1421 }
1422}
1423
/// Events queued by the network manager for whoever drains its event queue.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP command to send: the method identifier paired with a JSON value
    /// (presumably the serialized command parameters — TODO confirm).
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request with this id has started being tracked (queued by `on_request`).
    Request(RequestId),
    /// NOTE(review): presumably signals a response for this request id; it is not
    /// queued by the handlers near this definition — confirm where it is produced.
    Response(RequestId),
    /// The request failed: either loading failed, or its response exhausted the byte
    /// budget in `on_response_received`.
    RequestFailed(HttpRequest),
    /// The request completed (response received or loading finished).
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the `max_bytes_allowed` budget by one response.
    BytesConsumed(u64),
}
1439
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// Page URL used by the script blocklist tests.
    const FORUM_PAGE: &str = "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085";

    /// Page URL used by the dynamic blacklist / whitelist tests.
    const EXAMPLE_PAGE: &str = "https://example.com/";

    /// URL containing the `beacon.min.js` pattern used by the strictness tests.
    const BEACON_URL: &str = "https://static.cloudflareinsights.com/beacon.min.js";

    /// Creates a manager with the constructor arguments every test in this module uses.
    fn new_manager() -> NetworkManager {
        NetworkManager::new(false, Duration::from_secs(30))
    }

    /// Shared assertions for the third-party allow-list matcher.
    fn assert_third_party_allow_list() {
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    #[test]
    fn test_allowed_matcher_3rd_party() {
        assert_third_party_allow_list();
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = new_manager();
        nm.set_page_url(FORUM_PAGE.to_string());

        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = new_manager();
        nm.set_page_url(FORUM_PAGE.to_string());

        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    /// Runs the same assertions as `test_allowed_matcher_3rd_party`; kept under its own
    /// name so both test identifiers continue to exist.
    #[test]
    fn test_allowed_matcher_3rd_party_sanity() {
        assert_third_party_allow_list();
    }

    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = new_manager();
        nm.set_page_url(EXAMPLE_PAGE.to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    /// NOTE(review): this only checks that both lists match and that the strict flag is
    /// set; it does not exercise the final block/allow decision.
    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = new_manager();
        nm.set_page_url(EXAMPLE_PAGE.to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = BEACON_URL;
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        assert!(nm.blacklist_strict);
    }

    /// NOTE(review): as with the strict variant, only the list matches and the flag value
    /// are asserted here, not the resulting block/allow decision.
    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = new_manager();
        nm.set_page_url(EXAMPLE_PAGE.to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = BEACON_URL;
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}