1use super::blockers::{
2 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3 xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15 EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17 InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18 SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21 fetch::{
22 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24 },
25 network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
lazy_static! {
    /// Substrings that are compiled into [`ALLOWED_MATCHER`].
    ///
    /// The list mixes bare framework/bundle names with full URL prefixes of
    /// captcha, payment and CDN hosts.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery",
        "angular",
        "react",
        "vue",
        "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app",
        "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/",
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/enterprise.js",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://captcha.px-cloud.net/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/",
        "https://cdnjs.cloudflare.com/",
        "https://code.jquery.com/jquery-"
    ];

    /// Aho-Corasick matcher built from [`JS_FRAMEWORK_ALLOW`].
    ///
    /// Construction panics with "matcher to build" if the automaton cannot be built.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// URL fragments of third-party scripts that are compiled into
    /// [`ALLOWED_MATCHER_3RD_PARTY`].
    ///
    /// NOTE(review): a few entries appear twice (`ct.captcha-delivery.com`,
    /// `captcha.px-cloud.net`); this does not affect `is_match` results.
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/api.js",
        "https://google.com/recaptcha/api.js",
        "https://www.google.com/recaptcha/enterprise.js",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/",
        "https://cdnjs.cloudflare.com/",
        "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/captcha/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js",
        "https://ct.captcha-delivery.com/",
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.gstatic.com/recaptcha/",
        "https://www.google.com/recaptcha/api2/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://client-api.arkoselabs.com/v2/",
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource",
        "discourse-cdn.com/"
    ];

    /// Aho-Corasick matcher built from [`JS_FRAMEWORK_ALLOW_3RD_PARTY`].
    ///
    /// `NetworkManager::on_fetch_request_paused` uses it to un-block script
    /// requests that an earlier rule had marked as skipped.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path fragments associated with JS frameworks.
    // NOTE(review): not referenced in the visible part of this file — confirm its consumers.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            "_astro/", "_app/immutable"
        }
    };

    /// MIME types of binary/visual payloads (documents, archives, images, video, audio).
    // NOTE(review): not referenced in the visible part of this file — confirm its consumers.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names that are skipped when `ignore_visuals` is enabled
    /// (see `NetworkManager::should_skip_for_visuals_and_basic`).
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names that `NetworkManager::on_fetch_request_paused`
    /// always marks as skipped.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Manifest",
        "Other",
        "Prefetch",
        "Ping",
    };

    /// The `css` extension as a case-insensitive string, used by
    /// `NetworkManager::skip_xhr` when stylesheets are blocked.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// Default init command chain: the serialized `Network.enable` command.
    ///
    /// The chain is empty if serialization of the params fails.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// Init command chain used when `ignore_httpserrors` is set: `Network.enable`
    /// followed by `Security.setIgnoreCertificateErrors(true)`.
    ///
    /// A command whose params fail to serialize is left out of the chain.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// `Fetch.enable` params that handle auth requests and pause every URL
    /// (`*`) at the request stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
            .handle_auth_requests(true)
            .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
            .build()
    };
}
224
/// Keyword derived from a site's base domain, pre-computed in the two byte
/// forms used for matching against request URLs.
#[derive(Debug, Clone, Default)]
struct SiteKeyword {
    /// First label of the base domain, ASCII-lowercased (e.g. `example`).
    kw_lower: Box<[u8]>,
    /// The same keyword wrapped in slashes (e.g. `/example/`).
    kw_slash: Box<[u8]>,
}

impl SiteKeyword {
    /// Builds the keyword from a base domain such as `example.com`.
    ///
    /// Surrounding whitespace and leading/trailing dots are ignored. Returns
    /// `None` when the first label is shorter than four bytes (which also
    /// covers empty input).
    #[inline]
    fn new_from_base_domain(base: &str) -> Option<Self> {
        let domain = base.trim().trim_matches('.');

        // An empty domain yields an empty label and is rejected by the length check.
        let label = domain.split('.').next().unwrap_or(domain).trim();
        if label.len() < 4 {
            return None;
        }

        let kw_lower = label.to_ascii_lowercase().into_bytes();
        let kw_slash: Vec<u8> = std::iter::once(b'/')
            .chain(kw_lower.iter().copied())
            .chain(std::iter::once(b'/'))
            .collect();

        Some(Self {
            kw_lower: kw_lower.into_boxed_slice(),
            kw_slash: kw_slash.into_boxed_slice(),
        })
    }
}
264
/// Returns `true` if `haystack` contains `needle_lower`, comparing bytes
/// ASCII-case-insensitively on the haystack side only.
///
/// `needle_lower` must already be lowercase: haystack bytes are lowercased
/// before comparison, so an uppercase byte in the needle never matches.
/// An empty needle always matches.
#[inline]
fn contains_ascii_ci(haystack: &[u8], needle_lower: &[u8]) -> bool {
    if needle_lower.is_empty() {
        return true;
    }

    // `windows` yields nothing when the haystack is shorter than the needle.
    haystack.windows(needle_lower.len()).any(|window| {
        window
            .iter()
            .zip(needle_lower)
            .all(|(hay_byte, needle_byte)| hay_byte.to_ascii_lowercase() == *needle_byte)
    })
}
307
/// Returns `true` for the HTTP redirect status codes 301, 302, 303, 307 and 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
312
/// Tracks network requests for a target and decides, per paused request,
/// whether to continue, fulfill or fail it.
#[derive(Debug)]
pub struct NetworkManager {
    /// Events produced by the handlers; drained in FIFO order by `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// When set, `init_commands` uses the chain that also ignores certificate errors.
    ignore_httpserrors: bool,
    /// In-flight requests keyed by network request id.
    requests: HashMap<RequestId, HttpRequest>,
    /// `requestWillBeSent` events parked until the matching `Fetch.requestPaused` arrives.
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra headers set through `set_extra_headers`.
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception ids of paused requests seen before their `requestWillBeSent` event.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the caller asked for the cache to be disabled.
    user_cache_disabled: bool,
    /// Requests for which credentials have already been provided once.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials supplied through `authenticate`, if any.
    credentials: Option<Credentials>,
    /// Whether the user asked for request interception.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed with `BlockedByClient`.
    block_all: bool,
    /// Whether interception is considered enabled at the protocol level.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Last offline state passed to `set_offline_mode`.
    offline: bool,
    /// Timeout handed to the init command chain.
    pub request_timeout: Duration,
    /// Skip visual resources (see `IGNORE_VISUAL_RESOURCE_MAP`).
    pub ignore_visuals: bool,
    /// Skip stylesheet requests.
    pub block_stylesheets: bool,
    /// Enables the embedded-script filter (together with `only_html` or `ignore_visuals`).
    pub block_javascript: bool,
    /// Skip analytics scripts and XHR requests; defaults to `true` in `new`.
    pub block_analytics: bool,
    /// Only HTML is wanted; participates in the embedded-script filter.
    pub only_html: bool,
    /// Set when the first document request ends in `.xml`; lets `.xsl` requests through.
    pub xml_document: bool,
    /// Site-specific interception rules.
    pub intercept_manager: NetworkInterceptManager,
    /// Counts document requests whose URL equals the current target URL;
    /// at 3 or more, paused requests are marked as skipped.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request (or the one set via `set_page_url`).
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`.
    pub document_target_domain: String,
    /// Remaining byte budget; `None` means unlimited.
    pub max_bytes_allowed: Option<u64>,
    /// Key used to look up remotely cached responses.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// Policy that decides whether a cached response may be served.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Raw whitelist patterns; the source for `whitelist_matcher`.
    whitelist_patterns: Vec<String>,
    /// Matcher compiled from `whitelist_patterns`; `None` when the list is empty or compilation failed.
    whitelist_matcher: Option<AhoCorasick>,
    /// Keyword derived from `document_target_domain`, used to keep related third-party assets.
    site_keyword: Option<SiteKeyword>,
}
432
433impl NetworkManager {
434 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
436 Self {
437 queued_events: Default::default(),
438 ignore_httpserrors,
439 requests: Default::default(),
440 requests_will_be_sent: Default::default(),
441 extra_headers: Default::default(),
442 request_id_to_interception_id: Default::default(),
443 user_cache_disabled: false,
444 attempted_authentications: Default::default(),
445 credentials: None,
446 block_all: false,
447 user_request_interception_enabled: false,
448 protocol_request_interception_enabled: false,
449 offline: false,
450 request_timeout,
451 ignore_visuals: false,
452 block_javascript: false,
453 block_stylesheets: false,
454 block_analytics: true,
455 only_html: false,
456 xml_document: false,
457 intercept_manager: NetworkInterceptManager::Unknown,
458 document_reload_tracker: 0,
459 document_target_url: String::new(),
460 document_target_domain: String::new(),
461 whitelist_patterns: Vec::new(),
462 whitelist_matcher: None,
463 max_bytes_allowed: None,
464 site_keyword: None,
465 #[cfg(feature = "_cache")]
466 cache_site_key: None,
467 #[cfg(feature = "_cache")]
468 cache_policy: None,
469 }
470 }
471
472 #[inline]
474 fn is_related_3rd_party_by_keyword_fast(&self, url: &str) -> bool {
475 let Some(kw) = self.site_keyword.as_ref() else {
476 return false;
477 };
478
479 let Some((host, rest)) = host_and_rest(url) else {
480 return false;
481 };
482
483 let host_b = host.as_bytes();
484 if contains_ascii_ci(host_b, &kw.kw_lower) {
485 return true;
486 }
487
488 let rest_b = rest.as_bytes();
489 if contains_ascii_ci(rest_b, &kw.kw_slash) {
490 return true;
491 }
492
493 false
494 }
495
496 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
498 where
499 I: IntoIterator<Item = S>,
500 S: Into<String>,
501 {
502 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
503 self.rebuild_whitelist_matcher();
504 }
505
506 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
508 self.whitelist_patterns.push(pattern.into());
509 self.rebuild_whitelist_matcher();
510 }
511
512 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
514 where
515 I: IntoIterator<Item = S>,
516 S: Into<String>,
517 {
518 self.whitelist_patterns
519 .extend(patterns.into_iter().map(Into::into));
520 self.rebuild_whitelist_matcher();
521 }
522
523 #[inline]
524 fn rebuild_whitelist_matcher(&mut self) {
525 if self.whitelist_patterns.is_empty() {
526 self.whitelist_matcher = None;
527 return;
528 }
529
530 let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
531
532 self.whitelist_matcher = AhoCorasick::new(refs).ok();
534 }
535
536 #[inline]
537 fn is_whitelisted(&self, url: &str) -> bool {
538 self.whitelist_matcher
539 .as_ref()
540 .map(|m| m.is_match(url))
541 .unwrap_or(false)
542 }
543
544 pub fn init_commands(&self) -> CommandChain {
546 let cmds = if self.ignore_httpserrors {
547 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
548 } else {
549 INIT_CHAIN.clone()
550 };
551 CommandChain::new(cmds, self.request_timeout)
552 }
553
554 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
556 let method = cmd.identifier();
557 if let Ok(params) = serde_json::to_value(cmd) {
558 self.queued_events
559 .push_back(NetworkEvent::SendCdpRequest((method, params)));
560 }
561 }
562
/// Removes and returns the oldest queued event, or `None` when the queue is empty.
pub fn poll(&mut self) -> Option<NetworkEvent> {
    self.queued_events.pop_front()
}
567
/// Returns the extra headers currently stored on the manager.
pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
    &self.extra_headers
}
572
573 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
575 self.extra_headers = headers;
576 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
577 self.extra_headers.remove("Proxy-Authorization");
578 if !self.extra_headers.is_empty() {
579 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
580 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
581 }
582 }
583 }
584
/// Queues a `Network.setBypassServiceWorker` command with the given value.
///
/// NOTE(review): despite the method name, the argument is forwarded as the
/// *bypass* flag, so `true` bypasses service workers — confirm callers pass
/// the intended polarity.
pub fn set_service_worker_enabled(&mut self, bypass: bool) {
    self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
}
588
/// Sets the `block_all` flag; while it is set, every paused request is
/// failed in `on_fetch_request_paused`.
pub fn set_block_all(&mut self, block_all: bool) {
    self.block_all = block_all;
}
592
/// Records whether the user wants request interception and re-evaluates
/// whether the Fetch domain must be enabled or disabled.
pub fn set_request_interception(&mut self, enabled: bool) {
    self.user_request_interception_enabled = enabled;
    self.update_protocol_request_interception();
}
597
598 pub fn set_cache_enabled(&mut self, enabled: bool) {
599 let run = self.user_cache_disabled != !enabled;
600 self.user_cache_disabled = !enabled;
601 if run {
602 self.update_protocol_cache_disabled();
603 }
604 }
605
/// Marks protocol-level request interception as enabled.
///
/// Only the local flag is changed; no CDP command is queued here.
pub fn enable_request_intercept(&mut self) {
    self.protocol_request_interception_enabled = true;
}
610
/// Marks protocol-level request interception as disabled.
///
/// Only the local flag is changed; no CDP command is queued here.
pub fn disable_request_intercept(&mut self) {
    self.protocol_request_interception_enabled = false;
}
615
/// Sets (or clears, with `None`) the site key used for remote cache lookups.
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    self.cache_site_key = cache_site_key;
}
621
/// Sets (or clears, with `None`) the policy that decides whether a cached
/// response may be served.
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    self.cache_policy = cache_policy;
}
627
/// Queues a `Network.setCacheDisabled` command reflecting `user_cache_disabled`.
pub fn update_protocol_cache_disabled(&mut self) {
    self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
}
631
/// Stores `credentials` for answering auth challenges and makes sure
/// request interception is on.
pub fn authenticate(&mut self, credentials: Credentials) {
    self.credentials = Some(credentials);
    // Must run before the flag is forced below: it queues `Fetch.enable`
    // only if the flag is still `false` at this point.
    self.update_protocol_request_interception();
    self.protocol_request_interception_enabled = true;
}
637
638 fn update_protocol_request_interception(&mut self) {
639 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
640
641 if enabled == self.protocol_request_interception_enabled {
642 return;
643 }
644
645 if enabled {
646 self.push_cdp_request(ENABLE_FETCH.clone())
647 } else {
648 self.push_cdp_request(DisableParams::default())
649 }
650 }
651
652 #[inline]
661 fn should_block_script_blocklist_only(&self, url: &str) -> bool {
662 let block_analytics = self.block_analytics;
664
665 if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
667 {
668 return true;
669 }
670
671 if crate::handler::blockers::block_websites::block_website(url) {
673 return true;
674 }
675
676 if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
683 let p_slash = Self::strip_query_fragment(path_with_slash);
685 let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
686
687 let base = match p_slash.rsplit('/').next() {
689 Some(b) => b,
690 None => p_slash,
691 };
692
693 if block_analytics && (base == "analytics.js" || p_noslash.ends_with("/analytics.js")) {
697 return true;
698 }
699
700 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
703 return true;
704 }
705 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
706 return true;
707 }
708 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
709 return true;
710 }
711
712 if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
715 return true;
716 }
717
718 if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
720 return true;
721 }
722 }
723
724 false
725 }
726
/// Returns the part of `url` starting at the first `/` of its path
/// (query and fragment included), or `None` when there is no path.
///
/// The authority is taken to start right after the first `//`. The scan
/// stops at the first `/`, `?` or `#` after it, so a slash that appears only
/// inside a query or fragment (`https://host?next=/x`) is not mistaken for
/// the start of the path.
#[inline]
fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
    let authority_start = url.find("//")? + 2;
    let after_authority = &url[authority_start..];

    let boundary = after_authority.find(|c: char| matches!(c, '/' | '?' | '#'))?;
    let tail = &after_authority[boundary..];

    tail.starts_with('/').then_some(tail)
}
747
/// Returns `s` truncated at the first `?` or `#`, whichever comes first;
/// returns `s` unchanged when neither is present.
#[inline]
fn strip_query_fragment(s: &str) -> &str {
    s.find(|c: char| c == '?' || c == '#')
        .map_or(s, |cut| &s[..cut])
}
764
765 #[inline]
767 fn skip_xhr(
768 &self,
769 skip_networking: bool,
770 event: &EventRequestPaused,
771 network_event: bool,
772 ) -> bool {
773 if !skip_networking && network_event {
775 let request_url = event.request.url.as_str();
776
777 let skip_analytics =
779 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
780
781 if skip_analytics {
782 true
783 } else if self.block_stylesheets || self.ignore_visuals {
784 let block_css = self.block_stylesheets;
785 let block_media = self.ignore_visuals;
786
787 let mut block_request = false;
788
789 if let Some(position) = request_url.rfind('.') {
790 let hlen = request_url.len();
791 let has_asset = hlen - position;
792
793 if has_asset >= 3 {
794 let next_position = position + 1;
795
796 if block_media
797 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
798 &request_url[next_position..].into(),
799 )
800 {
801 block_request = true;
802 } else if block_css {
803 block_request =
804 CaseInsensitiveString::from(request_url[next_position..].as_bytes())
805 .contains(&**CSS_EXTENSION)
806 }
807 }
808 }
809
810 if !block_request {
811 block_request = ignore_script_xhr_media(request_url);
812 }
813
814 block_request
815 } else {
816 skip_networking
817 }
818 } else {
819 skip_networking
820 }
821 }
822
/// With the `adblock` feature: keeps an existing skip decision, otherwise
/// asks the ad engine whether the request should be blocked.
#[cfg(feature = "adblock")]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    // Short-circuits so the ad engine is only consulted for requests not yet skipped.
    skip_networking || self.detect_ad(event)
}
833
/// Without the `adblock` feature: returns `skip_networking` unchanged.
#[cfg(not(feature = "adblock"))]
#[inline]
fn detect_ad_if_enabled(&mut self, _event: &EventRequestPaused, skip_networking: bool) -> bool {
    skip_networking
}
840
841 #[inline]
842 fn fail_request_blocked(
844 &mut self,
845 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
846 ) {
847 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
848 request_id.clone(),
849 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
850 );
851 self.push_cdp_request(params);
852 }
853
854 #[inline]
855 fn fulfill_request_empty_200(
857 &mut self,
858 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
859 ) {
860 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
861 request_id.clone(),
862 200,
863 );
864 self.push_cdp_request(params);
865 }
866
/// Queues a `Fetch.fulfillRequest` that answers `request_id` with a cached
/// response: the given `status`, `headers`, and `body` encoded as standard
/// base64.
#[cfg(feature = "_cache")]
#[inline]
fn fulfill_request_from_cache(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    body: &[u8],
    headers: &std::collections::HashMap<String, String>,
    status: i64,
) {
    use crate::cdp::browser_protocol::fetch::HeaderEntry;
    use crate::handler::network::fetch::FulfillRequestParams;
    use base64::Engine;

    let response_headers: Vec<HeaderEntry> = headers
        .iter()
        .map(|(name, value)| HeaderEntry {
            name: name.clone().into(),
            value: value.clone().into(),
        })
        .collect();

    let encoded_body = base64::engine::general_purpose::STANDARD.encode(body);

    let mut params = FulfillRequestParams::new(request_id.clone(), status);
    params.body = Some(encoded_body.into());
    params.response_headers = Some(response_headers);

    self.push_cdp_request(params);
}
905
906 #[inline]
907 fn continue_request_with_url(
909 &mut self,
910 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
911 url: Option<&str>,
912 intercept_response: bool,
913 ) {
914 let mut params = ContinueRequestParams::new(request_id.clone());
915 if let Some(url) = url {
916 params.url = Some(url.to_string());
917 params.intercept_response = Some(intercept_response);
918 }
919 self.push_cdp_request(params);
920 }
921
922 #[inline]
924 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
925 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
926 return;
927 }
928
929 let resource_type = &event.resource_type;
930
931 if self.block_all {
932 tracing::debug!(
933 "Blocked (block_all): {:?} - {}",
934 event.resource_type,
935 event.request.url
936 );
937 return self.fail_request_blocked(&event.request_id);
938 }
939
940 if let Some(network_id) = event.network_id.as_ref() {
941 if let Some(request_will_be_sent) =
942 self.requests_will_be_sent.remove(network_id.as_ref())
943 {
944 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
945 } else {
946 self.request_id_to_interception_id
947 .insert(network_id.clone(), event.request_id.clone().into());
948 }
949 }
950
951 let javascript_resource = *resource_type == ResourceType::Script;
953 let document_resource = *resource_type == ResourceType::Document;
954 let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
955
956 let mut skip_networking =
958 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
959
960 if !skip_networking {
962 skip_networking = self.document_reload_tracker >= 3;
963 }
964
965 let (current_url_cow, had_replacer) =
967 self.handle_document_replacement_and_tracking(event, document_resource);
968
969 let current_url: &str = current_url_cow.as_ref();
970
971 if !skip_networking {
977 if self.xml_document && current_url.ends_with(".xsl") {
979 skip_networking = false;
980 } else {
981 skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
982 }
983 }
984
985 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
987
988 if !skip_networking
990 && self.block_javascript
991 && (self.only_html || self.ignore_visuals)
992 && (javascript_resource || document_resource)
993 {
994 skip_networking = ignore_script_embedded(current_url);
995 }
996
997 if !skip_networking && javascript_resource {
1000 skip_networking = self.should_block_script_blocklist_only(current_url);
1001 }
1002
1003 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
1005
1006 if !skip_networking && (javascript_resource || network_resource || document_resource) {
1008 skip_networking = self.intercept_manager.intercept_detection(
1009 current_url,
1010 self.ignore_visuals,
1011 network_resource,
1012 );
1013 }
1014
1015 if !skip_networking && (javascript_resource || network_resource) {
1017 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
1018 }
1019
1020 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
1023 {
1024 skip_networking = false;
1025 }
1026
1027 if skip_networking && self.is_whitelisted(current_url) {
1029 skip_networking = false;
1030 }
1031
1032 if skip_networking
1034 && (javascript_resource || *resource_type == ResourceType::Stylesheet)
1035 && self.is_related_3rd_party_by_keyword_fast(current_url)
1036 {
1037 skip_networking = false;
1038 }
1039
1040 if skip_networking {
1041 tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
1042 self.fulfill_request_empty_200(&event.request_id);
1043 } else {
1044 #[cfg(feature = "_cache")]
1045 {
1046 if let (Some(policy), Some(cache_site_key)) =
1047 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
1048 {
1049 let current_url = format!("{}:{}", event.request.method, ¤t_url);
1050
1051 if let Some((res, cache_policy)) =
1052 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
1053 {
1054 if policy.allows_cached(&cache_policy) {
1055 tracing::debug!(
1056 "Remote Cached: {:?} - {}",
1057 resource_type,
1058 ¤t_url
1059 );
1060 return self.fulfill_request_from_cache(
1061 &event.request_id,
1062 &res.body,
1063 &res.headers,
1064 res.status as i64,
1065 );
1066 }
1067 }
1068 }
1069 }
1070
1071 tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
1073 self.continue_request_with_url(
1074 &event.request_id,
1075 if had_replacer {
1076 Some(current_url)
1077 } else {
1078 None
1079 },
1080 !had_replacer,
1081 );
1082 }
1083 }
1084
1085 #[inline]
1091 fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1092 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1093 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1094 }
1095
/// Returns `true` when a target document URL is set.
///
/// Note that the check is on `document_target_url`, not on
/// `document_target_domain`.
pub fn has_target_domain(&self) -> bool {
    !self.document_target_url.is_empty()
}
1100
1101 pub fn set_page_url(&mut self, page_target_url: String) {
1103 let host_base = host_and_rest(&page_target_url)
1104 .map(|(h, _)| base_domain_from_host(h))
1105 .unwrap_or("");
1106
1107 self.document_target_domain = host_base.to_string();
1108 self.document_target_url = page_target_url;
1109 self.site_keyword = SiteKeyword::new_from_base_domain(&self.document_target_domain);
1111 }
1112
1113 pub fn clear_target_domain(&mut self) {
1115 self.document_reload_tracker = 0;
1116 self.document_target_url = Default::default();
1117 self.document_target_domain = Default::default();
1118 self.site_keyword = None;
1119 }
1120
1121 #[inline]
1129 fn handle_document_replacement_and_tracking<'a>(
1130 &mut self,
1131 event: &'a EventRequestPaused,
1132 document_resource: bool,
1133 ) -> (Cow<'a, str>, bool) {
1134 let mut replacer: Option<String> = None;
1135 let current_url = event.request.url.as_str();
1136
1137 if document_resource {
1138 if self.document_target_url == current_url {
1139 self.document_reload_tracker += 1;
1140 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1141 {
1142 let (http_document_replacement, mut https_document_replacement) =
1143 if self.document_target_url.starts_with("http://") {
1144 (
1145 self.document_target_url.replacen("http://", "http//", 1),
1146 self.document_target_url.replacen("http://", "https://", 1),
1147 )
1148 } else {
1149 (
1150 self.document_target_url.replacen("https://", "https//", 1),
1151 self.document_target_url.replacen("https://", "http://", 1),
1152 )
1153 };
1154
1155 let trailing = https_document_replacement.ends_with('/');
1157 if trailing {
1158 https_document_replacement.pop();
1159 }
1160 if https_document_replacement.ends_with('/') {
1161 https_document_replacement.pop();
1162 }
1163
1164 let redirect_mask = format!(
1165 "{}{}",
1166 https_document_replacement, http_document_replacement
1167 );
1168
1169 if current_url == redirect_mask {
1170 replacer = Some(if trailing {
1171 format!("{}/", https_document_replacement)
1172 } else {
1173 https_document_replacement
1174 });
1175 }
1176 }
1177
1178 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1179 self.xml_document = true;
1180 }
1181
1182 self.document_target_url = event.request.url.clone();
1184 self.document_target_domain = host_and_rest(&self.document_target_url)
1185 .map(|(h, _)| base_domain_from_host(h).to_string())
1186 .unwrap_or_default();
1187
1188 self.site_keyword = SiteKeyword::new_from_base_domain(&self.document_target_domain);
1189 }
1190
1191 let current_url_cow = match replacer {
1192 Some(r) => Cow::Owned(r),
1193 None => Cow::Borrowed(event.request.url.as_str()),
1194 };
1195
1196 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1197 (current_url_cow, had_replacer)
1198 }
1199
/// Returns `true` when the ad-block engine matches the paused request.
///
/// Only `Image`, `Media`, `Stylesheet`, `Document`, `Fetch` and `Xhr`
/// requests are checked; other types return `false` without touching the
/// engine. The engine is built lazily, on first use, from
/// `spider_network_blocker::adblock::ADBLOCK_PATTERNS`.
///
/// NOTE(review): the request is checked with `"example.com"` as both
/// hostname arguments rather than the real hosts — confirm this is intended.
#[cfg(feature = "adblock")]
pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
    use adblock::{
        lists::{FilterSet, ParseOptions, RuleTypes},
        Engine,
    };

    lazy_static::lazy_static! {
        static ref AD_ENGINE: Engine = {
            let mut options = ParseOptions::default();
            options.rule_types = RuleTypes::All;

            let mut filters = FilterSet::new(false);
            filters.add_filters(
                &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
                options,
            );

            Engine::from_filter_set(filters, true)
        };
    };

    let blockable = matches!(
        event.resource_type,
        ResourceType::Image
            | ResourceType::Media
            | ResourceType::Stylesheet
            | ResourceType::Document
            | ResourceType::Fetch
            | ResourceType::Xhr
    );

    if !blockable {
        return false;
    }

    let request_type = event.resource_type.as_ref().to_lowercase();
    let third_party = !event.request.is_same_site.unwrap_or_default();

    let request = adblock::request::Request::preparsed(
        &event.request.url,
        "example.com",
        "example.com",
        &request_type,
        third_party,
    );

    AD_ENGINE.check_network_request(&request).matched
}
1247
1248 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1249 let response = if self
1250 .attempted_authentications
1251 .contains(event.request_id.as_ref())
1252 {
1253 AuthChallengeResponseResponse::CancelAuth
1254 } else if self.credentials.is_some() {
1255 self.attempted_authentications
1256 .insert(event.request_id.clone().into());
1257 AuthChallengeResponseResponse::ProvideCredentials
1258 } else {
1259 AuthChallengeResponseResponse::Default
1260 };
1261
1262 let mut auth = AuthChallengeResponse::new(response);
1263 if let Some(creds) = self.credentials.clone() {
1264 auth.username = Some(creds.username);
1265 auth.password = Some(creds.password);
1266 }
1267 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1268 }
1269
1270 pub fn set_offline_mode(&mut self, value: bool) {
1272 if self.offline == value {
1273 return;
1274 }
1275 self.offline = value;
1276 if let Ok(network) = EmulateNetworkConditionsParams::builder()
1277 .offline(self.offline)
1278 .latency(0)
1279 .download_throughput(-1.)
1280 .upload_throughput(-1.)
1281 .build()
1282 {
1283 self.push_cdp_request(network);
1284 }
1285 }
1286
1287 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1289 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1290 if let Some(interception_id) = self
1291 .request_id_to_interception_id
1292 .remove(event.request_id.as_ref())
1293 {
1294 self.on_request(event, Some(interception_id));
1295 } else {
1296 self.requests_will_be_sent
1298 .insert(event.request_id.clone(), event.clone());
1299 }
1300 } else {
1301 self.on_request(event, None);
1302 }
1303 }
1304
1305 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1307 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1308 request.from_memory_cache = true;
1309 }
1310 }
1311
1312 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1314 let mut request_failed = false;
1315
1316 let mut deducted: u64 = 0;
1318
1319 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1320 let before = *max_bytes;
1321
1322 let received_bytes: u64 = event.response.encoded_data_length as u64;
1324
1325 let content_length: Option<u64> = event
1327 .response
1328 .headers
1329 .inner()
1330 .get("content-length")
1331 .and_then(|v| v.as_str())
1332 .and_then(|s| s.trim().parse::<u64>().ok());
1333
1334 *max_bytes = max_bytes.saturating_sub(received_bytes);
1336
1337 if let Some(cl) = content_length {
1339 if cl > *max_bytes {
1340 *max_bytes = 0;
1341 }
1342 }
1343
1344 request_failed = *max_bytes == 0;
1345
1346 deducted = before.saturating_sub(*max_bytes);
1348 }
1349
1350 if deducted > 0 {
1352 self.queued_events
1353 .push_back(NetworkEvent::BytesConsumed(deducted));
1354 }
1355
1356 if request_failed && self.max_bytes_allowed.is_some() {
1358 self.set_block_all(true);
1359 }
1360
1361 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1362 request.set_response(event.response.clone());
1363 self.queued_events.push_back(if request_failed {
1364 NetworkEvent::RequestFailed(request)
1365 } else {
1366 NetworkEvent::RequestFinished(request)
1367 });
1368 }
1369 }
1370
1371 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1373 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1374 if let Some(interception_id) = request.interception_id.as_ref() {
1375 self.attempted_authentications
1376 .remove(interception_id.as_ref());
1377 }
1378 self.queued_events
1379 .push_back(NetworkEvent::RequestFinished(request));
1380 }
1381 }
1382
1383 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1385 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1386 request.failure_text = Some(event.error_text.clone());
1387 if let Some(interception_id) = request.interception_id.as_ref() {
1388 self.attempted_authentications
1389 .remove(interception_id.as_ref());
1390 }
1391 self.queued_events
1392 .push_back(NetworkEvent::RequestFailed(request));
1393 }
1394 }
1395
1396 fn on_request(
1398 &mut self,
1399 event: &EventRequestWillBeSent,
1400 interception_id: Option<InterceptionId>,
1401 ) {
1402 let mut redirect_chain = Vec::new();
1403 let mut redirect_location = None;
1404
1405 if let Some(redirect_resp) = &event.redirect_response {
1406 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1407 if is_redirect_status(redirect_resp.status) {
1408 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1409 if redirect_resp.url != location {
1410 let fixed_location = location.replace(&redirect_resp.url, "");
1411
1412 if !fixed_location.is_empty() {
1413 request.response.as_mut().map(|resp| {
1414 resp.headers.0["Location"] =
1415 serde_json::Value::String(fixed_location.clone());
1416 });
1417 }
1418
1419 redirect_location = Some(fixed_location);
1420 }
1421 }
1422 }
1423
1424 self.handle_request_redirect(
1425 &mut request,
1426 if let Some(redirect_location) = redirect_location {
1427 let mut redirect_resp = redirect_resp.clone();
1428
1429 if !redirect_location.is_empty() {
1430 redirect_resp.headers.0["Location"] =
1431 serde_json::Value::String(redirect_location);
1432 }
1433
1434 redirect_resp
1435 } else {
1436 redirect_resp.clone()
1437 },
1438 );
1439
1440 redirect_chain = std::mem::take(&mut request.redirect_chain);
1441 redirect_chain.push(request);
1442 }
1443 }
1444
1445 let request = HttpRequest::new(
1446 event.request_id.clone(),
1447 event.frame_id.clone(),
1448 interception_id,
1449 self.user_request_interception_enabled,
1450 redirect_chain,
1451 );
1452
1453 self.requests.insert(event.request_id.clone(), request);
1454 self.queued_events
1455 .push_back(NetworkEvent::Request(event.request_id.clone()));
1456 }
1457
1458 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1460 request.set_response(response);
1461 if let Some(interception_id) = request.interception_id.as_ref() {
1462 self.attempted_authentications
1463 .remove(interception_id.as_ref());
1464 }
1465 }
1466}
1467
/// Events the network manager pushes onto its `queued_events` queue.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP command as a method id plus a JSON value.
    /// NOTE(review): presumably produced by `push_cdp_request`, which is not visible
    /// here; confirm.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request with this id has just been registered as tracked.
    Request(RequestId),
    /// NOTE(review): not emitted by any of the handlers visible here; presumably signals
    /// a response for the given request id. Confirm against the consumer.
    Response(RequestId),
    /// The request failed: loading failed, or its response arrived with the byte budget
    /// at zero.
    RequestFailed(HttpRequest),
    /// The request completed: its response was received or loading finished.
    RequestFinished(HttpRequest),
    /// Number of bytes a received response removed from the byte budget.
    BytesConsumed(u64),
}
1483
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// The third-party allow-list admits challenge, payment, captcha and CDN scripts but
    /// not the Cloudflare Insights beacon.
    ///
    /// An identical copy of this test used to exist under the name
    /// `test_allowed_matcher_3rd_party_sanity`; it added no coverage and was removed.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// A script that matches no blocklist entry is allowed.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    /// A script that matches the ignore trie or the blocklist is blocked.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    /// URLs that mention the page's site keyword count as related third parties; an
    /// unrelated beacon does not.
    #[test]
    fn test_related_3rd_party_keyword_fast_gelcom() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://www.gelcom.de/".to_string());

        let related = [
            // Third-party CDN whose path contains the site keyword.
            "https://tags-eu.tiqcdn.com/utag/gelcom/oneshop-eu/prod/utag.js",
            // Sibling subdomains of the page's domain.
            "https://www2.gelcom.de/forward/ablyft-cdn/s/55651514.js",
            "https://ebs01.gelcom.de/resout/legalnote-replacer/legalnote-replacer-oneshop.js",
        ];
        for url in related {
            assert!(
                nm.is_related_3rd_party_by_keyword_fast(url),
                "expected {} to be treated as related to the page",
                url
            );
        }

        let unrelated = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(
            !nm.is_related_3rd_party_by_keyword_fast(unrelated),
            "expected {} to be treated as unrelated to the page",
            unrelated
        );
    }
}