1use super::blockers::{
2 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3 xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15 EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17 InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18 SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21 fetch::{
22 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24 },
25 network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
lazy_static! {
    /// Substrings that are compiled into [`ALLOWED_MATCHER`]. The list mixes
    /// bare framework/bundle names, path fragments and full URL prefixes.
    static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
        "jquery", "angular",
        "react", "vue", "bootstrap",
        "d3",
        "lodash",
        "ajax",
        "application",
        "app", "main",
        "index",
        "bundle",
        "vendor",
        "runtime",
        "polyfill",
        "scripts",
        "es2015.",
        "es2020.",
        "webpack",
        "captcha",
        "client",
        "/cdn-cgi/challenge-platform/",
        "/wp-content/js/", "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://www.google.com/recaptcha/",
        "https://google.com/recaptcha/api.js",
        "https://www.gstatic.com/recaptcha/",
        "https://captcha.px-cloud.net/",
        "https://geo.captcha-delivery.com/",
        "https://api.leminnow.com/captcha/",
        "https://cdn.auth0.com/js/lock/",
        "https://captcha.gtimg.com",
        "https://client-api.arkoselabs.com/",
        "https://www.capy.me/puzzle/",
        "https://newassets.hcaptcha.com/",
        "https://cdn.auth0.com/client",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
    ];

    /// Aho-Corasick matcher built from [`JS_FRAMEWORK_ALLOW`].
    /// Building the automaton is expected to succeed; a failure panics on first use.
    pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");

    /// Substrings (mostly third-party URL prefixes for captcha, challenge and
    /// payment scripts) that are compiled into [`ALLOWED_MATCHER_3RD_PARTY`].
    // NOTE(review): a few entries are listed twice (e.g. the captcha-delivery
    // and px-cloud hosts); harmless for `is_match`, but confirm nobody relies
    // on pattern ids before de-duplicating.
    static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
        "https://m.stripe.network/",
        "https://challenges.cloudflare.com/",
        "https://js.stripe.com/",
        "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
        "https://ct.captcha-delivery.com/",
        "https://geo.captcha-delivery.com/",
        "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://ct.captcha-delivery.com/",
        "https://cdn.auth0.com/client",
        "https://captcha.px-cloud.net/",
        "https://www.capy.me/puzzle/",
        "https://www.gstatic.com/recaptcha/",
        "https://google.com/recaptcha/",
        "https://www.google.com/recaptcha/",
        "https://www.recaptcha.net/recaptcha/",
        "https://js.hcaptcha.com/1/api.js",
        "https://hcaptcha.com/1/api.js",
        "https://js.datadome.co/tags.js",
        "https://api-js.datadome.co/",
        "https://client.perimeterx.net/",
        "https://captcha.px-cdn.net/",
        "https://newassets.hcaptcha.com/",
        "https://captcha.px-cloud.net/",
        "https://s.perimeterx.net/",
        "https://api.leminnow.com/captcha/",
        "https://client-api.arkoselabs.com/",
        "https://static.geetest.com/v4/gt4.js",
        "https://static.geetest.com/",
        "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
        "https://cdn.perfdrive.com/aperture/",
        "https://assets.queue-it.net/",
        "discourse-cdn.com/",
        "hcaptcha.com",
        "/cdn-cgi/challenge-platform/",
        "/_Incapsula_Resource"
    ];

    /// Aho-Corasick matcher built from [`JS_FRAMEWORK_ALLOW_3RD_PARTY`].
    /// `NetworkManager::on_fetch_request_paused` uses it to un-block script
    /// requests that an earlier rule had marked as skipped.
    pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");

    /// Path fragments associated with JS frameworks.
    // NOTE(review): not referenced in the visible part of this file — confirm callers.
    pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
        phf::phf_set! {
            "_astro/", "_app/immutable"
        }
    };

    /// MIME content types for documents, archives, images, audio and video.
    // NOTE(review): presumably responses with these types are ignored by a
    // consumer elsewhere; not referenced in the visible part of this file.
    pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
        "application/pdf",
        "application/zip",
        "application/x-rar-compressed",
        "application/x-tar",
        "image/png",
        "image/jpeg",
        "image/gif",
        "image/bmp",
        "image/webp",
        "image/svg+xml",
        "video/mp4",
        "video/x-msvideo",
        "video/x-matroska",
        "video/webm",
        "audio/mpeg",
        "audio/ogg",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/x-7z-compressed",
        "application/x-rpm",
        "application/x-shockwave-flash",
        "application/rtf",
    };

    /// Resource type names that are skipped when `ignore_visuals` is set
    /// (see `NetworkManager::should_skip_for_visuals_and_basic`).
    pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "Image",
        "Media",
        "Font"
    };

    /// Resource type names that `NetworkManager::on_fetch_request_paused`
    /// marks as skipped up front, before any URL-based rule runs.
    pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
        "CspViolationReport",
        "Manifest",
        "Other",
        "Prefetch",
        "Ping",
    };

    /// Case-insensitive `css` extension used by `NetworkManager::skip_xhr`
    /// when stylesheet blocking is enabled.
    pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");

    /// Initial command chain: the serialized network `EnableParams` command.
    /// Empty if serialization fails.
    pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
        let enable = EnableParams::default();

        if let Ok(c) = serde_json::to_value(&enable) {
            vec![(enable.identifier(), c)]
        } else {
            vec![]
        }
    };

    /// Initial command chain that, in addition to the network enable command,
    /// sends `SetIgnoreCertificateErrorsParams::new(true)`. Commands that fail
    /// to serialize are left out.
    pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
        let enable = EnableParams::default();
        let mut v = vec![];
        if let Ok(c) = serde_json::to_value(&enable) {
            v.push((enable.identifier(), c));
        }
        let ignore = SetIgnoreCertificateErrorsParams::new(true);
        if let Ok(ignored) = serde_json::to_value(&ignore) {
            v.push((ignore.identifier(), ignored));
        }

        v
    };

    /// Fetch-domain enable command: handles auth requests and pauses every URL
    /// (`*`) at the request stage.
    pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
        fetch::EnableParams::builder()
            .handle_auth_requests(true)
            .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
            .build()
    };
}
231
/// Returns `true` when `status` is one of the HTTP redirect codes
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
236
/// Tracks network state for a target: queued outgoing events, in-flight
/// requests, interception/authentication state and the blocking rules applied
/// to paused fetch requests.
#[derive(Debug)]
pub struct NetworkManager {
    /// Events waiting to be handed out one at a time by `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// When set, `init_commands` uses the chain that also sends
    /// `SetIgnoreCertificateErrorsParams::new(true)`.
    ignore_httpserrors: bool,
    /// In-flight requests keyed by network request id; entries are removed when
    /// a response, loading-finished or loading-failed event arrives.
    requests: HashMap<RequestId, HttpRequest>,
    /// `requestWillBeSent` events parked until the matching fetch
    /// `requestPaused` event shows up.
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers last set through `set_extra_headers`.
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception ids from `requestPaused` events that arrived before their
    /// `requestWillBeSent` counterpart.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the user asked for the cache to be disabled.
    user_cache_disabled: bool,
    /// Request ids for which credentials were already supplied once; a second
    /// auth challenge for the same id is cancelled.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials supplied through `authenticate`, if any.
    credentials: Option<Credentials>,
    /// Whether the user enabled request interception.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed as blocked-by-client.
    block_all: bool,
    /// Whether fetch interception is considered enabled at the protocol level.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Last offline-emulation value sent via `set_offline_mode`.
    offline: bool,
    /// Timeout handed to the `CommandChain` built by `init_commands`.
    pub request_timeout: Duration,
    /// Skip visual resources (see `IGNORE_VISUAL_RESOURCE_MAP`) and related assets.
    pub ignore_visuals: bool,
    /// Skip stylesheet resources.
    pub block_stylesheets: bool,
    /// Enables the embedded-script check in `on_fetch_request_paused`
    /// (only together with `only_html` or `ignore_visuals`).
    pub block_javascript: bool,
    /// Apply the analytics/tracker block lists. Defaults to `true` in `new`.
    pub block_analytics: bool,
    /// See `block_javascript`.
    pub only_html: bool,
    /// Set when the first document URL ends with `.xml`; `.xsl` requests are
    /// then exempt from the visual/stylesheet skip.
    pub xml_document: bool,
    /// Site-specific interception rules consulted for scripts, data requests
    /// and documents.
    pub intercept_manager: NetworkInterceptManager,
    /// Number of times the target document URL was requested again; at 3 or
    /// more, further requests are skipped.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request (or the value from `set_page_url`).
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`.
    pub document_target_domain: String,
    /// Remaining byte budget; when it reaches zero all further requests are blocked.
    pub max_bytes_allowed: Option<u64>,
    /// Key used to look up session cache entries for this site.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// Policy deciding whether a cached entry may be served.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Substring patterns that un-block a request skipped by another rule.
    whitelist_patterns: Vec<String>,
    /// Matcher compiled from `whitelist_patterns`; `None` when empty or when
    /// compilation failed.
    whitelist_matcher: Option<AhoCorasick>,
    /// Substring patterns for URLs that should be blocked.
    blacklist_patterns: Vec<String>,
    /// Matcher compiled from `blacklist_patterns`; `None` when empty or when
    /// compilation failed.
    blacklist_matcher: Option<AhoCorasick>,
    /// When `true` (the default) a blacklist hit overrides every allow rule,
    /// including the whitelist; when `false` it can still be un-blocked.
    blacklist_strict: bool,
}
359
360impl NetworkManager {
361 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
363 Self {
364 queued_events: Default::default(),
365 ignore_httpserrors,
366 requests: Default::default(),
367 requests_will_be_sent: Default::default(),
368 extra_headers: Default::default(),
369 request_id_to_interception_id: Default::default(),
370 user_cache_disabled: false,
371 attempted_authentications: Default::default(),
372 credentials: None,
373 block_all: false,
374 user_request_interception_enabled: false,
375 protocol_request_interception_enabled: false,
376 offline: false,
377 request_timeout,
378 ignore_visuals: false,
379 block_javascript: false,
380 block_stylesheets: false,
381 block_analytics: true,
382 only_html: false,
383 xml_document: false,
384 intercept_manager: NetworkInterceptManager::Unknown,
385 document_reload_tracker: 0,
386 document_target_url: String::new(),
387 document_target_domain: String::new(),
388 whitelist_patterns: Vec::new(),
389 whitelist_matcher: None,
390 blacklist_patterns: Vec::new(),
391 blacklist_matcher: None,
392 blacklist_strict: true,
393 max_bytes_allowed: None,
394 #[cfg(feature = "_cache")]
395 cache_site_key: None,
396 #[cfg(feature = "_cache")]
397 cache_policy: None,
398 }
399 }
400
401 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
403 where
404 I: IntoIterator<Item = S>,
405 S: Into<String>,
406 {
407 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
408 self.rebuild_whitelist_matcher();
409 }
410
411 pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
413 where
414 I: IntoIterator<Item = S>,
415 S: Into<String>,
416 {
417 self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
418 self.rebuild_blacklist_matcher();
419 }
420
421 pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
423 self.blacklist_patterns.push(pattern.into());
424 self.rebuild_blacklist_matcher();
425 }
426
427 pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
429 where
430 I: IntoIterator<Item = S>,
431 S: Into<String>,
432 {
433 self.blacklist_patterns
434 .extend(patterns.into_iter().map(Into::into));
435 self.rebuild_blacklist_matcher();
436 }
437
438 pub fn clear_blacklist(&mut self) {
440 self.blacklist_patterns.clear();
441 self.blacklist_matcher = None;
442 }
443
    /// Sets whether a blacklist hit is final (`true`) or may still be
    /// overridden by the allow rules applied later in
    /// `on_fetch_request_paused` (`false`).
    pub fn set_blacklist_strict(&mut self, strict: bool) {
        self.blacklist_strict = strict;
    }
448
449 #[inline]
450 fn rebuild_blacklist_matcher(&mut self) {
451 if self.blacklist_patterns.is_empty() {
452 self.blacklist_matcher = None;
453 return;
454 }
455
456 let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
457 self.blacklist_matcher = AhoCorasick::new(refs).ok();
458 }
459
460 #[inline]
461 fn is_blacklisted(&self, url: &str) -> bool {
462 self.blacklist_matcher
463 .as_ref()
464 .map(|m| m.is_match(url))
465 .unwrap_or(false)
466 }
467
468 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
470 self.whitelist_patterns.push(pattern.into());
471 self.rebuild_whitelist_matcher();
472 }
473
474 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
476 where
477 I: IntoIterator<Item = S>,
478 S: Into<String>,
479 {
480 self.whitelist_patterns
481 .extend(patterns.into_iter().map(Into::into));
482 self.rebuild_whitelist_matcher();
483 }
484
485 #[inline]
486 fn rebuild_whitelist_matcher(&mut self) {
487 if self.whitelist_patterns.is_empty() {
488 self.whitelist_matcher = None;
489 return;
490 }
491
492 let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
493
494 self.whitelist_matcher = AhoCorasick::new(refs).ok();
496 }
497
498 #[inline]
499 fn is_whitelisted(&self, url: &str) -> bool {
500 self.whitelist_matcher
501 .as_ref()
502 .map(|m| m.is_match(url))
503 .unwrap_or(false)
504 }
505
506 pub fn init_commands(&self) -> CommandChain {
508 let cmds = if self.ignore_httpserrors {
509 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
510 } else {
511 INIT_CHAIN.clone()
512 };
513 CommandChain::new(cmds, self.request_timeout)
514 }
515
516 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
518 let method = cmd.identifier();
519 if let Ok(params) = serde_json::to_value(cmd) {
520 self.queued_events
521 .push_back(NetworkEvent::SendCdpRequest((method, params)));
522 }
523 }
524
    /// Pops the oldest queued event, or returns `None` when the queue is empty.
    pub fn poll(&mut self) -> Option<NetworkEvent> {
        self.queued_events.pop_front()
    }
529
    /// Returns the extra HTTP headers currently stored on the manager.
    pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
        &self.extra_headers
    }
534
535 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
537 self.extra_headers = headers;
538 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
539 self.extra_headers.remove("Proxy-Authorization");
540 if !self.extra_headers.is_empty() {
541 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
542 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
543 }
544 }
545 }
546
    /// Queues a `SetBypassServiceWorkerParams` command with `bypass` forwarded
    /// unchanged.
    // NOTE(review): despite the method name, the argument is the *bypass*
    // flag, i.e. `true` asks the browser to bypass service workers.
    pub fn set_service_worker_enabled(&mut self, bypass: bool) {
        self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
    }
550
    /// Sets the `block_all` flag; while set, every paused fetch request is
    /// failed as blocked-by-client.
    pub fn set_block_all(&mut self, block_all: bool) {
        self.block_all = block_all;
    }
554
    /// Records the user's request-interception preference and then syncs the
    /// fetch domain state with it.
    pub fn set_request_interception(&mut self, enabled: bool) {
        self.user_request_interception_enabled = enabled;
        // Must run after the flag is stored: the sync reads it.
        self.update_protocol_request_interception();
    }
559
560 pub fn set_cache_enabled(&mut self, enabled: bool) {
561 let run = self.user_cache_disabled != !enabled;
562 self.user_cache_disabled = !enabled;
563 if run {
564 self.update_protocol_cache_disabled();
565 }
566 }
567
    /// Marks protocol-level request interception as enabled. Only the local
    /// flag is changed; no command is queued.
    pub fn enable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = true;
    }
572
    /// Marks protocol-level request interception as disabled. Only the local
    /// flag is changed; no command is queued.
    pub fn disable_request_intercept(&mut self) {
        self.protocol_request_interception_enabled = false;
    }
577
    /// Sets (or clears) the site key used for session-cache lookups.
    #[cfg(feature = "_cache")]
    pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
        self.cache_site_key = cache_site_key;
    }
583
    /// Sets (or clears) the policy deciding whether cached entries may be served.
    #[cfg(feature = "_cache")]
    pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
        self.cache_policy = cache_policy;
    }
589
590 pub fn update_protocol_cache_disabled(&mut self) {
591 self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
592 }
593
    /// Stores `credentials` for answering auth challenges and turns on fetch
    /// interception.
    pub fn authenticate(&mut self, credentials: Credentials) {
        self.credentials = Some(credentials);
        // Order matters: the sync compares the desired state against the
        // protocol flag, so it has to run before the flag is forced to `true`
        // or the enable command would never be queued.
        self.update_protocol_request_interception();
        self.protocol_request_interception_enabled = true;
    }
599
600 fn update_protocol_request_interception(&mut self) {
601 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
602
603 if enabled == self.protocol_request_interception_enabled {
604 return;
605 }
606
607 if enabled {
608 self.push_cdp_request(ENABLE_FETCH.clone())
609 } else {
610 self.push_cdp_request(DisableParams::default())
611 }
612 }
613
614 #[inline]
617 fn should_block_script_blocklist_only(&self, url: &str) -> bool {
618 let block_analytics = self.block_analytics;
620
621 if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
623 {
624 return true;
625 }
626
627 if crate::handler::blockers::block_websites::block_website(url) {
629 return true;
630 }
631
632 if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
639 let p_slash = Self::strip_query_fragment(path_with_slash);
641 let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
642
643 let base = match p_slash.rsplit('/').next() {
645 Some(b) => b,
646 None => p_slash,
647 };
648
649 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
652 return true;
653 }
654 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
655 return true;
656 }
657 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
658 return true;
659 }
660
661 if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
664 return true;
665 }
666
667 if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
669 return true;
670 }
671 }
672
673 false
674 }
675
676 #[inline]
681 fn url_path_with_leading_slash<'a>(url: &'a str) -> Option<&'a str> {
682 let idx = url.find("//")?;
684 let after_slashes = idx + 2;
685
686 let slash_rel = url[after_slashes..].find('/')?;
688 let slash_idx = after_slashes + slash_rel;
689
690 if slash_idx < url.len() {
691 Some(&url[slash_idx..])
692 } else {
693 None
694 }
695 }
696
697 #[inline]
702 fn strip_query_fragment(s: &str) -> &str {
703 let q = s.find('?');
704 let h = s.find('#');
705
706 match (q, h) {
707 (None, None) => s,
708 (Some(i), None) => &s[..i],
709 (None, Some(i)) => &s[..i],
710 (Some(i), Some(j)) => &s[..i.min(j)],
711 }
712 }
713
714 #[inline]
716 fn skip_xhr(
717 &self,
718 skip_networking: bool,
719 event: &EventRequestPaused,
720 network_event: bool,
721 ) -> bool {
722 if !skip_networking && network_event {
724 let request_url = event.request.url.as_str();
725
726 let skip_analytics =
728 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
729
730 if skip_analytics {
731 true
732 } else if self.block_stylesheets || self.ignore_visuals {
733 let block_css = self.block_stylesheets;
734 let block_media = self.ignore_visuals;
735
736 let mut block_request = false;
737
738 if let Some(position) = request_url.rfind('.') {
739 let hlen = request_url.len();
740 let has_asset = hlen - position;
741
742 if has_asset >= 3 {
743 let next_position = position + 1;
744
745 if block_media
746 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
747 &request_url[next_position..].into(),
748 )
749 {
750 block_request = true;
751 } else if block_css {
752 block_request =
753 CaseInsensitiveString::from(request_url[next_position..].as_bytes())
754 .contains(&**CSS_EXTENSION)
755 }
756 }
757 }
758
759 if !block_request {
760 block_request = ignore_script_xhr_media(request_url);
761 }
762
763 block_request
764 } else {
765 skip_networking
766 }
767 } else {
768 skip_networking
769 }
770 }
771
772 #[cfg(feature = "adblock")]
773 #[inline]
774 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
776 if skip_networking {
777 true
778 } else {
779 block_ads(&event.request.url) || self.detect_ad(event)
780 }
781 }
782
783 #[cfg(not(feature = "adblock"))]
785 #[inline]
786 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
787 use crate::handler::blockers::block_websites::block_ads;
788 if skip_networking {
789 true
790 } else {
791 block_ads(&event.request.url)
792 }
793 }
794
795 #[inline]
796 fn fail_request_blocked(
798 &mut self,
799 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
800 ) {
801 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
802 request_id.clone(),
803 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
804 );
805 self.push_cdp_request(params);
806 }
807
808 #[inline]
809 fn fulfill_request_empty_200(
811 &mut self,
812 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
813 ) {
814 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
815 request_id.clone(),
816 200,
817 );
818 self.push_cdp_request(params);
819 }
820
821 #[cfg(feature = "_cache")]
822 #[inline]
823 fn fulfill_request_from_cache(
827 &mut self,
828 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
829 body: &[u8],
830 headers: &std::collections::HashMap<String, String>,
831 status: i64,
832 ) {
833 use crate::cdp::browser_protocol::fetch::HeaderEntry;
834 use crate::handler::network::fetch::FulfillRequestParams;
835 use base64::Engine;
836
837 let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
838
839 for (k, v) in headers.iter() {
840 resp_headers.push(HeaderEntry {
841 name: k.clone().into(),
842 value: v.clone().into(),
843 });
844 }
845
846 let mut params = FulfillRequestParams::new(request_id.clone(), status);
847
848 params.body = Some(
850 base64::engine::general_purpose::STANDARD
851 .encode(body)
852 .into(),
853 );
854
855 params.response_headers = Some(resp_headers);
856
857 self.push_cdp_request(params);
858 }
859
860 #[inline]
861 fn continue_request_with_url(
863 &mut self,
864 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
865 url: Option<&str>,
866 intercept_response: bool,
867 ) {
868 let mut params = ContinueRequestParams::new(request_id.clone());
869 if let Some(url) = url {
870 params.url = Some(url.to_string());
871 params.intercept_response = Some(intercept_response);
872 }
873 self.push_cdp_request(params);
874 }
875
876 #[inline]
878 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
879 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
880 return;
881 }
882
883 let resource_type = &event.resource_type;
884
885 if self.block_all {
886 tracing::debug!(
887 "Blocked (block_all): {:?} - {}",
888 event.resource_type,
889 event.request.url
890 );
891 return self.fail_request_blocked(&event.request_id);
892 }
893
894 if let Some(network_id) = event.network_id.as_ref() {
895 if let Some(request_will_be_sent) =
896 self.requests_will_be_sent.remove(network_id.as_ref())
897 {
898 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
899 } else {
900 self.request_id_to_interception_id
901 .insert(network_id.clone(), event.request_id.clone().into());
902 }
903 }
904
905 let javascript_resource = *resource_type == ResourceType::Script;
907 let document_resource = *resource_type == ResourceType::Document;
908 let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
909
910 let mut skip_networking =
912 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
913
914 if !skip_networking {
916 skip_networking = self.document_reload_tracker >= 3;
917 }
918
919 let (current_url_cow, had_replacer) =
921 self.handle_document_replacement_and_tracking(event, document_resource);
922
923 let current_url: &str = current_url_cow.as_ref();
924
925 let blacklisted = self.is_blacklisted(current_url);
926
927 if !self.blacklist_strict && blacklisted {
928 skip_networking = true;
929 }
930
931 if !skip_networking {
932 if self.xml_document && current_url.ends_with(".xsl") {
934 skip_networking = false;
935 } else {
936 skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
937 }
938 }
939
940 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
941
942 if !skip_networking
944 && self.block_javascript
945 && (self.only_html || self.ignore_visuals)
946 && (javascript_resource || document_resource)
947 {
948 skip_networking = ignore_script_embedded(current_url);
949 }
950
951 if !skip_networking && javascript_resource {
954 skip_networking = self.should_block_script_blocklist_only(current_url);
955 }
956
957 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
959
960 if !skip_networking && (javascript_resource || network_resource || document_resource) {
962 skip_networking = self.intercept_manager.intercept_detection(
963 current_url,
964 self.ignore_visuals,
965 network_resource,
966 );
967 }
968
969 if !skip_networking && (javascript_resource || network_resource) {
971 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
972 }
973
974 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
977 {
978 skip_networking = false;
979 }
980
981 if skip_networking && self.is_whitelisted(current_url) {
983 skip_networking = false;
984 }
985
986 if self.blacklist_strict && blacklisted {
987 skip_networking = true;
988 }
989
990 if skip_networking {
991 tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
992 self.fulfill_request_empty_200(&event.request_id);
993 } else {
994 #[cfg(feature = "_cache")]
995 {
996 if let (Some(policy), Some(cache_site_key)) =
997 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
998 {
999 let current_url = format!("{}:{}", event.request.method, ¤t_url);
1000
1001 if let Some((res, cache_policy)) =
1002 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
1003 {
1004 if policy.allows_cached(&cache_policy) {
1005 tracing::debug!(
1006 "Remote Cached: {:?} - {}",
1007 resource_type,
1008 ¤t_url
1009 );
1010 return self.fulfill_request_from_cache(
1011 &event.request_id,
1012 &res.body,
1013 &res.headers,
1014 res.status as i64,
1015 );
1016 }
1017 }
1018 }
1019 }
1020
1021 tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
1023 self.continue_request_with_url(
1024 &event.request_id,
1025 if had_replacer {
1026 Some(current_url)
1027 } else {
1028 None
1029 },
1030 !had_replacer,
1031 );
1032 }
1033 }
1034
1035 #[inline]
1041 fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1042 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1043 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1044 }
1045
    /// Returns `true` when a document target is set.
    // Note: this tests `document_target_url`, not `document_target_domain`.
    pub fn has_target_domain(&self) -> bool {
        !self.document_target_url.is_empty()
    }
1050
1051 pub fn set_page_url(&mut self, page_target_url: String) {
1053 let host_base = host_and_rest(&page_target_url)
1054 .map(|(h, _)| base_domain_from_host(h))
1055 .unwrap_or("");
1056
1057 self.document_target_domain = host_base.to_string();
1058 self.document_target_url = page_target_url;
1059 }
1060
1061 pub fn clear_target_domain(&mut self) {
1063 self.document_reload_tracker = 0;
1064 self.document_target_url = Default::default();
1065 self.document_target_domain = Default::default();
1066 }
1067
1068 #[inline]
1076 fn handle_document_replacement_and_tracking<'a>(
1077 &mut self,
1078 event: &'a EventRequestPaused,
1079 document_resource: bool,
1080 ) -> (Cow<'a, str>, bool) {
1081 let mut replacer: Option<String> = None;
1082 let current_url = event.request.url.as_str();
1083
1084 if document_resource {
1085 if self.document_target_url == current_url {
1086 self.document_reload_tracker += 1;
1087 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1088 {
1089 let (http_document_replacement, mut https_document_replacement) =
1090 if self.document_target_url.starts_with("http://") {
1091 (
1092 self.document_target_url.replacen("http://", "http//", 1),
1093 self.document_target_url.replacen("http://", "https://", 1),
1094 )
1095 } else {
1096 (
1097 self.document_target_url.replacen("https://", "https//", 1),
1098 self.document_target_url.replacen("https://", "http://", 1),
1099 )
1100 };
1101
1102 let trailing = https_document_replacement.ends_with('/');
1104 if trailing {
1105 https_document_replacement.pop();
1106 }
1107 if https_document_replacement.ends_with('/') {
1108 https_document_replacement.pop();
1109 }
1110
1111 let redirect_mask = format!(
1112 "{}{}",
1113 https_document_replacement, http_document_replacement
1114 );
1115
1116 if current_url == redirect_mask {
1117 replacer = Some(if trailing {
1118 format!("{}/", https_document_replacement)
1119 } else {
1120 https_document_replacement
1121 });
1122 }
1123 }
1124
1125 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1126 self.xml_document = true;
1127 }
1128
1129 self.document_target_url = event.request.url.clone();
1131 self.document_target_domain = host_and_rest(&self.document_target_url)
1132 .map(|(h, _)| base_domain_from_host(h).to_string())
1133 .unwrap_or_default();
1134 }
1135
1136 let current_url_cow = match replacer {
1137 Some(r) => Cow::Owned(r),
1138 None => Cow::Borrowed(event.request.url.as_str()),
1139 };
1140
1141 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1142 (current_url_cow, had_replacer)
1143 }
1144
1145 #[cfg(feature = "adblock")]
1147 pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1148 use adblock::{
1149 lists::{FilterSet, ParseOptions, RuleTypes},
1150 Engine,
1151 };
1152
1153 lazy_static::lazy_static! {
1154 static ref AD_ENGINE: Engine = {
1155 let mut filter_set = FilterSet::new(false);
1156 let mut rules = ParseOptions::default();
1157 rules.rule_types = RuleTypes::All;
1158
1159 filter_set.add_filters(
1160 &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1161 rules,
1162 );
1163
1164 Engine::from_filter_set(filter_set, true)
1165 };
1166 };
1167
1168 let blockable = ResourceType::Image == event.resource_type
1169 || event.resource_type == ResourceType::Media
1170 || event.resource_type == ResourceType::Stylesheet
1171 || event.resource_type == ResourceType::Document
1172 || event.resource_type == ResourceType::Fetch
1173 || event.resource_type == ResourceType::Xhr;
1174
1175 let u = &event.request.url;
1176
1177 let block_request = blockable
1178 && {
1180 let request = adblock::request::Request::preparsed(
1181 &u,
1182 "example.com",
1183 "example.com",
1184 &event.resource_type.as_ref().to_lowercase(),
1185 !event.request.is_same_site.unwrap_or_default());
1186
1187 AD_ENGINE.check_network_request(&request).matched
1188 };
1189
1190 block_request
1191 }
1192
1193 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1194 let response = if self
1195 .attempted_authentications
1196 .contains(event.request_id.as_ref())
1197 {
1198 AuthChallengeResponseResponse::CancelAuth
1199 } else if self.credentials.is_some() {
1200 self.attempted_authentications
1201 .insert(event.request_id.clone().into());
1202 AuthChallengeResponseResponse::ProvideCredentials
1203 } else {
1204 AuthChallengeResponseResponse::Default
1205 };
1206
1207 let mut auth = AuthChallengeResponse::new(response);
1208 if let Some(creds) = self.credentials.clone() {
1209 auth.username = Some(creds.username);
1210 auth.password = Some(creds.password);
1211 }
1212 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1213 }
1214
1215 pub fn set_offline_mode(&mut self, value: bool) {
1217 if self.offline == value {
1218 return;
1219 }
1220 self.offline = value;
1221 if let Ok(network) = EmulateNetworkConditionsParams::builder()
1222 .offline(self.offline)
1223 .latency(0)
1224 .download_throughput(-1.)
1225 .upload_throughput(-1.)
1226 .build()
1227 {
1228 self.push_cdp_request(network);
1229 }
1230 }
1231
1232 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1234 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1235 if let Some(interception_id) = self
1236 .request_id_to_interception_id
1237 .remove(event.request_id.as_ref())
1238 {
1239 self.on_request(event, Some(interception_id));
1240 } else {
1241 self.requests_will_be_sent
1243 .insert(event.request_id.clone(), event.clone());
1244 }
1245 } else {
1246 self.on_request(event, None);
1247 }
1248 }
1249
    /// Handles `Network.requestServedFromCache`: marks the tracked request,
    /// if there is one for this id, as served from the memory cache.
    pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
        if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
            request.from_memory_cache = true;
        }
    }
1256
1257 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1259 let mut request_failed = false;
1260
1261 let mut deducted: u64 = 0;
1263
1264 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1265 let before = *max_bytes;
1266
1267 let received_bytes: u64 = event.response.encoded_data_length as u64;
1269
1270 let content_length: Option<u64> = event
1272 .response
1273 .headers
1274 .inner()
1275 .get("content-length")
1276 .and_then(|v| v.as_str())
1277 .and_then(|s| s.trim().parse::<u64>().ok());
1278
1279 *max_bytes = max_bytes.saturating_sub(received_bytes);
1281
1282 if let Some(cl) = content_length {
1284 if cl > *max_bytes {
1285 *max_bytes = 0;
1286 }
1287 }
1288
1289 request_failed = *max_bytes == 0;
1290
1291 deducted = before.saturating_sub(*max_bytes);
1293 }
1294
1295 if deducted > 0 {
1297 self.queued_events
1298 .push_back(NetworkEvent::BytesConsumed(deducted));
1299 }
1300
1301 if request_failed && self.max_bytes_allowed.is_some() {
1303 self.set_block_all(true);
1304 }
1305
1306 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1307 request.set_response(event.response.clone());
1308 self.queued_events.push_back(if request_failed {
1309 NetworkEvent::RequestFailed(request)
1310 } else {
1311 NetworkEvent::RequestFinished(request)
1312 });
1313 }
1314 }
1315
1316 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1318 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1319 if let Some(interception_id) = request.interception_id.as_ref() {
1320 self.attempted_authentications
1321 .remove(interception_id.as_ref());
1322 }
1323 self.queued_events
1324 .push_back(NetworkEvent::RequestFinished(request));
1325 }
1326 }
1327
1328 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1330 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1331 request.failure_text = Some(event.error_text.clone());
1332 if let Some(interception_id) = request.interception_id.as_ref() {
1333 self.attempted_authentications
1334 .remove(interception_id.as_ref());
1335 }
1336 self.queued_events
1337 .push_back(NetworkEvent::RequestFailed(request));
1338 }
1339 }
1340
1341 fn on_request(
1343 &mut self,
1344 event: &EventRequestWillBeSent,
1345 interception_id: Option<InterceptionId>,
1346 ) {
1347 let mut redirect_chain = Vec::new();
1348 let mut redirect_location = None;
1349
1350 if let Some(redirect_resp) = &event.redirect_response {
1351 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1352 if is_redirect_status(redirect_resp.status) {
1353 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1354 if redirect_resp.url != location {
1355 let fixed_location = location.replace(&redirect_resp.url, "");
1356
1357 if !fixed_location.is_empty() {
1358 request.response.as_mut().map(|resp| {
1359 resp.headers.0["Location"] =
1360 serde_json::Value::String(fixed_location.clone());
1361 });
1362 }
1363
1364 redirect_location = Some(fixed_location);
1365 }
1366 }
1367 }
1368
1369 self.handle_request_redirect(
1370 &mut request,
1371 if let Some(redirect_location) = redirect_location {
1372 let mut redirect_resp = redirect_resp.clone();
1373
1374 if !redirect_location.is_empty() {
1375 redirect_resp.headers.0["Location"] =
1376 serde_json::Value::String(redirect_location);
1377 }
1378
1379 redirect_resp
1380 } else {
1381 redirect_resp.clone()
1382 },
1383 );
1384
1385 redirect_chain = std::mem::take(&mut request.redirect_chain);
1386 redirect_chain.push(request);
1387 }
1388 }
1389
1390 let request = HttpRequest::new(
1391 event.request_id.clone(),
1392 event.frame_id.clone(),
1393 interception_id,
1394 self.user_request_interception_enabled,
1395 redirect_chain,
1396 );
1397
1398 self.requests.insert(event.request_id.clone(), request);
1399 self.queued_events
1400 .push_back(NetworkEvent::Request(event.request_id.clone()));
1401 }
1402
1403 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1405 request.set_response(response);
1406 if let Some(interception_id) = request.interception_id.as_ref() {
1407 self.attempted_authentications
1408 .remove(interception_id.as_ref());
1409 }
1410 }
1411}
1412
/// Events the network manager places on its internal queue.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP command as a method id plus JSON parameters.
    /// NOTE(review): presumably produced by `push_cdp_request` for the
    /// consumer to send to the browser — confirm against that helper.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request with this id has been registered in the manager's request map.
    Request(RequestId),
    /// NOTE(review): no handler in this section emits this variant; presumably
    /// signals a response for the given request id — confirm usage.
    Response(RequestId),
    /// The request ended unsuccessfully: loading failed, or the byte budget
    /// was exhausted when its response arrived.
    RequestFailed(HttpRequest),
    /// The request completed: its response was received or loading finished.
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the byte budget by a single response.
    BytesConsumed(u64),
}
1428
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// Page URL shared by the script-blocking tests.
    const FORUM_PAGE: &str = "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085";

    /// Builds a manager with the settings every test here uses and points it
    /// at `page_url`.
    fn manager_for_page(page_url: &str) -> NetworkManager {
        let mut manager = NetworkManager::new(false, Duration::from_secs(30));
        manager.set_page_url(page_url.to_string());
        manager
    }

    /// The third-party allow-list matches the scripts that must keep working
    /// and does not match the analytics beacon.
    ///
    /// A second test, `test_allowed_matcher_3rd_party_sanity`, used to repeat
    /// these exact assertions; it was removed as a duplicate.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// A script URL that matches no block rule is allowed.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        // Called on the temporary so the test does not depend on whether the
        // method takes `&self` or `&mut self`.
        let blocked = manager_for_page(FORUM_PAGE).should_block_script_blocklist_only(ok);
        assert!(!blocked, "expected non-blocklisted script to be allowed");
    }

    /// A script URL that matches a block rule is blocked.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let bad = "https://cdn.example.net/js/analytics.js";
        let blocked = manager_for_page(FORUM_PAGE).should_block_script_blocklist_only(bad);
        assert!(blocked, "expected analytics.js to be blocklisted");
    }

    /// Dynamic blacklist patterns match URLs containing them and nothing else.
    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = manager_for_page("https://example.com/");

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    /// With strict mode on, a URL on both lists still reports as blacklisted.
    ///
    /// NOTE(review): this checks the two matchers and the flag only; the final
    /// block/allow decision made during request interception is not exercised.
    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = manager_for_page("https://example.com/");

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        assert!(nm.blacklist_strict);
    }

    /// With strict mode off, both matchers still report the URL and the flag
    /// is cleared.
    ///
    /// NOTE(review): as above, the whitelist override itself happens in the
    /// interception path and is not exercised here.
    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = manager_for_page("https://example.com/");

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}