1use super::blockers::{
2 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
3 xhr::IGNORE_XHR_ASSETS,
4};
5use crate::auth::Credentials;
6#[cfg(feature = "_cache")]
7use crate::cache::BasicCachePolicy;
8use crate::cmd::CommandChain;
9use crate::handler::http::HttpRequest;
10use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
11use aho_corasick::AhoCorasick;
12use case_insensitive_string::CaseInsensitiveString;
13use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
14use chromiumoxide_cdp::cdp::browser_protocol::network::{
15 EmulateNetworkConditionsParams, EventLoadingFailed, EventLoadingFinished,
16 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
17 InterceptionId, RequestId, ResourceType, Response, SetCacheDisabledParams,
18 SetExtraHttpHeadersParams,
19};
20use chromiumoxide_cdp::cdp::browser_protocol::{
21 fetch::{
22 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
23 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
24 },
25 network::SetBypassServiceWorkerParams,
26};
27use chromiumoxide_cdp::cdp::browser_protocol::{
28 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
29};
30use chromiumoxide_types::{Command, Method, MethodId};
31use hashbrown::{HashMap, HashSet};
32use lazy_static::lazy_static;
33use reqwest::header::PROXY_AUTHORIZATION;
34use spider_network_blocker::intercept_manager::NetworkInterceptManager;
35pub use spider_network_blocker::scripts::{
36 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
37};
38use std::borrow::Cow;
39use std::collections::VecDeque;
40use std::time::Duration;
41
42lazy_static! {
43 static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
45 "jquery", "angular",
47 "react", "vue", "bootstrap",
50 "d3",
51 "lodash",
52 "ajax",
53 "application",
54 "app", "main",
56 "index",
57 "bundle",
58 "vendor",
59 "runtime",
60 "polyfill",
61 "scripts",
62 "es2015.",
63 "es2020.",
64 "webpack",
65 "captcha",
66 "client",
67 "/cdn-cgi/challenge-platform/",
68 "/wp-content/js/", "https://m.stripe.network/",
71 "https://challenges.cloudflare.com/",
72 "https://www.google.com/recaptcha/enterprise.js",
73 "https://www.google.com/recaptcha/api.js",
74 "https://google.com/recaptcha/api.js",
75 "https://captcha.px-cloud.net/",
76 "https://geo.captcha-delivery.com/",
77 "https://cdn.auth0.com/js/lock/",
78 "https://captcha.gtimg.com",
79 "https://cdn.auth0.com/client",
80 "https://js.stripe.com/",
81 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
84 ];
85
86 pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
91
92 static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
94 "https://m.stripe.network/",
96 "https://challenges.cloudflare.com/",
97 "https://www.google.com/recaptcha/api.js",
98 "https://google.com/recaptcha/api.js",
99 "https://www.google.com/recaptcha/enterprise.js",
100 "https://js.stripe.com/",
101 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
104 "https://ct.captcha-delivery.com/",
105 "https://geo.captcha-delivery.com/",
106 "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://ct.captcha-delivery.com/",
108 "https://cdn.auth0.com/client",
109 "https://captcha.px-cloud.net/",
110 "https://www.gstatic.com/recaptcha/",
111 "https://www.google.com/recaptcha/api2/",
112 "https://www.recaptcha.net/recaptcha/",
113 "https://js.hcaptcha.com/1/api.js",
114 "https://hcaptcha.com/1/api.js",
115 "https://js.datadome.co/tags.js",
116 "https://api-js.datadome.co/",
117 "https://client.perimeterx.net/",
118 "https://captcha.px-cdn.net/",
119 "https://captcha.px-cloud.net/",
120 "https://s.perimeterx.net/",
121 "https://client-api.arkoselabs.com/v2/",
122 "https://static.geetest.com/v4/gt4.js",
123 "https://static.geetest.com/",
124 "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
125 "https://cdn.perfdrive.com/aperture/",
126 "https://assets.queue-it.net/",
127 "discourse-cdn.com/",
128 "/cdn-cgi/challenge-platform/",
129 "/_Incapsula_Resource"
130 ];
131
132 pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
134
135 pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
137 phf::phf_set! {
138 "_astro/", "_app/immutable"
140 }
141 };
142
143 pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
145 "application/pdf",
146 "application/zip",
147 "application/x-rar-compressed",
148 "application/x-tar",
149 "image/png",
150 "image/jpeg",
151 "image/gif",
152 "image/bmp",
153 "image/webp",
154 "image/svg+xml",
155 "video/mp4",
156 "video/x-msvideo",
157 "video/x-matroska",
158 "video/webm",
159 "audio/mpeg",
160 "audio/ogg",
161 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
162 "application/vnd.ms-excel",
163 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
164 "application/vnd.ms-powerpoint",
165 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
166 "application/x-7z-compressed",
167 "application/x-rpm",
168 "application/x-shockwave-flash",
169 "application/rtf",
170 };
171
172 pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
174 "Image",
175 "Media",
176 "Font"
177 };
178
179 pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
181 "CspViolationReport",
182 "Manifest",
183 "Other",
184 "Prefetch",
185 "Ping",
186 };
187
188 pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
190
191 pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
193 let enable = EnableParams::default();
194
195 if let Ok(c) = serde_json::to_value(&enable) {
196 vec![(enable.identifier(), c)]
197 } else {
198 vec![]
199 }
200 };
201
202 pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
204 let enable = EnableParams::default();
205 let mut v = vec![];
206 if let Ok(c) = serde_json::to_value(&enable) {
207 v.push((enable.identifier(), c));
208 }
209 let ignore = SetIgnoreCertificateErrorsParams::new(true);
210 if let Ok(ignored) = serde_json::to_value(&ignore) {
211 v.push((ignore.identifier(), ignored));
212 }
213
214 v
215 };
216
217 pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
219 fetch::EnableParams::builder()
220 .handle_auth_requests(true)
221 .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
222 .build()
223 };
224}
225
/// Returns `true` when `status` is one of the HTTP redirect codes this crate
/// follows: 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];

    REDIRECT_CODES.contains(&status)
}
230
/// Tracks network state for a page: in-flight requests, interception and
/// authentication state, resource blocking switches, and the queue of
/// [`NetworkEvent`]s handed out through `poll`.
#[derive(Debug)]
pub struct NetworkManager {
    /// Outgoing events and CDP commands, drained front-to-back by `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// When set, `init_commands` uses the chain that also ignores certificate errors.
    ignore_httpserrors: bool,
    /// Requests currently tracked, keyed by network request id. Entries are
    /// removed when a response arrives or loading finishes/fails.
    requests: HashMap<RequestId, HttpRequest>,
    /// `requestWillBeSent` events parked until the matching paused-request event arrives.
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra headers last supplied via `set_extra_headers` (proxy authorization removed).
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception ids recorded when the paused-request event arrived before
    /// the matching `requestWillBeSent` event.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Inverse of the value passed to `set_cache_enabled`.
    user_cache_disabled: bool,
    /// Request ids for which credentials were already offered once.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials offered on auth challenges, set by `authenticate`.
    credentials: Option<Credentials>,
    /// Set through `set_request_interception`.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed as blocked.
    block_all: bool,
    /// Whether request interception is considered active at the protocol level.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Last value applied through `set_offline_mode`.
    offline: bool,
    /// Timeout handed to the init command chain.
    pub request_timeout: Duration,
    /// Skip resource types listed in `IGNORE_VISUAL_RESOURCE_MAP`.
    pub ignore_visuals: bool,
    /// Skip stylesheet resources.
    pub block_stylesheets: bool,
    /// Together with `only_html` or `ignore_visuals`, enables the embedded-script check.
    pub block_javascript: bool,
    /// Enables the analytics/tracker blocklists for scripts and XHR.
    pub block_analytics: bool,
    /// See `block_javascript`.
    pub only_html: bool,
    /// Set when the first document URL seen ends in `.xml`; `.xsl` requests
    /// then bypass the visual/stylesheet skip check.
    pub xml_document: bool,
    /// Site-specific interception rules consulted for script, data and document requests.
    pub intercept_manager: NetworkInterceptManager,
    /// Number of times the current document URL was requested again;
    /// requests are skipped once this reaches 3.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request (or the value given to `set_page_url`).
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`.
    pub document_target_domain: String,
    /// Remaining byte budget; when it reaches zero all further requests are blocked.
    pub max_bytes_allowed: Option<u64>,
    /// Key used to look up session cache entries.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// Policy deciding whether a cached entry may be served.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Raw whitelist substrings; `whitelist_matcher` is rebuilt from these.
    whitelist_patterns: Vec<String>,
    /// Compiled whitelist, `None` when there are no patterns or the build failed.
    whitelist_matcher: Option<AhoCorasick>,
    /// Raw blacklist substrings; `blacklist_matcher` is rebuilt from these.
    blacklist_patterns: Vec<String>,
    /// Compiled blacklist, `None` when there are no patterns or the build failed.
    blacklist_matcher: Option<AhoCorasick>,
    /// When `true` a blacklist hit is applied last and cannot be overridden by
    /// the whitelist; when `false` it is applied early and can be.
    blacklist_strict: bool,
}
353
354impl NetworkManager {
355 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
357 Self {
358 queued_events: Default::default(),
359 ignore_httpserrors,
360 requests: Default::default(),
361 requests_will_be_sent: Default::default(),
362 extra_headers: Default::default(),
363 request_id_to_interception_id: Default::default(),
364 user_cache_disabled: false,
365 attempted_authentications: Default::default(),
366 credentials: None,
367 block_all: false,
368 user_request_interception_enabled: false,
369 protocol_request_interception_enabled: false,
370 offline: false,
371 request_timeout,
372 ignore_visuals: false,
373 block_javascript: false,
374 block_stylesheets: false,
375 block_analytics: true,
376 only_html: false,
377 xml_document: false,
378 intercept_manager: NetworkInterceptManager::Unknown,
379 document_reload_tracker: 0,
380 document_target_url: String::new(),
381 document_target_domain: String::new(),
382 whitelist_patterns: Vec::new(),
383 whitelist_matcher: None,
384 blacklist_patterns: Vec::new(),
385 blacklist_matcher: None,
386 blacklist_strict: true,
387 max_bytes_allowed: None,
388 #[cfg(feature = "_cache")]
389 cache_site_key: None,
390 #[cfg(feature = "_cache")]
391 cache_policy: None,
392 }
393 }
394
395 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
397 where
398 I: IntoIterator<Item = S>,
399 S: Into<String>,
400 {
401 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
402 self.rebuild_whitelist_matcher();
403 }
404
405 pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
407 where
408 I: IntoIterator<Item = S>,
409 S: Into<String>,
410 {
411 self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
412 self.rebuild_blacklist_matcher();
413 }
414
415 pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
417 self.blacklist_patterns.push(pattern.into());
418 self.rebuild_blacklist_matcher();
419 }
420
421 pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
423 where
424 I: IntoIterator<Item = S>,
425 S: Into<String>,
426 {
427 self.blacklist_patterns
428 .extend(patterns.into_iter().map(Into::into));
429 self.rebuild_blacklist_matcher();
430 }
431
432 pub fn clear_blacklist(&mut self) {
434 self.blacklist_patterns.clear();
435 self.blacklist_matcher = None;
436 }
437
438 pub fn set_blacklist_strict(&mut self, strict: bool) {
440 self.blacklist_strict = strict;
441 }
442
443 #[inline]
444 fn rebuild_blacklist_matcher(&mut self) {
445 if self.blacklist_patterns.is_empty() {
446 self.blacklist_matcher = None;
447 return;
448 }
449
450 let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
451 self.blacklist_matcher = AhoCorasick::new(refs).ok();
452 }
453
454 #[inline]
455 fn is_blacklisted(&self, url: &str) -> bool {
456 self.blacklist_matcher
457 .as_ref()
458 .map(|m| m.is_match(url))
459 .unwrap_or(false)
460 }
461
462 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
464 self.whitelist_patterns.push(pattern.into());
465 self.rebuild_whitelist_matcher();
466 }
467
468 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
470 where
471 I: IntoIterator<Item = S>,
472 S: Into<String>,
473 {
474 self.whitelist_patterns
475 .extend(patterns.into_iter().map(Into::into));
476 self.rebuild_whitelist_matcher();
477 }
478
479 #[inline]
480 fn rebuild_whitelist_matcher(&mut self) {
481 if self.whitelist_patterns.is_empty() {
482 self.whitelist_matcher = None;
483 return;
484 }
485
486 let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
487
488 self.whitelist_matcher = AhoCorasick::new(refs).ok();
490 }
491
492 #[inline]
493 fn is_whitelisted(&self, url: &str) -> bool {
494 self.whitelist_matcher
495 .as_ref()
496 .map(|m| m.is_match(url))
497 .unwrap_or(false)
498 }
499
500 pub fn init_commands(&self) -> CommandChain {
502 let cmds = if self.ignore_httpserrors {
503 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
504 } else {
505 INIT_CHAIN.clone()
506 };
507 CommandChain::new(cmds, self.request_timeout)
508 }
509
510 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
512 let method = cmd.identifier();
513 if let Ok(params) = serde_json::to_value(cmd) {
514 self.queued_events
515 .push_back(NetworkEvent::SendCdpRequest((method, params)));
516 }
517 }
518
519 pub fn poll(&mut self) -> Option<NetworkEvent> {
521 self.queued_events.pop_front()
522 }
523
524 pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
526 &self.extra_headers
527 }
528
529 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
531 self.extra_headers = headers;
532 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
533 self.extra_headers.remove("Proxy-Authorization");
534 if !self.extra_headers.is_empty() {
535 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
536 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
537 }
538 }
539 }
540
541 pub fn set_service_worker_enabled(&mut self, bypass: bool) {
542 self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
543 }
544
545 pub fn set_block_all(&mut self, block_all: bool) {
546 self.block_all = block_all;
547 }
548
549 pub fn set_request_interception(&mut self, enabled: bool) {
550 self.user_request_interception_enabled = enabled;
551 self.update_protocol_request_interception();
552 }
553
554 pub fn set_cache_enabled(&mut self, enabled: bool) {
555 let run = self.user_cache_disabled != !enabled;
556 self.user_cache_disabled = !enabled;
557 if run {
558 self.update_protocol_cache_disabled();
559 }
560 }
561
562 pub fn enable_request_intercept(&mut self) {
564 self.protocol_request_interception_enabled = true;
565 }
566
567 pub fn disable_request_intercept(&mut self) {
569 self.protocol_request_interception_enabled = false;
570 }
571
/// Sets (or clears) the key used for session cache lookups.
#[cfg(feature = "_cache")]
pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
    self.cache_site_key = cache_site_key;
}
577
/// Sets (or clears) the policy that decides whether cached entries may be served.
#[cfg(feature = "_cache")]
pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
    self.cache_policy = cache_policy;
}
583
584 pub fn update_protocol_cache_disabled(&mut self) {
585 self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
586 }
587
588 pub fn authenticate(&mut self, credentials: Credentials) {
589 self.credentials = Some(credentials);
590 self.update_protocol_request_interception();
591 self.protocol_request_interception_enabled = true;
592 }
593
594 fn update_protocol_request_interception(&mut self) {
595 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
596
597 if enabled == self.protocol_request_interception_enabled {
598 return;
599 }
600
601 if enabled {
602 self.push_cdp_request(ENABLE_FETCH.clone())
603 } else {
604 self.push_cdp_request(DisableParams::default())
605 }
606 }
607
608 #[inline]
611 fn should_block_script_blocklist_only(&self, url: &str) -> bool {
612 let block_analytics = self.block_analytics;
614
615 if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
617 {
618 return true;
619 }
620
621 if crate::handler::blockers::block_websites::block_website(url) {
623 return true;
624 }
625
626 if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
633 let p_slash = Self::strip_query_fragment(path_with_slash);
635 let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
636
637 let base = match p_slash.rsplit('/').next() {
639 Some(b) => b,
640 None => p_slash,
641 };
642
643 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
646 return true;
647 }
648 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
649 return true;
650 }
651 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
652 return true;
653 }
654
655 if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
658 return true;
659 }
660
661 if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
663 return true;
664 }
665 }
666
667 false
668 }
669
/// Returns the part of `url` starting at the first `/` of its path (query and
/// fragment included), or `None` when the URL has no `//` authority marker or
/// no path.
///
/// The authority ends at the first `/`, `?` or `#`, so a slash that only
/// appears inside the query or fragment (`https://host?next=/x.js`) is not
/// mistaken for the start of the path.
#[inline]
fn url_path_with_leading_slash(url: &str) -> Option<&str> {
    let authority_start = url.find("//")? + 2;
    let after_scheme = &url[authority_start..];

    let authority_end = after_scheme.find(|c: char| matches!(c, '/' | '?' | '#'))?;
    let tail = &after_scheme[authority_end..];

    if tail.starts_with('/') {
        Some(tail)
    } else {
        // Authority is followed directly by a query or fragment: no path.
        None
    }
}
690
/// Returns `s` truncated at the first `?` or `#`, whichever comes first.
/// The input is returned unchanged when it contains neither.
#[inline]
fn strip_query_fragment(s: &str) -> &str {
    let cut = s
        .find(|c: char| c == '?' || c == '#')
        .unwrap_or(s.len());

    &s[..cut]
}
707
708 #[inline]
710 fn skip_xhr(
711 &self,
712 skip_networking: bool,
713 event: &EventRequestPaused,
714 network_event: bool,
715 ) -> bool {
716 if !skip_networking && network_event {
718 let request_url = event.request.url.as_str();
719
720 let skip_analytics =
722 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
723
724 if skip_analytics {
725 true
726 } else if self.block_stylesheets || self.ignore_visuals {
727 let block_css = self.block_stylesheets;
728 let block_media = self.ignore_visuals;
729
730 let mut block_request = false;
731
732 if let Some(position) = request_url.rfind('.') {
733 let hlen = request_url.len();
734 let has_asset = hlen - position;
735
736 if has_asset >= 3 {
737 let next_position = position + 1;
738
739 if block_media
740 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
741 &request_url[next_position..].into(),
742 )
743 {
744 block_request = true;
745 } else if block_css {
746 block_request =
747 CaseInsensitiveString::from(request_url[next_position..].as_bytes())
748 .contains(&**CSS_EXTENSION)
749 }
750 }
751 }
752
753 if !block_request {
754 block_request = ignore_script_xhr_media(request_url);
755 }
756
757 block_request
758 } else {
759 skip_networking
760 }
761 } else {
762 skip_networking
763 }
764 }
765
/// Ad detection used when the `adblock` feature is enabled.
///
/// Keeps an existing skip decision; otherwise blocks when the URL is on the
/// built-in ad list or the adblock engine matches the request.
#[cfg(feature = "adblock")]
#[inline]
fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
    // Imported locally, mirroring the non-`adblock` variant, so this build
    // configuration does not depend on a file-level import of `block_ads`.
    use crate::handler::blockers::block_websites::block_ads;

    skip_networking || block_ads(&event.request.url) || self.detect_ad(event)
}
776
777 #[cfg(not(feature = "adblock"))]
779 #[inline]
780 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
781 use crate::handler::blockers::block_websites::block_ads;
782 if skip_networking {
783 true
784 } else {
785 block_ads(&event.request.url)
786 }
787 }
788
789 #[inline]
790 fn fail_request_blocked(
792 &mut self,
793 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
794 ) {
795 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
796 request_id.clone(),
797 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
798 );
799 self.push_cdp_request(params);
800 }
801
802 #[inline]
803 fn fulfill_request_empty_200(
805 &mut self,
806 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
807 ) {
808 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
809 request_id.clone(),
810 200,
811 );
812 self.push_cdp_request(params);
813 }
814
/// Queues a `Fetch.fulfillRequest` that answers `request_id` from cached data.
///
/// `body` is base64-encoded (standard alphabet) and `headers` are copied into
/// the response header list.
#[cfg(feature = "_cache")]
#[inline]
fn fulfill_request_from_cache(
    &mut self,
    request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
    body: &[u8],
    headers: &std::collections::HashMap<String, String>,
    status: i64,
) {
    use crate::cdp::browser_protocol::fetch::HeaderEntry;
    use crate::handler::network::fetch::FulfillRequestParams;
    use base64::Engine;

    let response_headers: Vec<HeaderEntry> = headers
        .iter()
        .map(|(name, value)| HeaderEntry {
            name: name.clone().into(),
            value: value.clone().into(),
        })
        .collect();

    let encoded_body = base64::engine::general_purpose::STANDARD.encode(body);

    let mut params = FulfillRequestParams::new(request_id.clone(), status);
    params.body = Some(encoded_body.into());
    params.response_headers = Some(response_headers);

    self.push_cdp_request(params);
}
853
854 #[inline]
855 fn continue_request_with_url(
857 &mut self,
858 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
859 url: Option<&str>,
860 intercept_response: bool,
861 ) {
862 let mut params = ContinueRequestParams::new(request_id.clone());
863 if let Some(url) = url {
864 params.url = Some(url.to_string());
865 params.intercept_response = Some(intercept_response);
866 }
867 self.push_cdp_request(params);
868 }
869
870 #[inline]
872 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
873 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
874 return;
875 }
876
877 let resource_type = &event.resource_type;
878
879 if self.block_all {
880 tracing::debug!(
881 "Blocked (block_all): {:?} - {}",
882 event.resource_type,
883 event.request.url
884 );
885 return self.fail_request_blocked(&event.request_id);
886 }
887
888 if let Some(network_id) = event.network_id.as_ref() {
889 if let Some(request_will_be_sent) =
890 self.requests_will_be_sent.remove(network_id.as_ref())
891 {
892 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
893 } else {
894 self.request_id_to_interception_id
895 .insert(network_id.clone(), event.request_id.clone().into());
896 }
897 }
898
899 let javascript_resource = *resource_type == ResourceType::Script;
901 let document_resource = *resource_type == ResourceType::Document;
902 let network_resource = !document_resource && crate::utils::is_data_resource(resource_type);
903
904 let mut skip_networking =
906 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(resource_type.as_ref());
907
908 if !skip_networking {
910 skip_networking = self.document_reload_tracker >= 3;
911 }
912
913 let (current_url_cow, had_replacer) =
915 self.handle_document_replacement_and_tracking(event, document_resource);
916
917 let current_url: &str = current_url_cow.as_ref();
918
919 let blacklisted = self.is_blacklisted(current_url);
920
921 if !self.blacklist_strict && blacklisted {
922 skip_networking = true;
923 }
924
925 if !skip_networking {
926 if self.xml_document && current_url.ends_with(".xsl") {
928 skip_networking = false;
929 } else {
930 skip_networking = self.should_skip_for_visuals_and_basic(resource_type);
931 }
932 }
933
934 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
935
936 if !skip_networking
938 && self.block_javascript
939 && (self.only_html || self.ignore_visuals)
940 && (javascript_resource || document_resource)
941 {
942 skip_networking = ignore_script_embedded(current_url);
943 }
944
945 if !skip_networking && javascript_resource {
948 skip_networking = self.should_block_script_blocklist_only(current_url);
949 }
950
951 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
953
954 if !skip_networking && (javascript_resource || network_resource || document_resource) {
956 skip_networking = self.intercept_manager.intercept_detection(
957 current_url,
958 self.ignore_visuals,
959 network_resource,
960 );
961 }
962
963 if !skip_networking && (javascript_resource || network_resource) {
965 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
966 }
967
968 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
971 {
972 skip_networking = false;
973 }
974
975 if skip_networking && self.is_whitelisted(current_url) {
977 skip_networking = false;
978 }
979
980 if self.blacklist_strict && blacklisted {
981 skip_networking = true;
982 }
983
984 if skip_networking {
985 tracing::debug!("Blocked: {:?} - {}", resource_type, current_url);
986 self.fulfill_request_empty_200(&event.request_id);
987 } else {
988 #[cfg(feature = "_cache")]
989 {
990 if let (Some(policy), Some(cache_site_key)) =
991 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
992 {
993 let current_url = format!("{}:{}", event.request.method, ¤t_url);
994
995 if let Some((res, cache_policy)) =
996 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
997 {
998 if policy.allows_cached(&cache_policy) {
999 tracing::debug!(
1000 "Remote Cached: {:?} - {}",
1001 resource_type,
1002 ¤t_url
1003 );
1004 return self.fulfill_request_from_cache(
1005 &event.request_id,
1006 &res.body,
1007 &res.headers,
1008 res.status as i64,
1009 );
1010 }
1011 }
1012 }
1013 }
1014
1015 tracing::debug!("Allowed: {:?} - {}", resource_type, current_url);
1017 self.continue_request_with_url(
1018 &event.request_id,
1019 if had_replacer {
1020 Some(current_url)
1021 } else {
1022 None
1023 },
1024 !had_replacer,
1025 );
1026 }
1027 }
1028
1029 #[inline]
1035 fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1036 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1037 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1038 }
1039
1040 pub fn has_target_domain(&self) -> bool {
1042 !self.document_target_url.is_empty()
1043 }
1044
1045 pub fn set_page_url(&mut self, page_target_url: String) {
1047 let host_base = host_and_rest(&page_target_url)
1048 .map(|(h, _)| base_domain_from_host(h))
1049 .unwrap_or("");
1050
1051 self.document_target_domain = host_base.to_string();
1052 self.document_target_url = page_target_url;
1053 }
1054
1055 pub fn clear_target_domain(&mut self) {
1057 self.document_reload_tracker = 0;
1058 self.document_target_url = Default::default();
1059 self.document_target_domain = Default::default();
1060 }
1061
1062 #[inline]
1070 fn handle_document_replacement_and_tracking<'a>(
1071 &mut self,
1072 event: &'a EventRequestPaused,
1073 document_resource: bool,
1074 ) -> (Cow<'a, str>, bool) {
1075 let mut replacer: Option<String> = None;
1076 let current_url = event.request.url.as_str();
1077
1078 if document_resource {
1079 if self.document_target_url == current_url {
1080 self.document_reload_tracker += 1;
1081 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1082 {
1083 let (http_document_replacement, mut https_document_replacement) =
1084 if self.document_target_url.starts_with("http://") {
1085 (
1086 self.document_target_url.replacen("http://", "http//", 1),
1087 self.document_target_url.replacen("http://", "https://", 1),
1088 )
1089 } else {
1090 (
1091 self.document_target_url.replacen("https://", "https//", 1),
1092 self.document_target_url.replacen("https://", "http://", 1),
1093 )
1094 };
1095
1096 let trailing = https_document_replacement.ends_with('/');
1098 if trailing {
1099 https_document_replacement.pop();
1100 }
1101 if https_document_replacement.ends_with('/') {
1102 https_document_replacement.pop();
1103 }
1104
1105 let redirect_mask = format!(
1106 "{}{}",
1107 https_document_replacement, http_document_replacement
1108 );
1109
1110 if current_url == redirect_mask {
1111 replacer = Some(if trailing {
1112 format!("{}/", https_document_replacement)
1113 } else {
1114 https_document_replacement
1115 });
1116 }
1117 }
1118
1119 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1120 self.xml_document = true;
1121 }
1122
1123 self.document_target_url = event.request.url.clone();
1125 self.document_target_domain = host_and_rest(&self.document_target_url)
1126 .map(|(h, _)| base_domain_from_host(h).to_string())
1127 .unwrap_or_default();
1128 }
1129
1130 let current_url_cow = match replacer {
1131 Some(r) => Cow::Owned(r),
1132 None => Cow::Borrowed(event.request.url.as_str()),
1133 };
1134
1135 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1136 (current_url_cow, had_replacer)
1137 }
1138
/// Runs the request through the adblock engine.
///
/// Only image, media, stylesheet, document, fetch and XHR requests are
/// checked; every other resource type returns `false`. The engine is built
/// once, on first use, from the bundled adblock patterns.
#[cfg(feature = "adblock")]
pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
    use adblock::{
        lists::{FilterSet, ParseOptions, RuleTypes},
        Engine,
    };

    lazy_static::lazy_static! {
        static ref AD_ENGINE: Engine = {
            let mut filter_set = FilterSet::new(false);
            let mut rules = ParseOptions::default();
            rules.rule_types = RuleTypes::All;

            filter_set.add_filters(
                &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
                rules,
            );

            Engine::from_filter_set(filter_set, true)
        };
    };

    let kind = &event.resource_type;
    let blockable = *kind == ResourceType::Image
        || *kind == ResourceType::Media
        || *kind == ResourceType::Stylesheet
        || *kind == ResourceType::Document
        || *kind == ResourceType::Fetch
        || *kind == ResourceType::Xhr;

    if !blockable {
        return false;
    }

    let u = &event.request.url;

    // Source and request hostnames are fixed placeholders; third-party status
    // comes from the event's same-site flag.
    let request = adblock::request::Request::preparsed(
        &u,
        "example.com",
        "example.com",
        &event.resource_type.as_ref().to_lowercase(),
        !event.request.is_same_site.unwrap_or_default(),
    );

    AD_ENGINE.check_network_request(&request).matched
}
1186
1187 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1188 let response = if self
1189 .attempted_authentications
1190 .contains(event.request_id.as_ref())
1191 {
1192 AuthChallengeResponseResponse::CancelAuth
1193 } else if self.credentials.is_some() {
1194 self.attempted_authentications
1195 .insert(event.request_id.clone().into());
1196 AuthChallengeResponseResponse::ProvideCredentials
1197 } else {
1198 AuthChallengeResponseResponse::Default
1199 };
1200
1201 let mut auth = AuthChallengeResponse::new(response);
1202 if let Some(creds) = self.credentials.clone() {
1203 auth.username = Some(creds.username);
1204 auth.password = Some(creds.password);
1205 }
1206 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1207 }
1208
1209 pub fn set_offline_mode(&mut self, value: bool) {
1211 if self.offline == value {
1212 return;
1213 }
1214 self.offline = value;
1215 if let Ok(network) = EmulateNetworkConditionsParams::builder()
1216 .offline(self.offline)
1217 .latency(0)
1218 .download_throughput(-1.)
1219 .upload_throughput(-1.)
1220 .build()
1221 {
1222 self.push_cdp_request(network);
1223 }
1224 }
1225
1226 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1228 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1229 if let Some(interception_id) = self
1230 .request_id_to_interception_id
1231 .remove(event.request_id.as_ref())
1232 {
1233 self.on_request(event, Some(interception_id));
1234 } else {
1235 self.requests_will_be_sent
1237 .insert(event.request_id.clone(), event.clone());
1238 }
1239 } else {
1240 self.on_request(event, None);
1241 }
1242 }
1243
1244 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1246 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1247 request.from_memory_cache = true;
1248 }
1249 }
1250
1251 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1253 let mut request_failed = false;
1254
1255 let mut deducted: u64 = 0;
1257
1258 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1259 let before = *max_bytes;
1260
1261 let received_bytes: u64 = event.response.encoded_data_length as u64;
1263
1264 let content_length: Option<u64> = event
1266 .response
1267 .headers
1268 .inner()
1269 .get("content-length")
1270 .and_then(|v| v.as_str())
1271 .and_then(|s| s.trim().parse::<u64>().ok());
1272
1273 *max_bytes = max_bytes.saturating_sub(received_bytes);
1275
1276 if let Some(cl) = content_length {
1278 if cl > *max_bytes {
1279 *max_bytes = 0;
1280 }
1281 }
1282
1283 request_failed = *max_bytes == 0;
1284
1285 deducted = before.saturating_sub(*max_bytes);
1287 }
1288
1289 if deducted > 0 {
1291 self.queued_events
1292 .push_back(NetworkEvent::BytesConsumed(deducted));
1293 }
1294
1295 if request_failed && self.max_bytes_allowed.is_some() {
1297 self.set_block_all(true);
1298 }
1299
1300 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1301 request.set_response(event.response.clone());
1302 self.queued_events.push_back(if request_failed {
1303 NetworkEvent::RequestFailed(request)
1304 } else {
1305 NetworkEvent::RequestFinished(request)
1306 });
1307 }
1308 }
1309
1310 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1312 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1313 if let Some(interception_id) = request.interception_id.as_ref() {
1314 self.attempted_authentications
1315 .remove(interception_id.as_ref());
1316 }
1317 self.queued_events
1318 .push_back(NetworkEvent::RequestFinished(request));
1319 }
1320 }
1321
1322 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1324 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1325 request.failure_text = Some(event.error_text.clone());
1326 if let Some(interception_id) = request.interception_id.as_ref() {
1327 self.attempted_authentications
1328 .remove(interception_id.as_ref());
1329 }
1330 self.queued_events
1331 .push_back(NetworkEvent::RequestFailed(request));
1332 }
1333 }
1334
1335 fn on_request(
1337 &mut self,
1338 event: &EventRequestWillBeSent,
1339 interception_id: Option<InterceptionId>,
1340 ) {
1341 let mut redirect_chain = Vec::new();
1342 let mut redirect_location = None;
1343
1344 if let Some(redirect_resp) = &event.redirect_response {
1345 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1346 if is_redirect_status(redirect_resp.status) {
1347 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1348 if redirect_resp.url != location {
1349 let fixed_location = location.replace(&redirect_resp.url, "");
1350
1351 if !fixed_location.is_empty() {
1352 request.response.as_mut().map(|resp| {
1353 resp.headers.0["Location"] =
1354 serde_json::Value::String(fixed_location.clone());
1355 });
1356 }
1357
1358 redirect_location = Some(fixed_location);
1359 }
1360 }
1361 }
1362
1363 self.handle_request_redirect(
1364 &mut request,
1365 if let Some(redirect_location) = redirect_location {
1366 let mut redirect_resp = redirect_resp.clone();
1367
1368 if !redirect_location.is_empty() {
1369 redirect_resp.headers.0["Location"] =
1370 serde_json::Value::String(redirect_location);
1371 }
1372
1373 redirect_resp
1374 } else {
1375 redirect_resp.clone()
1376 },
1377 );
1378
1379 redirect_chain = std::mem::take(&mut request.redirect_chain);
1380 redirect_chain.push(request);
1381 }
1382 }
1383
1384 let request = HttpRequest::new(
1385 event.request_id.clone(),
1386 event.frame_id.clone(),
1387 interception_id,
1388 self.user_request_interception_enabled,
1389 redirect_chain,
1390 );
1391
1392 self.requests.insert(event.request_id.clone(), request);
1393 self.queued_events
1394 .push_back(NetworkEvent::Request(event.request_id.clone()));
1395 }
1396
1397 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1399 request.set_response(response);
1400 if let Some(interception_id) = request.interception_id.as_ref() {
1401 self.attempted_authentications
1402 .remove(interception_id.as_ref());
1403 }
1404 }
1405}
1406
/// Events queued by the network manager (via `queued_events`).
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP method id paired with a JSON value.
    /// NOTE(review): presumably the serialized command parameters to send to
    /// the browser — the producer is not in this part of the file; confirm.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A new request started being tracked under this id.
    Request(RequestId),
    /// NOTE(review): presumably signals a response for this request id — the
    /// emitter is not in this part of the file; confirm.
    Response(RequestId),
    /// The request failed: either loading failed, or the response arrived
    /// after the byte budget was exhausted.
    RequestFailed(HttpRequest),
    /// The request completed (response received or loading finished).
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the byte budget by a response.
    BytesConsumed(u64),
}
1422
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// The third-party allow-list matches known-good scripts (Cloudflare
    /// challenge, Stripe, reCAPTCHA, jQuery CDN) and does not match the
    /// Cloudflare Insights beacon.
    ///
    /// A second test with an identical body (`..._sanity`) used to live in this
    /// module; it was removed because it added no coverage.
    #[test]
    fn test_allowed_matcher_3rd_party() {
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://js.stripe.com/v3/"));
        assert!(ALLOWED_MATCHER_3RD_PARTY
            .is_match("https://www.google.com/recaptcha/api.js?render=explicit"));
        assert!(ALLOWED_MATCHER_3RD_PARTY.is_match("https://code.jquery.com/jquery-3.7.1.min.js"));
    }

    /// A script URL that is not blocklisted is not blocked.
    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        assert!(
            !nm.should_block_script_blocklist_only(ok),
            "expected non-blocklisted script to be allowed"
        );
    }

    /// A script URL expected to be on the blocklist (`analytics.js`) is blocked.
    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(
            "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085".to_string(),
        );

        let bad = "https://cdn.example.net/js/analytics.js";
        assert!(
            nm.should_block_script_blocklist_only(bad),
            "expected analytics.js to be blocklisted"
        );
    }

    /// Runtime blacklist patterns match the URLs containing them and leave
    /// unrelated URLs alone.
    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    /// With the same pattern on both lists and strict mode on, the URL matches
    /// both lists and the strict flag is set.
    ///
    /// This only checks configuration state; it does not exercise the blocking
    /// decision itself.
    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(true);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_whitelisted(u));
        assert!(nm.is_blacklisted(u));

        assert!(nm.blacklist_strict);
    }

    /// With the same pattern on both lists and strict mode off, the URL still
    /// matches both lists and the strict flag is cleared.
    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url("https://example.com/".to_string());

        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);

        nm.set_blacklist_strict(false);

        let u = "https://static.cloudflareinsights.com/beacon.min.js";
        assert!(nm.is_blacklisted(u));
        assert!(nm.is_whitelisted(u));
        assert!(!nm.blacklist_strict);
    }
}