1#[cfg(any(feature = "adblock", feature = "firewall"))]
2use super::blockers::block_websites::block_ads;
3use super::blockers::{
4 block_websites::block_xhr, ignore_script_embedded, ignore_script_xhr, ignore_script_xhr_media,
5 xhr::IGNORE_XHR_ASSETS,
6};
7use crate::auth::Credentials;
8#[cfg(feature = "_cache")]
9use crate::cache::BasicCachePolicy;
10use crate::cmd::CommandChain;
11use crate::handler::http::HttpRequest;
12use crate::handler::network_utils::{base_domain_from_host, host_and_rest};
13use aho_corasick::AhoCorasick;
14use case_insensitive_string::CaseInsensitiveString;
15use chromiumoxide_cdp::cdp::browser_protocol::fetch::{RequestPattern, RequestStage};
16use chromiumoxide_cdp::cdp::browser_protocol::network::{
17 EmulateNetworkConditionsByRuleParams, EventLoadingFailed, EventLoadingFinished,
18 EventRequestServedFromCache, EventRequestWillBeSent, EventResponseReceived, Headers,
19 InterceptionId, NetworkConditions, RequestId, ResourceType, Response, SetCacheDisabledParams,
20 SetExtraHttpHeadersParams,
21};
22use chromiumoxide_cdp::cdp::browser_protocol::{
23 fetch::{
24 self, AuthChallengeResponse, AuthChallengeResponseResponse, ContinueRequestParams,
25 ContinueWithAuthParams, DisableParams, EventAuthRequired, EventRequestPaused,
26 },
27 network::SetBypassServiceWorkerParams,
28};
29use chromiumoxide_cdp::cdp::browser_protocol::{
30 network::EnableParams, security::SetIgnoreCertificateErrorsParams,
31};
32use chromiumoxide_types::{Command, Method, MethodId};
33use hashbrown::{HashMap, HashSet};
34use lazy_static::lazy_static;
35use reqwest::header::PROXY_AUTHORIZATION;
36use spider_network_blocker::intercept_manager::NetworkInterceptManager;
37pub use spider_network_blocker::scripts::{
38 URL_IGNORE_SCRIPT_BASE_PATHS, URL_IGNORE_SCRIPT_STYLES_PATHS, URL_IGNORE_TRIE_PATHS,
39};
40use std::borrow::Cow;
41use std::collections::VecDeque;
42use std::time::Duration;
43
44lazy_static! {
45 static ref JS_FRAMEWORK_ALLOW: Vec<&'static str> = vec![
47 "jquery", "angular",
49 "react", "vue", "bootstrap",
52 "d3",
53 "lodash",
54 "ajax",
55 "application",
56 "app", "main",
58 "index",
59 "bundle",
60 "vendor",
61 "runtime",
62 "polyfill",
63 "scripts",
64 "es2015.",
65 "es2020.",
66 "webpack",
67 "captcha",
68 "client",
69 "/cdn-cgi/challenge-platform/",
70 "/wp-content/js/", "https://m.stripe.network/",
73 "https://challenges.cloudflare.com/",
74 "https://www.google.com/recaptcha/",
75 "https://google.com/recaptcha/api.js",
76 "https://www.gstatic.com/recaptcha/",
77 "https://captcha.px-cloud.net/",
78 "https://geo.captcha-delivery.com/",
79 "https://api.leminnow.com/captcha/",
80 "https://cdn.auth0.com/js/lock/",
81 "https://captcha.gtimg.com",
82 "https://client-api.arkoselabs.com/",
83 "https://www.capy.me/puzzle/",
84 "https://newassets.hcaptcha.com/",
85 "https://cdn.auth0.com/client",
86 "https://js.stripe.com/",
87 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-"
90 ];
91
92 pub static ref ALLOWED_MATCHER: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW.iter()).expect("matcher to build");
97
98 static ref JS_FRAMEWORK_ALLOW_3RD_PARTY: Vec<&'static str> = vec![
100 "https://m.stripe.network/",
102 "https://challenges.cloudflare.com/",
103 "https://js.stripe.com/",
104 "https://cdn.prod.website-files.com/", "https://cdnjs.cloudflare.com/", "https://code.jquery.com/jquery-",
107 "https://ct.captcha-delivery.com/",
108 "https://geo.captcha-delivery.com/",
109 "https://img1.wsimg.com/parking-lander/static/js/main.d9ebbb8c.js", "https://cdn.auth0.com/client",
111 "https://captcha.px-cloud.net/",
112 "https://www.capy.me/puzzle/",
113 "https://www.gstatic.com/recaptcha/",
114 "https://google.com/recaptcha/",
115 "https://www.google.com/recaptcha/",
116 "https://www.recaptcha.net/recaptcha/",
117 "https://js.hcaptcha.com/1/api.js",
118 "https://hcaptcha.com/1/api.js",
119 "https://js.datadome.co/tags.js",
120 "https://api-js.datadome.co/",
121 "https://client.perimeterx.net/",
122 "https://captcha.px-cdn.net/",
123 "https://newassets.hcaptcha.com/",
124 "https://captcha.px-cloud.net/",
125 "https://s.perimeterx.net/",
126 "https://api.leminnow.com/captcha/",
127 "https://client-api.arkoselabs.com/",
128 "https://static.geetest.com/v4/gt4.js",
129 "https://static.geetest.com/",
130 "https://cdn.jsdelivr.net/npm/@friendlycaptcha/",
131 "https://cdn.perfdrive.com/aperture/",
132 "https://assets.queue-it.net/",
133 "discourse-cdn.com/",
134 "hcaptcha.com",
135 "/cdn-cgi/challenge-platform/",
136 "/_Incapsula_Resource"
137 ];
138
139 pub static ref ALLOWED_MATCHER_3RD_PARTY: AhoCorasick = AhoCorasick::new(JS_FRAMEWORK_ALLOW_3RD_PARTY.iter()).expect("matcher to build");
141
142 pub static ref JS_FRAMEWORK_PATH: phf::Set<&'static str> = {
144 phf::phf_set! {
145 "_astro/", "_app/immutable"
147 }
148 };
149
150 pub static ref IGNORE_CONTENT_TYPES: phf::Set<&'static str> = phf::phf_set! {
152 "application/pdf",
153 "application/zip",
154 "application/x-rar-compressed",
155 "application/x-tar",
156 "image/png",
157 "image/jpeg",
158 "image/gif",
159 "image/bmp",
160 "image/webp",
161 "image/svg+xml",
162 "video/mp4",
163 "video/x-msvideo",
164 "video/x-matroska",
165 "video/webm",
166 "audio/mpeg",
167 "audio/ogg",
168 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
169 "application/vnd.ms-excel",
170 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
171 "application/vnd.ms-powerpoint",
172 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
173 "application/x-7z-compressed",
174 "application/x-rpm",
175 "application/x-shockwave-flash",
176 "application/rtf",
177 };
178
179 pub static ref IGNORE_VISUAL_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
181 "Image",
182 "Media",
183 "Font"
184 };
185
186 pub static ref IGNORE_NETWORKING_RESOURCE_MAP: phf::Set<&'static str> = phf::phf_set! {
188 "CspViolationReport",
189 "Ping",
190 };
191
192 pub static ref CSS_EXTENSION: CaseInsensitiveString = CaseInsensitiveString::from("css");
194
195 pub static ref INIT_CHAIN: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
197 let enable = EnableParams::default();
198
199 if let Ok(c) = serde_json::to_value(&enable) {
200 vec![(enable.identifier(), c)]
201 } else {
202 vec![]
203 }
204 };
205
206 pub static ref INIT_CHAIN_IGNORE_HTTP_ERRORS: Vec<(std::borrow::Cow<'static, str>, serde_json::Value)> = {
208 let enable = EnableParams::default();
209 let mut v = vec![];
210 if let Ok(c) = serde_json::to_value(&enable) {
211 v.push((enable.identifier(), c));
212 }
213 let ignore = SetIgnoreCertificateErrorsParams::new(true);
214 if let Ok(ignored) = serde_json::to_value(&ignore) {
215 v.push((ignore.identifier(), ignored));
216 }
217
218 v
219 };
220
221 pub static ref ENABLE_FETCH: chromiumoxide_cdp::cdp::browser_protocol::fetch::EnableParams = {
223 fetch::EnableParams::builder()
224 .handle_auth_requests(true)
225 .pattern(RequestPattern::builder().url_pattern("*").request_stage(RequestStage::Request).build())
226 .build()
227 };
228}
229
/// Returns `true` when `status` is one of the redirect status codes
/// 301, 302, 303, 307 or 308.
pub(crate) fn is_redirect_status(status: i64) -> bool {
    const REDIRECT_CODES: [i64; 5] = [301, 302, 303, 307, 308];
    REDIRECT_CODES.contains(&status)
}
234
/// Tracks network state for a target and queues the CDP commands/events
/// produced while handling network and fetch-interception events.
#[derive(Debug)]
pub struct NetworkManager {
    /// Events and CDP commands waiting to be drained through `poll`.
    queued_events: VecDeque<NetworkEvent>,
    /// When set, the init command chain also tells the browser to ignore
    /// certificate errors.
    ignore_httpserrors: bool,
    /// Requests currently being tracked, keyed by network request id.
    requests: HashMap<RequestId, HttpRequest>,
    /// `requestWillBeSent` events held back until the matching paused-request
    /// event arrives.
    requests_will_be_sent: HashMap<RequestId, EventRequestWillBeSent>,
    /// Extra HTTP headers applied through `set_extra_headers`.
    extra_headers: std::collections::HashMap<String, String>,
    /// Interception ids of paused requests whose `requestWillBeSent` event has
    /// not been seen yet.
    request_id_to_interception_id: HashMap<RequestId, InterceptionId>,
    /// Whether the user asked for the browser cache to be disabled.
    user_cache_disabled: bool,
    /// Request ids for which credentials were already provided once.
    attempted_authentications: HashSet<RequestId>,
    /// Credentials used to answer auth challenges, if any.
    credentials: Option<Credentials>,
    /// Set through `set_request_interception`; when this and
    /// `protocol_request_interception_enabled` are both set, paused requests
    /// are not handled by this manager.
    pub(crate) user_request_interception_enabled: bool,
    /// When set, every paused request is failed as blocked.
    block_all: bool,
    /// Whether interception is considered active at the protocol level.
    pub(crate) protocol_request_interception_enabled: bool,
    /// Last offline-mode value applied through `set_offline_mode`.
    offline: bool,
    /// Timeout handed to the init command chain.
    pub request_timeout: Duration,
    /// Skip visual resources (see `IGNORE_VISUAL_RESOURCE_MAP`).
    pub ignore_visuals: bool,
    /// Skip stylesheet resources.
    pub block_stylesheets: bool,
    /// Enables the embedded-script check (together with `only_html` or
    /// `ignore_visuals`).
    pub block_javascript: bool,
    /// Enables the analytics/tracker blocklist checks.
    pub block_analytics: bool,
    /// Controls how `Prefetch` requests are treated by the interception handler.
    pub block_prefetch: bool,
    /// Participates in the embedded-script check.
    /// NOTE(review): other uses are not visible in this part of the file.
    pub only_html: bool,
    /// Set when the first document URL ends with `.xml`; `.xsl` resources are
    /// then exempt from the visual/stylesheet skip.
    pub xml_document: bool,
    /// Site-specific interception rules.
    pub intercept_manager: NetworkInterceptManager,
    /// Counts document requests whose URL equals the current target URL.
    pub document_reload_tracker: u8,
    /// URL of the most recent document request (or the URL set via `set_page_url`).
    pub document_target_url: String,
    /// Base domain derived from `document_target_url`.
    pub document_target_domain: String,
    /// Remaining byte budget; `None` means no limit is tracked.
    pub max_bytes_allowed: Option<u64>,
    /// Key used to look up remotely cached responses.
    #[cfg(feature = "_cache")]
    pub cache_site_key: Option<String>,
    /// Policy deciding whether a cached response may be served.
    #[cfg(feature = "_cache")]
    pub cache_policy: Option<BasicCachePolicy>,
    /// Raw whitelist substrings; the source for `whitelist_matcher`.
    whitelist_patterns: Vec<String>,
    /// Matcher compiled from `whitelist_patterns`; `None` when empty or when
    /// compilation failed.
    whitelist_matcher: Option<AhoCorasick>,
    /// Raw blacklist substrings; the source for `blacklist_matcher`.
    blacklist_patterns: Vec<String>,
    /// Matcher compiled from `blacklist_patterns`; `None` when empty or when
    /// compilation failed.
    blacklist_matcher: Option<AhoCorasick>,
    /// When true, a blacklist hit overrides every allow rule (including the
    /// whitelist); when false, allow rules can still un-block the request.
    blacklist_strict: bool,
}
359
360impl NetworkManager {
361 pub fn new(ignore_httpserrors: bool, request_timeout: Duration) -> Self {
363 Self {
364 queued_events: Default::default(),
365 ignore_httpserrors,
366 requests: Default::default(),
367 requests_will_be_sent: Default::default(),
368 extra_headers: Default::default(),
369 request_id_to_interception_id: Default::default(),
370 user_cache_disabled: false,
371 attempted_authentications: Default::default(),
372 credentials: None,
373 block_all: false,
374 user_request_interception_enabled: false,
375 protocol_request_interception_enabled: false,
376 offline: false,
377 request_timeout,
378 ignore_visuals: false,
379 block_javascript: false,
380 block_stylesheets: false,
381 block_prefetch: true,
382 block_analytics: true,
383 only_html: false,
384 xml_document: false,
385 intercept_manager: NetworkInterceptManager::Unknown,
386 document_reload_tracker: 0,
387 document_target_url: String::new(),
388 document_target_domain: String::new(),
389 whitelist_patterns: Vec::new(),
390 whitelist_matcher: None,
391 blacklist_patterns: Vec::new(),
392 blacklist_matcher: None,
393 blacklist_strict: true,
394 max_bytes_allowed: None,
395 #[cfg(feature = "_cache")]
396 cache_site_key: None,
397 #[cfg(feature = "_cache")]
398 cache_policy: None,
399 }
400 }
401
402 pub fn set_whitelist_patterns<I, S>(&mut self, patterns: I)
404 where
405 I: IntoIterator<Item = S>,
406 S: Into<String>,
407 {
408 self.whitelist_patterns = patterns.into_iter().map(Into::into).collect();
409 self.rebuild_whitelist_matcher();
410 }
411
412 pub fn set_blacklist_patterns<I, S>(&mut self, patterns: I)
414 where
415 I: IntoIterator<Item = S>,
416 S: Into<String>,
417 {
418 self.blacklist_patterns = patterns.into_iter().map(Into::into).collect();
419 self.rebuild_blacklist_matcher();
420 }
421
422 pub fn add_blacklist_pattern<S: Into<String>>(&mut self, pattern: S) {
424 self.blacklist_patterns.push(pattern.into());
425 self.rebuild_blacklist_matcher();
426 }
427
428 pub fn add_blacklist_patterns<I, S>(&mut self, patterns: I)
430 where
431 I: IntoIterator<Item = S>,
432 S: Into<String>,
433 {
434 self.blacklist_patterns
435 .extend(patterns.into_iter().map(Into::into));
436 self.rebuild_blacklist_matcher();
437 }
438
439 pub fn clear_blacklist(&mut self) {
441 self.blacklist_patterns.clear();
442 self.blacklist_matcher = None;
443 }
444
445 pub fn set_blacklist_strict(&mut self, strict: bool) {
447 self.blacklist_strict = strict;
448 }
449
450 #[inline]
451 fn rebuild_blacklist_matcher(&mut self) {
452 if self.blacklist_patterns.is_empty() {
453 self.blacklist_matcher = None;
454 return;
455 }
456
457 let refs: Vec<&str> = self.blacklist_patterns.iter().map(|s| s.as_str()).collect();
458 self.blacklist_matcher = AhoCorasick::new(refs).ok();
459 }
460
461 #[inline]
462 fn is_blacklisted(&self, url: &str) -> bool {
463 self.blacklist_matcher
464 .as_ref()
465 .map(|m| m.is_match(url))
466 .unwrap_or(false)
467 }
468
469 pub fn add_whitelist_pattern<S: Into<String>>(&mut self, pattern: S) {
471 self.whitelist_patterns.push(pattern.into());
472 self.rebuild_whitelist_matcher();
473 }
474
475 pub fn add_whitelist_patterns<I, S>(&mut self, patterns: I)
477 where
478 I: IntoIterator<Item = S>,
479 S: Into<String>,
480 {
481 self.whitelist_patterns
482 .extend(patterns.into_iter().map(Into::into));
483 self.rebuild_whitelist_matcher();
484 }
485
486 #[inline]
487 fn rebuild_whitelist_matcher(&mut self) {
488 if self.whitelist_patterns.is_empty() {
489 self.whitelist_matcher = None;
490 return;
491 }
492
493 let refs: Vec<&str> = self.whitelist_patterns.iter().map(|s| s.as_str()).collect();
494
495 self.whitelist_matcher = AhoCorasick::new(refs).ok();
497 }
498
499 #[inline]
500 fn is_whitelisted(&self, url: &str) -> bool {
501 self.whitelist_matcher
502 .as_ref()
503 .map(|m| m.is_match(url))
504 .unwrap_or(false)
505 }
506
507 pub fn init_commands(&self) -> CommandChain {
509 let cmds = if self.ignore_httpserrors {
510 INIT_CHAIN_IGNORE_HTTP_ERRORS.clone()
511 } else {
512 INIT_CHAIN.clone()
513 };
514 CommandChain::new(cmds, self.request_timeout)
515 }
516
517 pub(crate) fn push_cdp_request<T: Command>(&mut self, cmd: T) {
519 let method = cmd.identifier();
520 if let Ok(params) = serde_json::to_value(cmd) {
521 self.queued_events
522 .push_back(NetworkEvent::SendCdpRequest((method, params)));
523 }
524 }
525
526 pub fn poll(&mut self) -> Option<NetworkEvent> {
528 self.queued_events.pop_front()
529 }
530
531 pub fn extra_headers(&self) -> &std::collections::HashMap<String, String> {
533 &self.extra_headers
534 }
535
536 pub fn set_extra_headers(&mut self, headers: std::collections::HashMap<String, String>) {
538 self.extra_headers = headers;
539 self.extra_headers.remove(PROXY_AUTHORIZATION.as_str());
540 self.extra_headers.remove("Proxy-Authorization");
541 if !self.extra_headers.is_empty() {
542 if let Ok(headers) = serde_json::to_value(&self.extra_headers) {
543 self.push_cdp_request(SetExtraHttpHeadersParams::new(Headers::new(headers)));
544 }
545 }
546 }
547
548 pub fn set_service_worker_enabled(&mut self, bypass: bool) {
549 self.push_cdp_request(SetBypassServiceWorkerParams::new(bypass));
550 }
551
552 pub fn set_block_all(&mut self, block_all: bool) {
553 self.block_all = block_all;
554 }
555
556 pub fn set_request_interception(&mut self, enabled: bool) {
557 self.user_request_interception_enabled = enabled;
558 self.update_protocol_request_interception();
559 }
560
561 pub fn set_cache_enabled(&mut self, enabled: bool) {
562 let run = self.user_cache_disabled == enabled;
563 self.user_cache_disabled = !enabled;
564 if run {
565 self.update_protocol_cache_disabled();
566 }
567 }
568
569 pub fn enable_request_intercept(&mut self) {
571 self.protocol_request_interception_enabled = true;
572 }
573
574 pub fn disable_request_intercept(&mut self) {
576 self.protocol_request_interception_enabled = false;
577 }
578
579 #[cfg(feature = "_cache")]
581 pub fn set_cache_site_key(&mut self, cache_site_key: Option<String>) {
582 self.cache_site_key = cache_site_key;
583 }
584
585 #[cfg(feature = "_cache")]
587 pub fn set_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) {
588 self.cache_policy = cache_policy;
589 }
590
591 pub fn update_protocol_cache_disabled(&mut self) {
592 self.push_cdp_request(SetCacheDisabledParams::new(self.user_cache_disabled));
593 }
594
595 pub fn authenticate(&mut self, credentials: Credentials) {
596 self.credentials = Some(credentials);
597 self.update_protocol_request_interception();
598 self.protocol_request_interception_enabled = true;
599 }
600
601 fn update_protocol_request_interception(&mut self) {
602 let enabled = self.user_request_interception_enabled || self.credentials.is_some();
603
604 if enabled == self.protocol_request_interception_enabled {
605 return;
606 }
607
608 if enabled {
609 self.push_cdp_request(ENABLE_FETCH.clone())
610 } else {
611 self.push_cdp_request(DisableParams::default())
612 }
613 }
614
615 #[inline]
618 fn should_block_script_blocklist_only(&self, url: &str) -> bool {
619 let block_analytics = self.block_analytics;
621
622 if block_analytics && spider_network_blocker::scripts::URL_IGNORE_TRIE.contains_prefix(url)
624 {
625 return true;
626 }
627
628 if crate::handler::blockers::block_websites::block_website(url) {
630 return true;
631 }
632
633 if let Some(path_with_slash) = Self::url_path_with_leading_slash(url) {
640 let p_slash = Self::strip_query_fragment(path_with_slash);
642 let p_noslash = p_slash.strip_prefix('/').unwrap_or(p_slash);
643
644 let base = match p_slash.rsplit('/').next() {
646 Some(b) => b,
647 None => p_slash,
648 };
649
650 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_slash) {
653 return true;
654 }
655 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(p_noslash) {
656 return true;
657 }
658 if block_analytics && URL_IGNORE_TRIE_PATHS.contains_prefix(base) {
659 return true;
660 }
661
662 if URL_IGNORE_SCRIPT_BASE_PATHS.contains_prefix(p_noslash) {
665 return true;
666 }
667
668 if self.ignore_visuals && URL_IGNORE_SCRIPT_STYLES_PATHS.contains_prefix(p_noslash) {
670 return true;
671 }
672 }
673
674 false
675 }
676
677 #[inline]
682 fn url_path_with_leading_slash(url: &str) -> Option<&str> {
683 let idx = url.find("//")?;
685 let after_slashes = idx + 2;
686
687 let slash_rel = url[after_slashes..].find('/')?;
689 let slash_idx = after_slashes + slash_rel;
690
691 if slash_idx < url.len() {
692 Some(&url[slash_idx..])
693 } else {
694 None
695 }
696 }
697
698 #[inline]
703 fn strip_query_fragment(s: &str) -> &str {
704 let q = s.find('?');
705 let h = s.find('#');
706
707 match (q, h) {
708 (None, None) => s,
709 (Some(i), None) => &s[..i],
710 (None, Some(i)) => &s[..i],
711 (Some(i), Some(j)) => &s[..i.min(j)],
712 }
713 }
714
715 #[inline]
717 fn skip_xhr(
718 &self,
719 skip_networking: bool,
720 event: &EventRequestPaused,
721 network_event: bool,
722 ) -> bool {
723 if !skip_networking && network_event {
725 let request_url = event.request.url.as_str();
726
727 let skip_analytics =
729 self.block_analytics && (ignore_script_xhr(request_url) || block_xhr(request_url));
730
731 if skip_analytics {
732 true
733 } else if self.block_stylesheets || self.ignore_visuals {
734 let block_css = self.block_stylesheets;
735 let block_media = self.ignore_visuals;
736
737 let mut block_request = false;
738
739 if let Some(position) = request_url.rfind('.') {
740 let hlen = request_url.len();
741 let has_asset = hlen - position;
742
743 if has_asset >= 3 {
744 let next_position = position + 1;
745
746 if block_media
747 && IGNORE_XHR_ASSETS.contains::<CaseInsensitiveString>(
748 &request_url[next_position..].into(),
749 )
750 {
751 block_request = true;
752 } else if block_css {
753 block_request = CaseInsensitiveString::from(
754 &request_url.as_bytes()[next_position..],
755 )
756 .contains(&**CSS_EXTENSION)
757 }
758 }
759 }
760
761 if !block_request {
762 block_request = ignore_script_xhr_media(request_url);
763 }
764
765 block_request
766 } else {
767 skip_networking
768 }
769 } else {
770 skip_networking
771 }
772 }
773
774 #[cfg(feature = "adblock")]
775 #[inline]
776 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
778 if skip_networking {
779 true
780 } else {
781 block_ads(&event.request.url) || self.detect_ad(event)
782 }
783 }
784
785 #[cfg(not(feature = "adblock"))]
787 #[inline]
788 fn detect_ad_if_enabled(&mut self, event: &EventRequestPaused, skip_networking: bool) -> bool {
789 use crate::handler::blockers::block_websites::block_ads;
790 if skip_networking {
791 true
792 } else {
793 block_ads(&event.request.url)
794 }
795 }
796
797 #[inline]
798 fn fail_request_blocked(
800 &mut self,
801 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
802 ) {
803 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FailRequestParams::new(
804 request_id.clone(),
805 chromiumoxide_cdp::cdp::browser_protocol::network::ErrorReason::BlockedByClient,
806 );
807 self.push_cdp_request(params);
808 }
809
810 #[inline]
811 fn fulfill_request_empty_200(
813 &mut self,
814 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
815 ) {
816 let params = chromiumoxide_cdp::cdp::browser_protocol::fetch::FulfillRequestParams::new(
817 request_id.clone(),
818 200,
819 );
820 self.push_cdp_request(params);
821 }
822
823 #[cfg(feature = "_cache")]
824 #[inline]
825 fn fulfill_request_from_cache(
829 &mut self,
830 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
831 body: &[u8],
832 headers: &std::collections::HashMap<String, String>,
833 status: i64,
834 ) {
835 use crate::cdp::browser_protocol::fetch::HeaderEntry;
836 use crate::handler::network::fetch::FulfillRequestParams;
837 use base64::Engine;
838
839 let mut resp_headers = Vec::<HeaderEntry>::with_capacity(headers.len());
840
841 for (k, v) in headers.iter() {
842 resp_headers.push(HeaderEntry {
843 name: k.clone().into(),
844 value: v.clone().into(),
845 });
846 }
847
848 let mut params = FulfillRequestParams::new(request_id.clone(), status);
849
850 params.body = Some(
852 base64::engine::general_purpose::STANDARD
853 .encode(body)
854 .into(),
855 );
856
857 params.response_headers = Some(resp_headers);
858
859 self.push_cdp_request(params);
860 }
861
862 #[inline]
863 fn continue_request_with_url(
865 &mut self,
866 request_id: &chromiumoxide_cdp::cdp::browser_protocol::fetch::RequestId,
867 url: Option<&str>,
868 intercept_response: bool,
869 ) {
870 let mut params = ContinueRequestParams::new(request_id.clone());
871 if let Some(url) = url {
872 params.url = Some(url.to_string());
873 params.intercept_response = Some(intercept_response);
874 }
875 self.push_cdp_request(params);
876 }
877
878 #[inline]
880 pub fn on_fetch_request_paused(&mut self, event: &EventRequestPaused) {
881 if self.user_request_interception_enabled && self.protocol_request_interception_enabled {
882 return;
883 }
884
885 if self.block_all {
886 tracing::debug!(
887 "Blocked (block_all): {:?} - {}",
888 event.resource_type,
889 event.request.url
890 );
891 return self.fail_request_blocked(&event.request_id);
892 }
893
894 if let Some(network_id) = event.network_id.as_ref() {
895 if let Some(request_will_be_sent) =
896 self.requests_will_be_sent.remove(network_id.as_ref())
897 {
898 self.on_request(&request_will_be_sent, Some(event.request_id.clone().into()));
899 } else {
900 self.request_id_to_interception_id
901 .insert(network_id.clone(), event.request_id.clone().into());
902 }
903 }
904
905 let javascript_resource = event.resource_type == ResourceType::Script;
907 let document_resource = event.resource_type == ResourceType::Document;
908 let network_resource =
909 !document_resource && crate::utils::is_data_resource(&event.resource_type);
910
911 let mut skip_networking =
913 self.block_all || IGNORE_NETWORKING_RESOURCE_MAP.contains(event.resource_type.as_ref());
914
915 if event.resource_type == ResourceType::Prefetch && !self.block_prefetch {
916 skip_networking = true;
917 }
918
919 if !skip_networking {
921 skip_networking = self.document_reload_tracker >= 3;
922 }
923
924 let (current_url_cow, had_replacer) =
926 self.handle_document_replacement_and_tracking(event, document_resource);
927
928 let current_url: &str = current_url_cow.as_ref();
929
930 let blacklisted = self.is_blacklisted(current_url);
931
932 if !self.blacklist_strict && blacklisted {
933 skip_networking = true;
934 }
935
936 if !skip_networking {
937 if self.xml_document && current_url.ends_with(".xsl") {
939 skip_networking = false;
940 } else {
941 skip_networking = self.should_skip_for_visuals_and_basic(&event.resource_type);
942 }
943 }
944
945 skip_networking = self.detect_ad_if_enabled(event, skip_networking);
946
947 if !skip_networking
949 && self.block_javascript
950 && (self.only_html || self.ignore_visuals)
951 && (javascript_resource || document_resource)
952 {
953 skip_networking = ignore_script_embedded(current_url);
954 }
955
956 if !skip_networking && javascript_resource {
959 skip_networking = self.should_block_script_blocklist_only(current_url);
960 }
961
962 skip_networking = self.skip_xhr(skip_networking, event, network_resource);
964
965 if !skip_networking && (javascript_resource || network_resource || document_resource) {
967 skip_networking = self.intercept_manager.intercept_detection(
968 current_url,
969 self.ignore_visuals,
970 network_resource,
971 );
972 }
973
974 if !skip_networking && (javascript_resource || network_resource) {
976 skip_networking = crate::handler::blockers::block_websites::block_website(current_url);
977 }
978
979 if skip_networking && javascript_resource && ALLOWED_MATCHER_3RD_PARTY.is_match(current_url)
982 {
983 skip_networking = false;
984 }
985
986 if skip_networking && self.is_whitelisted(current_url) {
988 skip_networking = false;
989 }
990
991 if self.blacklist_strict && blacklisted {
992 skip_networking = true;
993 }
994
995 if skip_networking {
996 tracing::debug!("Blocked: {:?} - {}", event.resource_type, current_url);
997 self.fulfill_request_empty_200(&event.request_id);
998 } else {
999 #[cfg(feature = "_cache")]
1000 {
1001 if let (Some(policy), Some(cache_site_key)) =
1002 (self.cache_policy.as_ref(), self.cache_site_key.as_deref())
1003 {
1004 let current_url = format!("{}:{}", event.request.method, ¤t_url);
1005
1006 if let Some((res, cache_policy)) =
1007 crate::cache::remote::get_session_cache_item(cache_site_key, ¤t_url)
1008 {
1009 if policy.allows_cached(&cache_policy) {
1010 tracing::debug!(
1011 "Remote Cached: {:?} - {}",
1012 &event.resource_type,
1013 ¤t_url
1014 );
1015 let flat_headers = crate::http::headers_from_multi(&res.headers);
1016 return self.fulfill_request_from_cache(
1017 &event.request_id,
1018 &res.body,
1019 &flat_headers,
1020 res.status as i64,
1021 );
1022 }
1023 }
1024 }
1025 }
1026
1027 tracing::debug!("Allowed: {:?} - {}", event.resource_type, current_url);
1029 self.continue_request_with_url(
1030 &event.request_id,
1031 if had_replacer {
1032 Some(current_url)
1033 } else {
1034 None
1035 },
1036 !had_replacer,
1037 );
1038 }
1039 }
1040
1041 #[inline]
1047 fn should_skip_for_visuals_and_basic(&self, resource_type: &ResourceType) -> bool {
1048 (self.ignore_visuals && IGNORE_VISUAL_RESOURCE_MAP.contains(resource_type.as_ref()))
1049 || (self.block_stylesheets && *resource_type == ResourceType::Stylesheet)
1050 }
1051
1052 pub fn has_target_domain(&self) -> bool {
1054 !self.document_target_url.is_empty()
1055 }
1056
1057 pub fn set_page_url(&mut self, page_target_url: String) {
1059 let host_base = host_and_rest(&page_target_url)
1060 .map(|(h, _)| base_domain_from_host(h))
1061 .unwrap_or("");
1062
1063 self.document_target_domain = host_base.to_string();
1064 self.document_target_url = page_target_url;
1065 }
1066
1067 pub fn clear_target_domain(&mut self) {
1069 self.document_reload_tracker = 0;
1070 self.document_target_url = Default::default();
1071 self.document_target_domain = Default::default();
1072 }
1073
1074 #[inline]
1082 fn handle_document_replacement_and_tracking<'a>(
1083 &mut self,
1084 event: &'a EventRequestPaused,
1085 document_resource: bool,
1086 ) -> (Cow<'a, str>, bool) {
1087 let mut replacer: Option<String> = None;
1088 let current_url = event.request.url.as_str();
1089
1090 if document_resource {
1091 if self.document_target_url == current_url {
1092 self.document_reload_tracker += 1;
1093 } else if !self.document_target_url.is_empty() && event.redirected_request_id.is_some()
1094 {
1095 let (http_document_replacement, mut https_document_replacement) =
1096 if self.document_target_url.starts_with("http://") {
1097 (
1098 self.document_target_url.replacen("http://", "http//", 1),
1099 self.document_target_url.replacen("http://", "https://", 1),
1100 )
1101 } else {
1102 (
1103 self.document_target_url.replacen("https://", "https//", 1),
1104 self.document_target_url.replacen("https://", "http://", 1),
1105 )
1106 };
1107
1108 let trailing = https_document_replacement.ends_with('/');
1110 if trailing {
1111 https_document_replacement.pop();
1112 }
1113 if https_document_replacement.ends_with('/') {
1114 https_document_replacement.pop();
1115 }
1116
1117 let redirect_mask = format!(
1118 "{}{}",
1119 https_document_replacement, http_document_replacement
1120 );
1121
1122 if current_url == redirect_mask {
1123 replacer = Some(if trailing {
1124 format!("{}/", https_document_replacement)
1125 } else {
1126 https_document_replacement
1127 });
1128 }
1129 }
1130
1131 if self.document_target_url.is_empty() && current_url.ends_with(".xml") {
1132 self.xml_document = true;
1133 }
1134
1135 self.document_target_url = event.request.url.clone();
1137 self.document_target_domain = host_and_rest(&self.document_target_url)
1138 .map(|(h, _)| base_domain_from_host(h).to_string())
1139 .unwrap_or_default();
1140 }
1141
1142 let current_url_cow = match replacer {
1143 Some(r) => Cow::Owned(r),
1144 None => Cow::Borrowed(event.request.url.as_str()),
1145 };
1146
1147 let had_replacer = matches!(current_url_cow, Cow::Owned(_));
1148 (current_url_cow, had_replacer)
1149 }
1150
1151 #[cfg(feature = "adblock")]
1153 pub fn detect_ad(&self, event: &EventRequestPaused) -> bool {
1154 use adblock::{
1155 lists::{FilterSet, ParseOptions, RuleTypes},
1156 Engine,
1157 };
1158
1159 lazy_static::lazy_static! {
1160 static ref AD_ENGINE: Engine = {
1161 let mut filter_set = FilterSet::new(false);
1162 let mut rules = ParseOptions::default();
1163 rules.rule_types = RuleTypes::All;
1164
1165 filter_set.add_filters(
1166 &*spider_network_blocker::adblock::ADBLOCK_PATTERNS,
1167 rules,
1168 );
1169
1170 Engine::from_filter_set(filter_set, true)
1171 };
1172 };
1173
1174 let blockable = ResourceType::Image == event.resource_type
1175 || event.resource_type == ResourceType::Media
1176 || event.resource_type == ResourceType::Stylesheet
1177 || event.resource_type == ResourceType::Document
1178 || event.resource_type == ResourceType::Fetch
1179 || event.resource_type == ResourceType::Xhr;
1180
1181 let u = &event.request.url;
1182
1183 let block_request = blockable
1184 && {
1186 let request = adblock::request::Request::preparsed(
1187 &u,
1188 "example.com",
1189 "example.com",
1190 &event.resource_type.as_ref().to_lowercase(),
1191 !event.request.is_same_site.unwrap_or_default());
1192
1193 AD_ENGINE.check_network_request(&request).matched
1194 };
1195
1196 block_request
1197 }
1198
1199 pub fn on_fetch_auth_required(&mut self, event: &EventAuthRequired) {
1200 let response = if self
1201 .attempted_authentications
1202 .contains(event.request_id.as_ref())
1203 {
1204 AuthChallengeResponseResponse::CancelAuth
1205 } else if self.credentials.is_some() {
1206 self.attempted_authentications
1207 .insert(event.request_id.clone().into());
1208 AuthChallengeResponseResponse::ProvideCredentials
1209 } else {
1210 AuthChallengeResponseResponse::Default
1211 };
1212
1213 let mut auth = AuthChallengeResponse::new(response);
1214 if let Some(creds) = self.credentials.clone() {
1215 auth.username = Some(creds.username);
1216 auth.password = Some(creds.password);
1217 }
1218 self.push_cdp_request(ContinueWithAuthParams::new(event.request_id.clone(), auth));
1219 }
1220
1221 pub fn set_offline_mode(&mut self, value: bool) {
1223 if self.offline == value {
1224 return;
1225 }
1226 self.offline = value;
1227 if let Ok(network) = EmulateNetworkConditionsByRuleParams::builder()
1228 .offline(self.offline)
1229 .matched_network_condition(
1230 NetworkConditions::builder()
1231 .url_pattern("")
1232 .latency(0)
1233 .download_throughput(-1.)
1234 .upload_throughput(-1.)
1235 .build()
1236 .unwrap(),
1237 )
1238 .build()
1239 {
1240 self.push_cdp_request(network);
1241 }
1242 }
1243
1244 pub fn on_request_will_be_sent(&mut self, event: &EventRequestWillBeSent) {
1246 if self.protocol_request_interception_enabled && !event.request.url.starts_with("data:") {
1247 if let Some(interception_id) = self
1248 .request_id_to_interception_id
1249 .remove(event.request_id.as_ref())
1250 {
1251 self.on_request(event, Some(interception_id));
1252 } else {
1253 self.requests_will_be_sent
1255 .insert(event.request_id.clone(), event.clone());
1256 }
1257 } else {
1258 self.on_request(event, None);
1259 }
1260 }
1261
1262 pub fn on_request_served_from_cache(&mut self, event: &EventRequestServedFromCache) {
1264 if let Some(request) = self.requests.get_mut(event.request_id.as_ref()) {
1265 request.from_memory_cache = true;
1266 }
1267 }
1268
1269 pub fn on_response_received(&mut self, event: &EventResponseReceived) {
1271 let mut request_failed = false;
1272
1273 let mut deducted: u64 = 0;
1275
1276 if let Some(max_bytes) = self.max_bytes_allowed.as_mut() {
1277 let before = *max_bytes;
1278
1279 let received_bytes: u64 = event.response.encoded_data_length as u64;
1281
1282 let content_length: Option<u64> = event
1284 .response
1285 .headers
1286 .inner()
1287 .get("content-length")
1288 .and_then(|v| v.as_str())
1289 .and_then(|s| s.trim().parse::<u64>().ok());
1290
1291 *max_bytes = max_bytes.saturating_sub(received_bytes);
1293
1294 if let Some(cl) = content_length {
1296 if cl > *max_bytes {
1297 *max_bytes = 0;
1298 }
1299 }
1300
1301 request_failed = *max_bytes == 0;
1302
1303 deducted = before.saturating_sub(*max_bytes);
1305 }
1306
1307 if deducted > 0 {
1309 self.queued_events
1310 .push_back(NetworkEvent::BytesConsumed(deducted));
1311 }
1312
1313 if request_failed && self.max_bytes_allowed.is_some() {
1315 self.set_block_all(true);
1316 }
1317
1318 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1319 request.set_response(event.response.clone());
1320 self.queued_events.push_back(if request_failed {
1321 NetworkEvent::RequestFailed(request)
1322 } else {
1323 NetworkEvent::RequestFinished(request)
1324 });
1325 }
1326 }
1327
1328 pub fn on_network_loading_finished(&mut self, event: &EventLoadingFinished) {
1330 if let Some(request) = self.requests.remove(event.request_id.as_ref()) {
1331 if let Some(interception_id) = request.interception_id.as_ref() {
1332 self.attempted_authentications
1333 .remove(interception_id.as_ref());
1334 }
1335 self.queued_events
1336 .push_back(NetworkEvent::RequestFinished(request));
1337 }
1338 }
1339
1340 pub fn on_network_loading_failed(&mut self, event: &EventLoadingFailed) {
1342 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1343 request.failure_text = Some(event.error_text.clone());
1344 if let Some(interception_id) = request.interception_id.as_ref() {
1345 self.attempted_authentications
1346 .remove(interception_id.as_ref());
1347 }
1348 self.queued_events
1349 .push_back(NetworkEvent::RequestFailed(request));
1350 }
1351 }
1352
1353 fn on_request(
1355 &mut self,
1356 event: &EventRequestWillBeSent,
1357 interception_id: Option<InterceptionId>,
1358 ) {
1359 let mut redirect_chain = Vec::new();
1360 let mut redirect_location = None;
1361
1362 if let Some(redirect_resp) = &event.redirect_response {
1363 if let Some(mut request) = self.requests.remove(event.request_id.as_ref()) {
1364 if is_redirect_status(redirect_resp.status) {
1365 if let Some(location) = redirect_resp.headers.inner()["Location"].as_str() {
1366 if redirect_resp.url != location {
1367 let fixed_location = location.replace(&redirect_resp.url, "");
1368
1369 if !fixed_location.is_empty() {
1370 if let Some(resp) = request.response.as_mut() {
1371 resp.headers.0["Location"] =
1372 serde_json::Value::String(fixed_location.clone());
1373 }
1374 }
1375
1376 redirect_location = Some(fixed_location);
1377 }
1378 }
1379 }
1380
1381 self.handle_request_redirect(
1382 &mut request,
1383 if let Some(redirect_location) = redirect_location {
1384 let mut redirect_resp = redirect_resp.clone();
1385
1386 if !redirect_location.is_empty() {
1387 redirect_resp.headers.0["Location"] =
1388 serde_json::Value::String(redirect_location);
1389 }
1390
1391 redirect_resp
1392 } else {
1393 redirect_resp.clone()
1394 },
1395 );
1396
1397 redirect_chain = std::mem::take(&mut request.redirect_chain);
1398 redirect_chain.push(request);
1399 }
1400 }
1401
1402 let request = HttpRequest::new(
1403 event.request_id.clone(),
1404 event.frame_id.clone(),
1405 interception_id,
1406 self.user_request_interception_enabled,
1407 redirect_chain,
1408 );
1409
1410 self.requests.insert(event.request_id.clone(), request);
1411 self.queued_events
1412 .push_back(NetworkEvent::Request(event.request_id.clone()));
1413 }
1414
1415 fn handle_request_redirect(&mut self, request: &mut HttpRequest, response: Response) {
1417 request.set_response(response);
1418 if let Some(interception_id) = request.interception_id.as_ref() {
1419 self.attempted_authentications
1420 .remove(interception_id.as_ref());
1421 }
1422 }
1423}
1424
/// Events the network manager pushes onto its `queued_events` queue.
#[derive(Debug)]
pub enum NetworkEvent {
    /// A CDP command to send, as a method identifier plus its JSON value.
    /// NOTE(review): presumably produced by `push_cdp_request` — confirm.
    SendCdpRequest((MethodId, serde_json::Value)),
    /// A request with this id started being tracked (queued by `on_request`).
    Request(RequestId),
    /// A response for the request with this id.
    /// NOTE(review): not constructed in the visible part of this file — confirm
    /// where it is emitted.
    Response(RequestId),
    /// The request failed: loading failed, or the byte budget was exhausted when its
    /// response arrived.
    RequestFailed(HttpRequest),
    /// The request finished: its response was received within budget, or loading
    /// finished.
    RequestFinished(HttpRequest),
    /// Number of bytes deducted from the byte budget while handling a response.
    BytesConsumed(u64),
}
1440
#[cfg(test)]
mod tests {
    use super::ALLOWED_MATCHER_3RD_PARTY;
    use crate::handler::network::NetworkManager;
    use std::time::Duration;

    /// Page URL used by the script blocklist tests.
    const FORUM_PAGE_URL: &str =
        "https://forum.cursor.com/t/is-2000-fast-requests-the-maximum/51085";

    /// Page URL used by the blacklist / whitelist tests.
    const EXAMPLE_PAGE_URL: &str = "https://example.com/";

    /// Beacon script URL matched by the `beacon.min.js` patterns.
    const BEACON_URL: &str = "https://static.cloudflareinsights.com/beacon.min.js";

    /// Builds a manager with `NetworkManager::new(false, 30s)` and the given page URL.
    fn manager_for(page_url: &str) -> NetworkManager {
        let mut nm = NetworkManager::new(false, Duration::from_secs(30));
        nm.set_page_url(page_url.to_string());
        nm
    }

    /// Shared expectations about the third-party allow-list matcher.
    fn assert_third_party_allow_list() {
        let cf_challenge = "https://www.something.com.ba/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=9abf7b523d90987e";
        assert!(
            ALLOWED_MATCHER_3RD_PARTY.is_match(cf_challenge),
            "expected Cloudflare challenge script to be allowed"
        );

        let cf_insights = "https://static.cloudflareinsights.com/beacon.min.js/vcd15cbe7772f49c399c6a5babf22c1241717689176015";
        assert!(
            !ALLOWED_MATCHER_3RD_PARTY.is_match(cf_insights),
            "expected Cloudflare Insights beacon to remain blocked (not in allow-list)"
        );

        for allowed in [
            "https://js.stripe.com/v3/",
            "https://www.google.com/recaptcha/api.js?render=explicit",
            "https://code.jquery.com/jquery-3.7.1.min.js",
        ] {
            assert!(ALLOWED_MATCHER_3RD_PARTY.is_match(allowed));
        }
    }

    #[test]
    fn test_allowed_matcher_3rd_party() {
        assert_third_party_allow_list();
    }

    #[test]
    fn test_script_allowed_by_default_when_not_blocklisted() {
        let ok = "https://cdn.example.net/assets/some-app-bundle-12345.js";
        let blocked = manager_for(FORUM_PAGE_URL).should_block_script_blocklist_only(ok);

        assert!(!blocked, "expected non-blocklisted script to be allowed");
    }

    #[test]
    fn test_script_blocked_when_matches_ignore_trie_or_blocklist() {
        let bad = "https://cdn.example.net/js/analytics.js";
        let blocked = manager_for(FORUM_PAGE_URL).should_block_script_blocklist_only(bad);

        assert!(blocked, "expected analytics.js to be blocklisted");
    }

    // Runs the same expectations as `test_allowed_matcher_3rd_party`; both test names
    // are kept so existing test filters still select them.
    #[test]
    fn test_allowed_matcher_3rd_party_sanity() {
        assert_third_party_allow_list();
    }

    #[test]
    fn test_dynamic_blacklist_blocks_url() {
        let mut nm = manager_for(EXAMPLE_PAGE_URL);

        nm.set_blacklist_patterns(["static.cloudflareinsights.com", "googletagmanager.com"]);
        assert!(nm.is_blacklisted("https://static.cloudflareinsights.com/beacon.min.js"));
        assert!(nm.is_blacklisted("https://www.googletagmanager.com/gtm.js?id=GTM-XXXX"));

        assert!(!nm.is_blacklisted("https://cdn.example.net/assets/app.js"));
    }

    #[test]
    fn test_blacklist_strict_wins_over_whitelist() {
        let mut nm = manager_for(EXAMPLE_PAGE_URL);

        // The same pattern is registered on both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);
        nm.set_blacklist_strict(true);

        assert!(nm.is_whitelisted(BEACON_URL));
        assert!(nm.is_blacklisted(BEACON_URL));

        assert!(nm.blacklist_strict);
    }

    #[test]
    fn test_blacklist_non_strict_allows_whitelist_override() {
        let mut nm = manager_for(EXAMPLE_PAGE_URL);

        // The same pattern is registered on both lists.
        nm.set_blacklist_patterns(["beacon.min.js"]);
        nm.set_whitelist_patterns(["beacon.min.js"]);
        nm.set_blacklist_strict(false);

        assert!(nm.is_blacklisted(BEACON_URL));
        assert!(nm.is_whitelisted(BEACON_URL));
        assert!(!nm.blacklist_strict);
    }
}